Repository: apache/seatunnel Branch: dev Commit: 219c2acec365 Files: 6942 Total size: 35.5 MB Directory structure: gitextract_vzl20rdb/ ├── .asf.yaml ├── .dlc.json ├── .gitattributes ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug-report.yml │ │ ├── feature-request.yml │ │ └── umbrella.yml │ ├── PULL_REQUEST_TEMPLATE.md │ └── workflows/ │ ├── add-label.yml │ ├── approve-label-trigger.yml │ ├── approve-label.yml │ ├── backend.yml │ ├── build_main.yml │ ├── codeql.yaml │ ├── labeler/ │ │ └── label-scope-conf.yml │ ├── notify_test_workflow.yml │ ├── publish-docker.yaml │ ├── publish-helm-chart.yaml │ ├── schedule_backend.yml │ ├── stale.yml │ └── update_build_status.yml ├── .gitignore ├── .gitmodules ├── .licenserc.yaml ├── .mvn/ │ └── wrapper/ │ └── maven-wrapper.properties ├── AGENTS.md ├── LICENSE ├── NOTICE ├── README.md ├── bin/ │ ├── install-plugin.cmd │ └── install-plugin.sh ├── config/ │ ├── hazelcast-client.yaml │ ├── hazelcast-master.yaml │ ├── hazelcast-worker.yaml │ ├── hazelcast.yaml │ ├── jvm_client_options │ ├── jvm_master_options │ ├── jvm_options │ ├── jvm_worker_options │ ├── log4j2.properties │ ├── log4j2_client.properties │ ├── plugin_config │ ├── seatunnel-env.cmd │ ├── seatunnel-env.sh │ ├── seatunnel.yaml │ ├── v2.batch.config.template │ └── v2.streaming.conf.template ├── deploy/ │ └── kubernetes/ │ └── seatunnel/ │ ├── Chart.yaml │ ├── conf/ │ │ ├── hazelcast-client.yaml │ │ ├── hazelcast-master.yaml │ │ ├── hazelcast-worker.yaml │ │ ├── jvm_client_options │ │ ├── jvm_master_options │ │ ├── jvm_worker_options │ │ ├── log4j2.properties │ │ └── seatunnel.yaml │ ├── templates/ │ │ ├── NOTES.txt │ │ ├── _helpers.tpl │ │ ├── configmap.yaml │ │ ├── deployment-seatunnel-master.yaml │ │ ├── deployment-seatunnel-worker.yaml │ │ ├── ingress.yaml │ │ ├── rbac.yaml │ │ ├── service-headless.yaml │ │ └── service-master-headless.yaml │ └── values.yaml ├── docs/ │ ├── en/ │ │ ├── architecture/ │ │ │ ├── api-design/ │ │ │ │ ├── catalog-table.md │ │ │ │ ├── 
sink-architecture.md │ │ │ │ ├── source-architecture.md │ │ │ │ └── translation-layer.md │ │ │ ├── design-philosophy.md │ │ │ ├── engine/ │ │ │ │ ├── dag-execution.md │ │ │ │ ├── engine-architecture.md │ │ │ │ └── resource-management.md │ │ │ ├── fault-tolerance/ │ │ │ │ ├── checkpoint-mechanism.md │ │ │ │ └── exactly-once.md │ │ │ ├── features/ │ │ │ │ └── multi-table.md │ │ │ └── overview.md │ │ ├── connectors/ │ │ │ ├── changelog/ │ │ │ │ ├── connector-activemq.md │ │ │ │ ├── connector-aerospike.md │ │ │ │ ├── connector-amazondynamodb.md │ │ │ │ ├── connector-amazonsqs.md │ │ │ │ ├── connector-assert.md │ │ │ │ ├── connector-cassandra.md │ │ │ │ ├── connector-cdc-base.md │ │ │ │ ├── connector-cdc-mongodb.md │ │ │ │ ├── connector-cdc-mysql.md │ │ │ │ ├── connector-cdc-opengauss.md │ │ │ │ ├── connector-cdc-oracle.md │ │ │ │ ├── connector-cdc-postgres.md │ │ │ │ ├── connector-cdc-sqlserver.md │ │ │ │ ├── connector-cdc-tidb.md │ │ │ │ ├── connector-cdc.md │ │ │ │ ├── connector-clickhouse.md │ │ │ │ ├── connector-cloudberry.md │ │ │ │ ├── connector-common.md │ │ │ │ ├── connector-console.md │ │ │ │ ├── connector-databend.md │ │ │ │ ├── connector-datahub.md │ │ │ │ ├── connector-dingtalk.md │ │ │ │ ├── connector-doris.md │ │ │ │ ├── connector-druid.md │ │ │ │ ├── connector-easysearch.md │ │ │ │ ├── connector-elasticsearch.md │ │ │ │ ├── connector-email.md │ │ │ │ ├── connector-fake.md │ │ │ │ ├── connector-file-base-hadoop.md │ │ │ │ ├── connector-file-base.md │ │ │ │ ├── connector-file-cos.md │ │ │ │ ├── connector-file-ftp.md │ │ │ │ ├── connector-file-hadoop.md │ │ │ │ ├── connector-file-jindo-oss.md │ │ │ │ ├── connector-file-local.md │ │ │ │ ├── connector-file-obs.md │ │ │ │ ├── connector-file-oss-jindo.md │ │ │ │ ├── connector-file-oss.md │ │ │ │ ├── connector-file-s3.md │ │ │ │ ├── connector-file-sftp.md │ │ │ │ ├── connector-file.md │ │ │ │ ├── connector-fluss.md │ │ │ │ ├── connector-google-firestore.md │ │ │ │ ├── connector-google-sheets.md │ │ │ │ ├── 
connector-graphql.md │ │ │ │ ├── connector-hbase.md │ │ │ │ ├── connector-hive.md │ │ │ │ ├── connector-http-airtable.md │ │ │ │ ├── connector-http-base.md │ │ │ │ ├── connector-http-feishu.md │ │ │ │ ├── connector-http-github.md │ │ │ │ ├── connector-http-gitlab.md │ │ │ │ ├── connector-http-jira.md │ │ │ │ ├── connector-http-klaviyo.md │ │ │ │ ├── connector-http-lemlist.md │ │ │ │ ├── connector-http-myhours.md │ │ │ │ ├── connector-http-notion.md │ │ │ │ ├── connector-http-onesignal.md │ │ │ │ ├── connector-http-persistiq.md │ │ │ │ ├── connector-http-wechat.md │ │ │ │ ├── connector-http.md │ │ │ │ ├── connector-hudi.md │ │ │ │ ├── connector-hugegraph.md │ │ │ │ ├── connector-iceberg.md │ │ │ │ ├── connector-influxdb.md │ │ │ │ ├── connector-iotdb.md │ │ │ │ ├── connector-jdbc.md │ │ │ │ ├── connector-kafka.md │ │ │ │ ├── connector-kudu.md │ │ │ │ ├── connector-lance.md │ │ │ │ ├── connector-maxcompute.md │ │ │ │ ├── connector-milvus.md │ │ │ │ ├── connector-mongodb.md │ │ │ │ ├── connector-neo4j.md │ │ │ │ ├── connector-openmldb.md │ │ │ │ ├── connector-paimon.md │ │ │ │ ├── connector-prometheus.md │ │ │ │ ├── connector-pulsar.md │ │ │ │ ├── connector-qdrant.md │ │ │ │ ├── connector-rabbitmq.md │ │ │ │ ├── connector-redis.md │ │ │ │ ├── connector-rocketmq.md │ │ │ │ ├── connector-s3-redshift.md │ │ │ │ ├── connector-selectdb-cloud.md │ │ │ │ ├── connector-sensorsdata.md │ │ │ │ ├── connector-sentry.md │ │ │ │ ├── connector-slack.md │ │ │ │ ├── connector-sls.md │ │ │ │ ├── connector-socket.md │ │ │ │ ├── connector-starrocks.md │ │ │ │ ├── connector-tablestore.md │ │ │ │ ├── connector-tdengine.md │ │ │ │ ├── connector-typesense.md │ │ │ │ └── connector-web3j.md │ │ │ ├── common-options/ │ │ │ │ ├── sink-common-options.md │ │ │ │ └── source-common-options.md │ │ │ ├── connector-isolated-dependency.md │ │ │ ├── formats/ │ │ │ │ ├── avro.md │ │ │ │ ├── canal-json.md │ │ │ │ ├── cdc-compatible-debezium-json.md │ │ │ │ ├── debezium-json.md │ │ │ │ ├── 
kafka-compatible-kafkaconnect-json.md │ │ │ │ ├── maxwell-json.md │ │ │ │ ├── ogg-json.md │ │ │ │ └── protobuf.md │ │ │ ├── sink/ │ │ │ │ ├── Activemq.md │ │ │ │ ├── Aerospike.md │ │ │ │ ├── Airtable.md │ │ │ │ ├── AmazonDynamoDB.md │ │ │ │ ├── AmazonSqs.md │ │ │ │ ├── Assert.md │ │ │ │ ├── Cassandra.md │ │ │ │ ├── Clickhouse.md │ │ │ │ ├── ClickhouseFile.md │ │ │ │ ├── Cloudberry.md │ │ │ │ ├── Console.md │ │ │ │ ├── CosFile.md │ │ │ │ ├── DB2.md │ │ │ │ ├── Databend.md │ │ │ │ ├── Datahub.md │ │ │ │ ├── DingTalk.md │ │ │ │ ├── Doris.md │ │ │ │ ├── Druid.md │ │ │ │ ├── DuckDB.md │ │ │ │ ├── Easysearch.md │ │ │ │ ├── Elasticsearch.md │ │ │ │ ├── Email.md │ │ │ │ ├── Enterprise-WeChat.md │ │ │ │ ├── Feishu.md │ │ │ │ ├── Fluss.md │ │ │ │ ├── FtpFile.md │ │ │ │ ├── GoogleFirestore.md │ │ │ │ ├── GraphQL.md │ │ │ │ ├── Greenplum.md │ │ │ │ ├── Hbase.md │ │ │ │ ├── HdfsFile.md │ │ │ │ ├── Hive.md │ │ │ │ ├── Http.md │ │ │ │ ├── Hudi.md │ │ │ │ ├── HugeGraph.md │ │ │ │ ├── Iceberg.md │ │ │ │ ├── InfluxDB.md │ │ │ │ ├── IoTDB.md │ │ │ │ ├── IoTDBv2.md │ │ │ │ ├── Jdbc.md │ │ │ │ ├── Kafka.md │ │ │ │ ├── Kingbase.md │ │ │ │ ├── Kudu.md │ │ │ │ ├── Lance.md │ │ │ │ ├── LocalFile.md │ │ │ │ ├── Maxcompute.md │ │ │ │ ├── Milvus.md │ │ │ │ ├── MongoDB.md │ │ │ │ ├── Mysql.md │ │ │ │ ├── Neo4j.md │ │ │ │ ├── ObsFile.md │ │ │ │ ├── OceanBase.md │ │ │ │ ├── Oracle.md │ │ │ │ ├── OssFile.md │ │ │ │ ├── OssJindoFile.md │ │ │ │ ├── Paimon.md │ │ │ │ ├── Phoenix.md │ │ │ │ ├── PostgreSql.md │ │ │ │ ├── Prometheus.md │ │ │ │ ├── Pulsar.md │ │ │ │ ├── Qdrant.md │ │ │ │ ├── Rabbitmq.md │ │ │ │ ├── Redis.md │ │ │ │ ├── Redshift.md │ │ │ │ ├── RocketMQ.md │ │ │ │ ├── S3-Redshift.md │ │ │ │ ├── S3File.md │ │ │ │ ├── SelectDB-Cloud.md │ │ │ │ ├── SensorsData.md │ │ │ │ ├── Sentry.md │ │ │ │ ├── SftpFile.md │ │ │ │ ├── Slack.md │ │ │ │ ├── Sls.md │ │ │ │ ├── Snowflake.md │ │ │ │ ├── Socket.md │ │ │ │ ├── SqlServer.md │ │ │ │ ├── StarRocks.md │ │ │ │ ├── TDengine.md │ │ │ │ ├── Tablestore.md 
│ │ │ │ ├── Typesense.md │ │ │ │ └── Vertica.md │ │ │ └── source/ │ │ │ ├── Airtable.md │ │ │ ├── AmazonDynamoDB.md │ │ │ ├── AmazonSqs.md │ │ │ ├── Cassandra.md │ │ │ ├── Clickhouse.md │ │ │ ├── Cloudberry.md │ │ │ ├── CosFile.md │ │ │ ├── DB2.md │ │ │ ├── Databend.md │ │ │ ├── Doris.md │ │ │ ├── DuckDB.md │ │ │ ├── Easysearch.md │ │ │ ├── Elasticsearch.md │ │ │ ├── FakeSource.md │ │ │ ├── FtpFile.md │ │ │ ├── Github.md │ │ │ ├── Gitlab.md │ │ │ ├── GoogleSheets.md │ │ │ ├── GraphQL.md │ │ │ ├── Greenplum.md │ │ │ ├── Hbase.md │ │ │ ├── HdfsFile.md │ │ │ ├── Hive.md │ │ │ ├── HiveJdbc.md │ │ │ ├── Http.md │ │ │ ├── Iceberg.md │ │ │ ├── InfluxDB.md │ │ │ ├── IoTDB.md │ │ │ ├── IoTDBv2.md │ │ │ ├── Jdbc.md │ │ │ ├── Jira.md │ │ │ ├── Kafka.md │ │ │ ├── Kingbase.md │ │ │ ├── Klaviyo.md │ │ │ ├── Kudu.md │ │ │ ├── Lemlist.md │ │ │ ├── LocalFile.md │ │ │ ├── Maxcompute.md │ │ │ ├── Milvus.md │ │ │ ├── MongoDB-CDC.md │ │ │ ├── MongoDB.md │ │ │ ├── MyHours.md │ │ │ ├── MySQL-CDC.md │ │ │ ├── Mysql.md │ │ │ ├── Neo4j.md │ │ │ ├── Notion.md │ │ │ ├── ObsFile.md │ │ │ ├── OceanBase.md │ │ │ ├── OneSignal.md │ │ │ ├── OpenMldb.md │ │ │ ├── Opengauss-CDC.md │ │ │ ├── Oracle-CDC.md │ │ │ ├── Oracle.md │ │ │ ├── OssFile.md │ │ │ ├── OssJindoFile.md │ │ │ ├── Paimon.md │ │ │ ├── Persistiq.md │ │ │ ├── Phoenix.md │ │ │ ├── PostgreSQL-CDC.md │ │ │ ├── PostgreSQL.md │ │ │ ├── Prometheus.md │ │ │ ├── Pulsar.md │ │ │ ├── Qdrant.md │ │ │ ├── Rabbitmq.md │ │ │ ├── Redis.md │ │ │ ├── Redshift.md │ │ │ ├── RocketMQ.md │ │ │ ├── S3File.md │ │ │ ├── SftpFile.md │ │ │ ├── Sls.md │ │ │ ├── Snowflake.md │ │ │ ├── Socket.md │ │ │ ├── SqlServer-CDC.md │ │ │ ├── SqlServer.md │ │ │ ├── StarRocks.md │ │ │ ├── TDengine.md │ │ │ ├── Tablestore.md │ │ │ ├── TiDB-CDC.md │ │ │ ├── Typesense.md │ │ │ ├── Vertica.md │ │ │ └── Web3j.md │ │ ├── developer/ │ │ │ ├── coding-guide.md │ │ │ ├── contribute-plugin.md │ │ │ ├── contribute-transform-v2-guide.md │ │ │ ├── docs-format-specification.md │ │ │ ├── 
how-to-create-your-connector.md │ │ │ ├── new-license.md │ │ │ └── setup.md │ │ ├── engines/ │ │ │ ├── command/ │ │ │ │ ├── connector-check.md │ │ │ │ └── usage.mdx │ │ │ ├── event-listener.md │ │ │ ├── flink.md │ │ │ ├── overview.md │ │ │ ├── spark.md │ │ │ └── zeta/ │ │ │ ├── about.md │ │ │ ├── checkpoint-storage.md │ │ │ ├── deployment.md │ │ │ ├── download-seatunnel.md │ │ │ ├── engine-jar-storage-mode.md │ │ │ ├── hybrid-cluster-deployment.md │ │ │ ├── local-mode-deployment.md │ │ │ ├── logging.md │ │ │ ├── resource-isolation.md │ │ │ ├── rest-api-v1.md │ │ │ ├── rest-api-v2.md │ │ │ ├── security.md │ │ │ ├── separated-cluster-deployment.md │ │ │ ├── slot-allocation-strategy.md │ │ │ ├── tcp.md │ │ │ ├── telemetry.md │ │ │ ├── tuning-guide.md │ │ │ ├── user-command.md │ │ │ └── web-ui.md │ │ ├── faq.md │ │ ├── getting-started/ │ │ │ ├── docker/ │ │ │ │ └── docker.md │ │ │ ├── kubernetes/ │ │ │ │ ├── helm.md │ │ │ │ └── kubernetes.mdx │ │ │ └── locally/ │ │ │ ├── deployment.md │ │ │ ├── quick-start-flink.md │ │ │ ├── quick-start-seatunnel-engine.md │ │ │ └── quick-start-spark.md │ │ ├── introduction/ │ │ │ ├── about.md │ │ │ ├── concepts/ │ │ │ │ ├── config.md │ │ │ │ ├── connector-v2-features.md │ │ │ │ ├── gravitino-type-mapping.md │ │ │ │ ├── incompatible-changes.md │ │ │ │ └── schema-feature.md │ │ │ ├── configuration/ │ │ │ │ ├── JobEnvConfig.md │ │ │ │ ├── config-encryption-decryption.md │ │ │ │ ├── metalake.md │ │ │ │ ├── schema-evolution.md │ │ │ │ ├── sink-options-placeholders.md │ │ │ │ ├── speed-limit.md │ │ │ │ └── sql-config.md │ │ │ └── how-it-works.md │ │ ├── tools/ │ │ │ ├── overview.md │ │ │ ├── seatunnel-mcp.md │ │ │ ├── seatunnel-skill.md │ │ │ └── x2seatunnel.md │ │ └── transforms/ │ │ ├── common-options/ │ │ │ └── common-options.md │ │ ├── copy.md │ │ ├── data-validator.md │ │ ├── define-sink-type.md │ │ ├── dynamic-compile.md │ │ ├── embedding.md │ │ ├── encrypt.md │ │ ├── field-mapper.md │ │ ├── field-rename.md │ │ ├── filter-rowkind.md │ 
│ ├── filter.md │ │ ├── jsonpath.md │ │ ├── llm.md │ │ ├── metadata.md │ │ ├── regexextract.md │ │ ├── replace.md │ │ ├── rowkind-extractor.md │ │ ├── split.md │ │ ├── sql-functions.md │ │ ├── sql-udf.md │ │ ├── sql.md │ │ ├── table-filter.md │ │ ├── table-merge.md │ │ ├── table-rename.md │ │ └── transform-multi-table.md │ ├── sidebars.js │ └── zh/ │ ├── architecture/ │ │ ├── api-design/ │ │ │ ├── catalog-table.md │ │ │ ├── sink-architecture.md │ │ │ ├── source-architecture.md │ │ │ └── translation-layer.md │ │ ├── design-philosophy.md │ │ ├── engine/ │ │ │ ├── dag-execution.md │ │ │ ├── engine-architecture.md │ │ │ └── resource-management.md │ │ ├── fault-tolerance/ │ │ │ ├── checkpoint-mechanism.md │ │ │ └── exactly-once.md │ │ ├── features/ │ │ │ └── multi-table.md │ │ └── overview.md │ ├── connectors/ │ │ ├── changelog/ │ │ │ ├── connector-activemq.md │ │ │ ├── connector-aerospike.md │ │ │ ├── connector-amazondynamodb.md │ │ │ ├── connector-amazonsqs.md │ │ │ ├── connector-assert.md │ │ │ ├── connector-cassandra.md │ │ │ ├── connector-cdc-base.md │ │ │ ├── connector-cdc-mongodb.md │ │ │ ├── connector-cdc-mysql.md │ │ │ ├── connector-cdc-opengauss.md │ │ │ ├── connector-cdc-oracle.md │ │ │ ├── connector-cdc-postgres.md │ │ │ ├── connector-cdc-sqlserver.md │ │ │ ├── connector-cdc-tidb.md │ │ │ ├── connector-cdc.md │ │ │ ├── connector-clickhouse.md │ │ │ ├── connector-cloudberry.md │ │ │ ├── connector-common.md │ │ │ ├── connector-console.md │ │ │ ├── connector-databend.md │ │ │ ├── connector-datahub.md │ │ │ ├── connector-dingtalk.md │ │ │ ├── connector-doris.md │ │ │ ├── connector-druid.md │ │ │ ├── connector-easysearch.md │ │ │ ├── connector-elasticsearch.md │ │ │ ├── connector-email.md │ │ │ ├── connector-fake.md │ │ │ ├── connector-file-base-hadoop.md │ │ │ ├── connector-file-base.md │ │ │ ├── connector-file-cos.md │ │ │ ├── connector-file-ftp.md │ │ │ ├── connector-file-hadoop.md │ │ │ ├── connector-file-jindo-oss.md │ │ │ ├── connector-file-local.md │ │ │ 
├── connector-file-obs.md │ │ │ ├── connector-file-oss-jindo.md │ │ │ ├── connector-file-oss.md │ │ │ ├── connector-file-s3.md │ │ │ ├── connector-file-sftp.md │ │ │ ├── connector-file.md │ │ │ ├── connector-fluss.md │ │ │ ├── connector-google-firestore.md │ │ │ ├── connector-google-sheets.md │ │ │ ├── connector-graphql.md │ │ │ ├── connector-hbase.md │ │ │ ├── connector-hive.md │ │ │ ├── connector-http-airtable.md │ │ │ ├── connector-http-base.md │ │ │ ├── connector-http-feishu.md │ │ │ ├── connector-http-github.md │ │ │ ├── connector-http-gitlab.md │ │ │ ├── connector-http-jira.md │ │ │ ├── connector-http-klaviyo.md │ │ │ ├── connector-http-lemlist.md │ │ │ ├── connector-http-myhours.md │ │ │ ├── connector-http-notion.md │ │ │ ├── connector-http-onesignal.md │ │ │ ├── connector-http-persistiq.md │ │ │ ├── connector-http-wechat.md │ │ │ ├── connector-http.md │ │ │ ├── connector-hudi.md │ │ │ ├── connector-hugegraph.md │ │ │ ├── connector-iceberg.md │ │ │ ├── connector-influxdb.md │ │ │ ├── connector-iotdb.md │ │ │ ├── connector-jdbc.md │ │ │ ├── connector-kafka.md │ │ │ ├── connector-kudu.md │ │ │ ├── connector-lance.md │ │ │ ├── connector-maxcompute.md │ │ │ ├── connector-milvus.md │ │ │ ├── connector-mongodb.md │ │ │ ├── connector-neo4j.md │ │ │ ├── connector-openmldb.md │ │ │ ├── connector-paimon.md │ │ │ ├── connector-prometheus.md │ │ │ ├── connector-pulsar.md │ │ │ ├── connector-qdrant.md │ │ │ ├── connector-rabbitmq.md │ │ │ ├── connector-redis.md │ │ │ ├── connector-rocketmq.md │ │ │ ├── connector-s3-redshift.md │ │ │ ├── connector-selectdb-cloud.md │ │ │ ├── connector-sensorsdata.md │ │ │ ├── connector-sentry.md │ │ │ ├── connector-slack.md │ │ │ ├── connector-sls.md │ │ │ ├── connector-socket.md │ │ │ ├── connector-starrocks.md │ │ │ ├── connector-tablestore.md │ │ │ ├── connector-tdengine.md │ │ │ ├── connector-typesense.md │ │ │ └── connector-web3j.md │ │ ├── common-options/ │ │ │ ├── sink-common-options.md │ │ │ └── source-common-options.md │ │ ├── 
connector-isolated-dependency.md │ │ ├── formats/ │ │ │ ├── avro.md │ │ │ ├── canal-json.md │ │ │ ├── cdc-compatible-debezium-json.md │ │ │ ├── debezium-json.md │ │ │ ├── kafka-compatible-kafkaconnect-json.md │ │ │ ├── maxwell-json.md │ │ │ ├── ogg-json.md │ │ │ └── protobuf.md │ │ ├── sink/ │ │ │ ├── Activemq.md │ │ │ ├── Aerospike.md │ │ │ ├── Airtable.md │ │ │ ├── AmazonDynamoDB.md │ │ │ ├── AmazonSqs.md │ │ │ ├── Assert.md │ │ │ ├── Cassandra.md │ │ │ ├── Clickhouse.md │ │ │ ├── ClickhouseFile.md │ │ │ ├── Cloudberry.md │ │ │ ├── Console.md │ │ │ ├── CosFile.md │ │ │ ├── DB2.md │ │ │ ├── Databend.md │ │ │ ├── Datahub.md │ │ │ ├── DingTalk.md │ │ │ ├── Doris.md │ │ │ ├── Druid.md │ │ │ ├── DuckDB.md │ │ │ ├── Easysearch.md │ │ │ ├── Elasticsearch.md │ │ │ ├── Email.md │ │ │ ├── Enterprise-WeChat.md │ │ │ ├── Feishu.md │ │ │ ├── Fluss.md │ │ │ ├── FtpFile.md │ │ │ ├── GoogleFirestore.md │ │ │ ├── GraphQL.md │ │ │ ├── Greenplum.md │ │ │ ├── Hbase.md │ │ │ ├── HdfsFile.md │ │ │ ├── Hive.md │ │ │ ├── Http.md │ │ │ ├── Hudi.md │ │ │ ├── HugeGraph.md │ │ │ ├── Iceberg.md │ │ │ ├── InfluxDB.md │ │ │ ├── IoTDB.md │ │ │ ├── IoTDBv2.md │ │ │ ├── Jdbc.md │ │ │ ├── Kafka.md │ │ │ ├── Kingbase.md │ │ │ ├── Kudu.md │ │ │ ├── Lance.md │ │ │ ├── LocalFile.md │ │ │ ├── Maxcompute.md │ │ │ ├── Milvus.md │ │ │ ├── MongoDB.md │ │ │ ├── Mysql.md │ │ │ ├── Neo4j.md │ │ │ ├── ObsFile.md │ │ │ ├── OceanBase.md │ │ │ ├── Oracle.md │ │ │ ├── OssFile.md │ │ │ ├── OssJindoFile.md │ │ │ ├── Paimon.md │ │ │ ├── Phoenix.md │ │ │ ├── PostgreSql.md │ │ │ ├── Prometheus.md │ │ │ ├── Pulsar.md │ │ │ ├── Qdrant.md │ │ │ ├── Rabbitmq.md │ │ │ ├── Redis.md │ │ │ ├── Redshift.md │ │ │ ├── RocketMQ.md │ │ │ ├── S3-Redshift.md │ │ │ ├── S3File.md │ │ │ ├── SelectDB-Cloud.md │ │ │ ├── SensorsData.md │ │ │ ├── Sentry.md │ │ │ ├── SftpFile.md │ │ │ ├── Slack.md │ │ │ ├── Sls.md │ │ │ ├── Snowflake.md │ │ │ ├── Socket.md │ │ │ ├── SqlServer.md │ │ │ ├── StarRocks.md │ │ │ ├── TDengine.md │ │ │ ├── 
Tablestore.md │ │ │ ├── Typesense.md │ │ │ └── Vertica.md │ │ └── source/ │ │ ├── Airtable.md │ │ ├── AmazonDynamoDB.md │ │ ├── AmazonSqs.md │ │ ├── Cassandra.md │ │ ├── Clickhouse.md │ │ ├── Cloudberry.md │ │ ├── CosFile.md │ │ ├── DB2.md │ │ ├── Databend.md │ │ ├── Doris.md │ │ ├── DuckDB.md │ │ ├── Easysearch.md │ │ ├── Elasticsearch.md │ │ ├── FakeSource.md │ │ ├── FtpFile.md │ │ ├── Github.md │ │ ├── Gitlab.md │ │ ├── GoogleSheets.md │ │ ├── GraphQL.md │ │ ├── Greenplum.md │ │ ├── Hbase.md │ │ ├── HdfsFile.md │ │ ├── Hive.md │ │ ├── HiveJdbc.md │ │ ├── Http.md │ │ ├── Iceberg.md │ │ ├── InfluxDB.md │ │ ├── IoTDB.md │ │ ├── IoTDBv2.md │ │ ├── Jdbc.md │ │ ├── Jira.md │ │ ├── Kafka.md │ │ ├── Kingbase.md │ │ ├── Klaviyo.md │ │ ├── Kudu.md │ │ ├── Lemlist.md │ │ ├── LocalFile.md │ │ ├── Maxcompute.md │ │ ├── Milvus.md │ │ ├── MongoDB-CDC.md │ │ ├── MongoDB.md │ │ ├── MyHours.md │ │ ├── MySQL-CDC.md │ │ ├── Mysql.md │ │ ├── Neo4j.md │ │ ├── Notion.md │ │ ├── ObsFile.md │ │ ├── OceanBase.md │ │ ├── OneSignal.md │ │ ├── OpenMldb.md │ │ ├── Opengauss-CDC.md │ │ ├── Oracle-CDC.md │ │ ├── Oracle.md │ │ ├── OssFile.md │ │ ├── OssJindoFile.md │ │ ├── Paimon.md │ │ ├── Persistiq.md │ │ ├── Phoenix.md │ │ ├── PostgreSQL-CDC.md │ │ ├── PostgreSQL.md │ │ ├── Prometheus.md │ │ ├── Pulsar.md │ │ ├── Qdrant.md │ │ ├── Rabbitmq.md │ │ ├── Redis.md │ │ ├── Redshift.md │ │ ├── RocketMQ.md │ │ ├── S3File.md │ │ ├── SftpFile.md │ │ ├── Sls.md │ │ ├── Snowflake.md │ │ ├── Socket.md │ │ ├── SqlServer-CDC.md │ │ ├── SqlServer.md │ │ ├── StarRocks.md │ │ ├── TDengine.md │ │ ├── Tablestore.md │ │ ├── TiDB-CDC.md │ │ ├── Typesense.md │ │ ├── Vertica.md │ │ └── Web3j.md │ ├── developer/ │ │ ├── coding-guide.md │ │ ├── contribute-plugin.md │ │ ├── contribute-transform-v2-guide.md │ │ ├── docs-format-specification.md │ │ ├── how-to-create-your-connector.md │ │ ├── new-license.md │ │ └── setup.md │ ├── engines/ │ │ ├── command/ │ │ │ ├── connector-check.md │ │ │ └── usage.mdx │ │ ├── 
event-listener.md │ │ ├── flink.md │ │ ├── overview.md │ │ ├── spark.md │ │ └── zeta/ │ │ ├── about.md │ │ ├── checkpoint-storage.md │ │ ├── deployment.md │ │ ├── download-seatunnel.md │ │ ├── engine-jar-storage-mode.md │ │ ├── hybrid-cluster-deployment.md │ │ ├── local-mode-deployment.md │ │ ├── logging.md │ │ ├── resource-isolation.md │ │ ├── rest-api-v1.md │ │ ├── rest-api-v2.md │ │ ├── security.md │ │ ├── separated-cluster-deployment.md │ │ ├── slot-allocation-strategy.md │ │ ├── tcp.md │ │ ├── telemetry.md │ │ ├── tuning-guide.md │ │ ├── user-command.md │ │ └── web-ui.md │ ├── faq.md │ ├── getting-started/ │ │ ├── docker/ │ │ │ └── docker.md │ │ ├── kubernetes/ │ │ │ ├── helm.md │ │ │ └── kubernetes.mdx │ │ └── locally/ │ │ ├── deployment.md │ │ ├── quick-start-flink.md │ │ ├── quick-start-seatunnel-engine.md │ │ └── quick-start-spark.md │ ├── introduction/ │ │ ├── about.md │ │ ├── concepts/ │ │ │ ├── config.md │ │ │ ├── connector-v2-features.md │ │ │ ├── gravitino-type-mapping.md │ │ │ ├── incompatible-changes.md │ │ │ └── schema-feature.md │ │ ├── configuration/ │ │ │ ├── JobEnvConfig.md │ │ │ ├── config-encryption-decryption.md │ │ │ ├── metalake.md │ │ │ ├── schema-evolution.md │ │ │ ├── sink-options-placeholders.md │ │ │ ├── speed-limit.md │ │ │ └── sql-config.md │ │ └── how-it-works.md │ ├── tools/ │ │ ├── overview.md │ │ ├── seatunnel-mcp.md │ │ ├── seatunnel-skill.md │ │ └── x2seatunnel.md │ └── transforms/ │ ├── common-options/ │ │ └── common-options.md │ ├── copy.md │ ├── data-validator.md │ ├── define-sink-type.md │ ├── dynamic-compile.md │ ├── embedding.md │ ├── encrypt.md │ ├── field-mapper.md │ ├── field-rename.md │ ├── filter-rowkind.md │ ├── filter.md │ ├── jsonpath.md │ ├── llm.md │ ├── metadata.md │ ├── regexextract.md │ ├── replace.md │ ├── rowkind-extractor.md │ ├── split.md │ ├── sql-functions.md │ ├── sql-udf.md │ ├── sql.md │ ├── table-filter.md │ ├── table-merge.md │ ├── table-rename.md │ └── transform-multi-table.md ├── mvnw ├── 
mvnw.cmd ├── plugin-mapping.properties ├── plugins/ │ └── README.md ├── pom.xml ├── seatunnel-api/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── api/ │ │ ├── annotation/ │ │ │ └── Experimental.java │ │ ├── common/ │ │ │ ├── JobContext.java │ │ │ ├── PluginIdentifier.java │ │ │ ├── PluginIdentifierInterface.java │ │ │ ├── PrepareFailException.java │ │ │ ├── SeaTunnelAPIErrorCode.java │ │ │ ├── SeaTunnelPluginLifeCycle.java │ │ │ └── metrics/ │ │ │ ├── AbstractMetricsContext.java │ │ │ ├── Counter.java │ │ │ ├── JobMetrics.java │ │ │ ├── Measurement.java │ │ │ ├── MeasurementPredicates.java │ │ │ ├── Meter.java │ │ │ ├── Metric.java │ │ │ ├── MetricNames.java │ │ │ ├── MetricTags.java │ │ │ ├── MetricsContext.java │ │ │ ├── RawJobMetrics.java │ │ │ ├── ThreadSafeCounter.java │ │ │ ├── ThreadSafeQPSMeter.java │ │ │ └── Unit.java │ │ ├── configuration/ │ │ │ ├── ConfigAdapter.java │ │ │ ├── ConfigShade.java │ │ │ ├── Option.java │ │ │ ├── Options.java │ │ │ ├── ReadonlyConfig.java │ │ │ ├── SingleChoiceOption.java │ │ │ └── util/ │ │ │ ├── Condition.java │ │ │ ├── ConfigUtil.java │ │ │ ├── ConfigValidator.java │ │ │ ├── Expression.java │ │ │ ├── OptionMark.java │ │ │ ├── OptionRule.java │ │ │ ├── OptionUtil.java │ │ │ ├── OptionValidationException.java │ │ │ └── RequiredOption.java │ │ ├── env/ │ │ │ └── ParsingMode.java │ │ ├── event/ │ │ │ ├── DefaultEventProcessor.java │ │ │ ├── Event.java │ │ │ ├── EventHandler.java │ │ │ ├── EventListener.java │ │ │ ├── EventProcessor.java │ │ │ ├── EventType.java │ │ │ ├── LifecycleEvent.java │ │ │ └── LoggingEventHandler.java │ │ ├── metalake/ │ │ │ ├── MetaLakeFactory.java │ │ │ ├── MetaLakeTableSchemaConvertor.java │ │ │ ├── MetalakeClient.java │ │ │ ├── MetalakeConfigUtils.java │ │ │ ├── TableSchemaDiscoverer.java │ │ │ └── gravitino/ │ │ │ ├── GravitinoClient.java │ │ │ └── GravitinoTableSchemaConvertor.java │ │ ├── options/ │ │ │ ├── ConnectorCommonOptions.java │ 
│ │ ├── EnvCommonOptions.java │ │ │ ├── EnvOptionRule.java │ │ │ ├── SinkConnectorCommonOptions.java │ │ │ ├── SourceConnectorCommonOptions.java │ │ │ └── table/ │ │ │ ├── CatalogOptions.java │ │ │ ├── ColumnOptions.java │ │ │ ├── ConstraintKeyOptions.java │ │ │ ├── FieldOptions.java │ │ │ ├── FormatOptions.java │ │ │ ├── PrimaryKeyOptions.java │ │ │ ├── TableIdentifierOptions.java │ │ │ └── TableSchemaOptions.java │ │ ├── serialization/ │ │ │ ├── DefaultSerializer.java │ │ │ ├── DeserializationSchema.java │ │ │ ├── SerializationSchema.java │ │ │ └── Serializer.java │ │ ├── sink/ │ │ │ ├── DataSaveMode.java │ │ │ ├── DefaultSaveModeHandler.java │ │ │ ├── DefaultSinkWriterContext.java │ │ │ ├── MultiTableResourceManager.java │ │ │ ├── SaveModeExecuteLocation.java │ │ │ ├── SaveModeExecuteWrapper.java │ │ │ ├── SaveModeHandler.java │ │ │ ├── SaveModePlaceHolder.java │ │ │ ├── SchemaSaveMode.java │ │ │ ├── SeaTunnelSink.java │ │ │ ├── SinkAggregatedCommitter.java │ │ │ ├── SinkCommitter.java │ │ │ ├── SinkWriter.java │ │ │ ├── SupportMultiTableSink.java │ │ │ ├── SupportMultiTableSinkAggregatedCommitter.java │ │ │ ├── SupportMultiTableSinkWriter.java │ │ │ ├── SupportResourceShare.java │ │ │ ├── SupportSaveMode.java │ │ │ ├── SupportSchemaEvolutionSink.java │ │ │ ├── SupportSchemaEvolutionSinkWriter.java │ │ │ ├── TablePlaceholder.java │ │ │ ├── TablePlaceholderProcessor.java │ │ │ ├── event/ │ │ │ │ └── WriterCloseEvent.java │ │ │ └── multitablesink/ │ │ │ ├── MultiTableAggregatedCommitInfo.java │ │ │ ├── MultiTableCommitInfo.java │ │ │ ├── MultiTableSink.java │ │ │ ├── MultiTableSinkAggregatedCommitter.java │ │ │ ├── MultiTableSinkCommitter.java │ │ │ ├── MultiTableSinkFactory.java │ │ │ ├── MultiTableSinkWriter.java │ │ │ ├── MultiTableState.java │ │ │ ├── MultiTableWriterRunnable.java │ │ │ ├── SinkContextProxy.java │ │ │ └── SinkIdentifier.java │ │ ├── source/ │ │ │ ├── Boundedness.java │ │ │ ├── Collector.java │ │ │ ├── SeaTunnelJobAware.java │ │ │ ├── 
SeaTunnelSource.java │ │ │ ├── SourceEvent.java │ │ │ ├── SourceReader.java │ │ │ ├── SourceSplit.java │ │ │ ├── SourceSplitEnumerator.java │ │ │ ├── SupportColumnProjection.java │ │ │ ├── SupportCoordinate.java │ │ │ ├── SupportParallelism.java │ │ │ ├── SupportSchemaEvolution.java │ │ │ └── event/ │ │ │ ├── EnumeratorCloseEvent.java │ │ │ ├── EnumeratorOpenEvent.java │ │ │ ├── MessageDelayedEvent.java │ │ │ ├── ReaderCloseEvent.java │ │ │ └── ReaderOpenEvent.java │ │ ├── state/ │ │ │ └── CheckpointListener.java │ │ ├── table/ │ │ │ ├── catalog/ │ │ │ │ ├── AbstractSchema.java │ │ │ │ ├── Catalog.java │ │ │ │ ├── CatalogTable.java │ │ │ │ ├── CatalogTableUtil.java │ │ │ │ ├── Column.java │ │ │ │ ├── ConstraintKey.java │ │ │ │ ├── DataTypeConvertor.java │ │ │ │ ├── InfoPreviewResult.java │ │ │ │ ├── MetadataColumn.java │ │ │ │ ├── MetadataSchema.java │ │ │ │ ├── PhysicalColumn.java │ │ │ │ ├── PreviewResult.java │ │ │ │ ├── PrimaryKey.java │ │ │ │ ├── SQLPreviewResult.java │ │ │ │ ├── SeaTunnelDataTypeConvertorUtil.java │ │ │ │ ├── TableIdentifier.java │ │ │ │ ├── TablePath.java │ │ │ │ ├── TableSchema.java │ │ │ │ ├── VectorIndex.java │ │ │ │ ├── exception/ │ │ │ │ │ ├── CatalogException.java │ │ │ │ │ ├── DatabaseAlreadyExistException.java │ │ │ │ │ ├── DatabaseNotExistException.java │ │ │ │ │ ├── TableAlreadyExistException.java │ │ │ │ │ └── TableNotExistException.java │ │ │ │ └── schema/ │ │ │ │ ├── ReadonlyConfigParser.java │ │ │ │ └── TableSchemaParser.java │ │ │ ├── connector/ │ │ │ │ ├── DeserializationFormat.java │ │ │ │ ├── SerializationFormat.java │ │ │ │ ├── SupportReadingMetadata.java │ │ │ │ ├── TableSink.java │ │ │ │ ├── TableSource.java │ │ │ │ └── TableTransform.java │ │ │ ├── converter/ │ │ │ │ ├── BasicDataConverter.java │ │ │ │ ├── BasicDataTypeConverter.java │ │ │ │ ├── BasicTypeConverter.java │ │ │ │ ├── BasicTypeDefine.java │ │ │ │ ├── ConverterLoader.java │ │ │ │ ├── DataConverter.java │ │ │ │ ├── DataTypeConverter.java │ │ │ │ └── 
TypeConverter.java │ │ │ ├── factory/ │ │ │ │ ├── CatalogFactory.java │ │ │ │ ├── ChangeStreamTableSourceCheckpoint.java │ │ │ │ ├── ChangeStreamTableSourceFactory.java │ │ │ │ ├── ChangeStreamTableSourceState.java │ │ │ │ ├── DataTypeConvertorFactory.java │ │ │ │ ├── Factory.java │ │ │ │ ├── FactoryException.java │ │ │ │ ├── FactoryUtil.java │ │ │ │ ├── MultiTableFactoryContext.java │ │ │ │ ├── SerializationFormatFactory.java │ │ │ │ ├── TableFactoryContext.java │ │ │ │ ├── TableSinkFactory.java │ │ │ │ ├── TableSinkFactoryContext.java │ │ │ │ ├── TableSourceFactory.java │ │ │ │ ├── TableSourceFactoryContext.java │ │ │ │ ├── TableTransformFactory.java │ │ │ │ └── TableTransformFactoryContext.java │ │ │ ├── schema/ │ │ │ │ ├── SchemaChangeType.java │ │ │ │ ├── event/ │ │ │ │ │ ├── AlterTableAddColumnEvent.java │ │ │ │ │ ├── AlterTableChangeColumnEvent.java │ │ │ │ │ ├── AlterTableColumnEvent.java │ │ │ │ │ ├── AlterTableColumnsEvent.java │ │ │ │ │ ├── AlterTableDropColumnEvent.java │ │ │ │ │ ├── AlterTableEvent.java │ │ │ │ │ ├── AlterTableModifyColumnEvent.java │ │ │ │ │ ├── AlterTableNameEvent.java │ │ │ │ │ ├── SchemaChangeEvent.java │ │ │ │ │ └── TableEvent.java │ │ │ │ ├── exception/ │ │ │ │ │ ├── SchemaCoordinationException.java │ │ │ │ │ ├── SchemaEvolutionErrorCode.java │ │ │ │ │ ├── SchemaEvolutionException.java │ │ │ │ │ ├── SchemaValidationException.java │ │ │ │ │ └── SinkWriterSchemaException.java │ │ │ │ └── handler/ │ │ │ │ ├── AlterTableEventHandler.java │ │ │ │ ├── AlterTableSchemaEventHandler.java │ │ │ │ ├── DataTypeChangeEventDispatcher.java │ │ │ │ ├── DataTypeChangeEventHandler.java │ │ │ │ ├── SchemaChangeEventHandler.java │ │ │ │ ├── TableSchemaChangeEventDispatcher.java │ │ │ │ └── TableSchemaChangeEventHandler.java │ │ │ └── type/ │ │ │ ├── ArrayType.java │ │ │ ├── BasicType.java │ │ │ ├── CommonOptions.java │ │ │ ├── CompositeType.java │ │ │ ├── DecimalArrayType.java │ │ │ ├── DecimalType.java │ │ │ ├── LocalTimeType.java │ │ │ ├── 
MapType.java │ │ │ ├── MetadataUtil.java │ │ │ ├── MultipleRowType.java │ │ │ ├── PrimitiveByteArrayType.java │ │ │ ├── Record.java │ │ │ ├── RowKind.java │ │ │ ├── SeaTunnelDataType.java │ │ │ ├── SeaTunnelRow.java │ │ │ ├── SeaTunnelRowAccessor.java │ │ │ ├── SeaTunnelRowType.java │ │ │ ├── SqlType.java │ │ │ ├── TypeUtil.java │ │ │ └── VectorType.java │ │ ├── tracing/ │ │ │ ├── MDCCallable.java │ │ │ ├── MDCComparator.java │ │ │ ├── MDCConsumer.java │ │ │ ├── MDCContext.java │ │ │ ├── MDCExecutor.java │ │ │ ├── MDCExecutorService.java │ │ │ ├── MDCFunction.java │ │ │ ├── MDCPredicate.java │ │ │ ├── MDCRunnable.java │ │ │ ├── MDCScheduledExecutorService.java │ │ │ ├── MDCStream.java │ │ │ ├── MDCSupplier.java │ │ │ └── MDCTracer.java │ │ └── transform/ │ │ ├── Collector.java │ │ ├── SeaTunnelFlatMapTransform.java │ │ ├── SeaTunnelMapTransform.java │ │ └── SeaTunnelTransform.java │ └── test/ │ ├── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── api/ │ │ ├── configuration/ │ │ │ ├── OptionTest.java │ │ │ ├── ReadableConfigTest.java │ │ │ └── util/ │ │ │ ├── ConditionTest.java │ │ │ ├── ConfigUtilTest.java │ │ │ ├── ConfigValidatorTest.java │ │ │ ├── OptionRuleTest.java │ │ │ ├── OptionUtilTest.java │ │ │ ├── SingleChoiceOptionTest.java │ │ │ ├── TestOptionConfig.java │ │ │ └── TestOptionConfigEnum.java │ │ ├── env/ │ │ │ └── EnvOptionRuleTest.java │ │ ├── metalake/ │ │ │ ├── TableSchemaDiscovererTest.java │ │ │ └── gravitino/ │ │ │ ├── GravitinoClientTest.java │ │ │ └── GravitinoTableSchemaConvertorTest.java │ │ ├── sink/ │ │ │ ├── DefaultSaveModeHandlerTest.java │ │ │ ├── TablePlaceholderProcessorTest.java │ │ │ └── multitablesink/ │ │ │ ├── MultiTableSinkAggregatedCommitterTest.java │ │ │ ├── MultiTableSinkCommitterTest.java │ │ │ └── MultiTableSinkWriterTest.java │ │ ├── table/ │ │ │ ├── catalog/ │ │ │ │ ├── CatalogTableTest.java │ │ │ │ ├── CatalogTableUtilTest.java │ │ │ │ ├── InMemoryCatalog.java │ │ │ │ ├── InMemoryCatalogFactory.java │ │ │ │ 
├── InMemoryCatalogOptionRule.java │ │ │ │ ├── SeaTunnelDataTypeConvertorUtilTest.java │ │ │ │ └── schema/ │ │ │ │ ├── BaseConfigParserTest.java │ │ │ │ └── ReadonlyConfigParserTest.java │ │ │ ├── schema/ │ │ │ │ └── event/ │ │ │ │ └── EventTest.java │ │ │ └── type/ │ │ │ └── SeaTunnelRowTest.java │ │ └── tracing/ │ │ └── MDCTracerTest.java │ └── resources/ │ └── conf/ │ ├── catalog/ │ │ ├── schema_column.conf │ │ └── schema_field.conf │ ├── complex.schema.conf │ ├── config_special_schema.conf │ ├── default_tablepath.conf │ ├── generic_row.schema.conf │ ├── getCatalogTable.conf │ ├── json/ │ │ ├── metadata_json_from_meta_lake_hive.json │ │ └── metadata_json_from_meta_lake_pgsql.json │ ├── option-test.conf │ ├── partition_keys.schema.conf │ ├── simple.schema.conf │ └── table_schema_discoverer/ │ ├── multiple_tables_fields.conf │ ├── multiple_tables_mixed.conf │ ├── multiple_tables_no_schema_mixed_format.conf │ ├── multiple_tables_schema_url.conf │ ├── single_no_schema.conf │ ├── single_schema_field.conf │ └── single_schema_url.conf ├── seatunnel-ci-tools/ │ ├── pom.xml │ └── src/ │ └── test/ │ └── java/ │ └── org/ │ └── apache/ │ └── seatunnel/ │ └── api/ │ ├── ChineseCharacterCheckTest.java │ ├── ConnectorOptionCheckTest.java │ ├── ImportClassCheckTest.java │ ├── SerialVersionUIDCheckerTest.java │ ├── SpotlessImportReplacementTest.java │ ├── UTClassNameCheckTest.java │ └── file/ │ ├── AllFileSpecificationCheckTest.java │ └── MarkdownTest.java ├── seatunnel-common/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── common/ │ │ ├── Constants.java │ │ ├── Handover.java │ │ ├── config/ │ │ │ ├── CheckConfigUtil.java │ │ │ ├── CheckResult.java │ │ │ ├── Common.java │ │ │ ├── ConfigRuntimeException.java │ │ │ ├── DeployMode.java │ │ │ └── TypesafeConfigUtils.java │ │ ├── constants/ │ │ │ ├── CollectionConstants.java │ │ │ ├── EngineType.java │ │ │ ├── JobMode.java │ │ │ ├── MetaLakeType.java │ │ │ └── 
PluginType.java │ │ ├── exception/ │ │ │ ├── CommonError.java │ │ │ ├── CommonErrorCode.java │ │ │ ├── CommonErrorCodeDeprecated.java │ │ │ ├── ExceptionParamsUtil.java │ │ │ ├── SeaTunnelErrorCode.java │ │ │ └── SeaTunnelRuntimeException.java │ │ └── utils/ │ │ ├── DateTimeUtils.java │ │ ├── DateUtils.java │ │ ├── EncodingUtils.java │ │ ├── ExceptionUtils.java │ │ ├── FileUtils.java │ │ ├── JdbcUrlUtil.java │ │ ├── JsonUtils.java │ │ ├── ParserException.java │ │ ├── PlaceholderUtils.java │ │ ├── ReflectionUtils.java │ │ ├── RetryUtils.java │ │ ├── SeaTunnelException.java │ │ ├── SerializationException.java │ │ ├── SerializationUtils.java │ │ ├── StringFormatUtils.java │ │ ├── TemporaryClassLoaderContext.java │ │ ├── TimeUtils.java │ │ ├── VariablesSubstitute.java │ │ ├── VectorUtils.java │ │ └── function/ │ │ ├── ConsumerWithException.java │ │ ├── FunctionWithException.java │ │ ├── RunnableWithException.java │ │ └── SupplierWithException.java │ └── test/ │ └── java/ │ └── org/ │ └── apache/ │ └── seatunnel/ │ └── common/ │ ├── HandoverTest.java │ ├── config/ │ │ ├── CheckConfigUtilTest.java │ │ ├── CommonTest.java │ │ └── TypesafeConfigUtilsTest.java │ ├── exception/ │ │ └── ExceptionParamsUtilTest.java │ └── utils/ │ ├── DateTimeUtilsTest.java │ ├── DateUtilsTest.java │ ├── ExceptionUtilsTest.java │ ├── FileUtilsTest.java │ ├── JdbcUrlUtilTest.java │ ├── ReflectionUtilsTest.java │ ├── SerializationUtilsTest.java │ ├── StringFormatUtilsTest.java │ ├── TimeUtilsTest.java │ ├── VariablesSubstituteTest.java │ └── VectorUtilsTest.java ├── seatunnel-config/ │ ├── README.md │ ├── pom.xml │ ├── seatunnel-config-base/ │ │ └── pom.xml │ ├── seatunnel-config-shade/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── shade/ │ │ │ └── com/ │ │ │ └── typesafe/ │ │ │ └── config/ │ │ │ ├── ConfigMergeable.java │ │ │ ├── ConfigParseOptions.java │ │ │ └── impl/ │ │ │ ├── ConfigImpl.java │ │ │ ├── 
ConfigNodePath.java │ │ │ ├── ConfigParser.java │ │ │ ├── Path.java │ │ │ ├── PathParser.java │ │ │ ├── PropertiesParser.java │ │ │ ├── SimpleConfigObject.java │ │ │ └── Tokenizer.java │ │ └── test/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ ├── config/ │ │ │ │ ├── CompleteTest.java │ │ │ │ ├── ConfigFactoryTest.java │ │ │ │ ├── ConfigTest.java │ │ │ │ ├── JsonFormatTest.java │ │ │ │ ├── SerializeTest.java │ │ │ │ └── utils/ │ │ │ │ └── FileUtils.java │ │ │ └── shade/ │ │ │ └── com/ │ │ │ └── typesafe/ │ │ │ └── config/ │ │ │ └── impl/ │ │ │ └── ConfigTest.java │ │ └── resources/ │ │ ├── factory/ │ │ │ └── config.conf │ │ ├── json/ │ │ │ ├── spark.batch.conf │ │ │ └── spark.batch.json │ │ └── seatunnel/ │ │ ├── configWithSpecialKey.conf │ │ ├── schema_columns.conf │ │ ├── schema_fields.conf │ │ ├── serialize.conf │ │ └── variables.conf │ └── seatunnel-config-sql/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── config/ │ │ └── sql/ │ │ ├── ConfigTemplate.java │ │ ├── SqlConfigAdapter.java │ │ ├── SqlConfigBuilder.java │ │ ├── model/ │ │ │ ├── BaseConfig.java │ │ │ ├── Option.java │ │ │ ├── SeaTunnelConfig.java │ │ │ ├── SinkConfig.java │ │ │ ├── SourceConfig.java │ │ │ └── TransformConfig.java │ │ └── utils/ │ │ └── Constant.java │ └── test/ │ ├── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── config/ │ │ └── sql/ │ │ └── SqlConfigBuilderTest.java │ └── resources/ │ └── sql-config.sql ├── seatunnel-connectors-v2/ │ ├── README.md │ ├── README.zh.md │ ├── connector-activemq/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── activemq/ │ │ │ ├── client/ │ │ │ │ └── ActivemqClient.java │ │ │ ├── config/ │ │ │ │ └── ActivemqSinkOptions.java │ │ │ ├── exception/ │ │ │ │ ├── ActivemqConnectorErrorCode.java │ │ │ │ └── ActivemqConnectorException.java │ │ │ 
└── sink/ │ │ │ ├── ActivemqSink.java │ │ │ ├── ActivemqSinkFactory.java │ │ │ └── ActivemqSinkWriter.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── activemq/ │ │ └── ActivemqFactoryTest.java │ ├── connector-aerospike/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── aerospike/ │ │ │ ├── config/ │ │ │ │ ├── AerospikeDataType.java │ │ │ │ ├── AerospikeSinkOptions.java │ │ │ │ └── DataFormatType.java │ │ │ ├── exception/ │ │ │ │ ├── AerospikeConnectorException.java │ │ │ │ └── AerospikeErrorCode.java │ │ │ └── sink/ │ │ │ ├── AerospikeSink.java │ │ │ ├── AerospikeSinkFactory.java │ │ │ ├── AerospikeSinkWriter.java │ │ │ └── AerospikeTypeConverter.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── aerospike/ │ │ └── AerospikeFactoryTest.java │ ├── connector-amazondynamodb/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── amazondynamodb/ │ │ │ ├── config/ │ │ │ │ ├── AmazonDynamoDBBaseOptions.java │ │ │ │ ├── AmazonDynamoDBConfig.java │ │ │ │ ├── AmazonDynamoDBSinkOptions.java │ │ │ │ └── AmazonDynamoDBSourceOptions.java │ │ │ ├── exception/ │ │ │ │ └── AmazonDynamoDBConnectorException.java │ │ │ ├── serialize/ │ │ │ │ ├── DefaultSeaTunnelRowDeserializer.java │ │ │ │ ├── DefaultSeaTunnelRowSerializer.java │ │ │ │ ├── SeaTunnelRowDeserializer.java │ │ │ │ └── SeaTunnelRowSerializer.java │ │ │ ├── sink/ │ │ │ │ ├── AmazonDynamoDBSink.java │ │ │ │ ├── AmazonDynamoDBSinkFactory.java │ │ │ │ ├── AmazonDynamoDBWriter.java │ │ │ │ └── DynamoDbSinkClient.java │ │ │ └── source/ │ │ │ ├── AmazonDynamoDBSource.java │ │ │ ├── AmazonDynamoDBSourceFactory.java │ │ │ ├── 
AmazonDynamoDBSourceReader.java │ │ │ ├── AmazonDynamoDBSourceSplit.java │ │ │ ├── AmazonDynamoDBSourceSplitEnumerator.java │ │ │ └── AmazonDynamoDBSourceState.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── amazondynamodb/ │ │ └── AmazonDynamoDBSourceFactoryTest.java │ ├── connector-amazonsqs/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── amazonsqs/ │ │ │ ├── config/ │ │ │ │ ├── AmazonSqsBaseOptions.java │ │ │ │ ├── AmazonSqsSinkOptions.java │ │ │ │ ├── AmazonSqsSourceConfig.java │ │ │ │ ├── AmazonSqsSourceOptions.java │ │ │ │ └── MessageFormat.java │ │ │ ├── deserialize/ │ │ │ │ ├── AmazonSqsDeserializer.java │ │ │ │ └── SeaTunnelRowDeserializer.java │ │ │ ├── exception/ │ │ │ │ └── AmazonSqsConnectorException.java │ │ │ ├── sink/ │ │ │ │ ├── AmazonSqsSink.java │ │ │ │ ├── AmazonSqsSinkFactory.java │ │ │ │ └── AmazonSqsSinkWriter.java │ │ │ └── source/ │ │ │ ├── AmazonSqsSource.java │ │ │ ├── AmazonSqsSourceFactory.java │ │ │ └── AmazonSqsSourceReader.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── amazonsqs/ │ │ └── AmazonSqsSourceFactoryTest.java │ ├── connector-assert/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── assertion/ │ │ │ ├── excecutor/ │ │ │ │ └── AssertExecutor.java │ │ │ ├── exception/ │ │ │ │ ├── AssertConnectorErrorCode.java │ │ │ │ └── AssertConnectorException.java │ │ │ ├── rule/ │ │ │ │ ├── AssertCatalogTableRule.java │ │ │ │ ├── AssertCatalogTableRuleParser.java │ │ │ │ ├── AssertFieldRule.java │ │ │ │ ├── AssertRuleParser.java │ │ │ │ └── AssertTableRule.java │ │ │ └── sink/ │ │ │ ├── AssertConfig.java │ │ │ ├── AssertSink.java │ │ │ ├── 
AssertSinkFactory.java │ │ │ ├── AssertSinkOptions.java │ │ │ ├── AssertSinkWriter.java │ │ │ ├── FieldRule.java │ │ │ ├── RowRule.java │ │ │ └── Rules.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── flink/ │ │ └── assertion/ │ │ ├── AssertExecutorTest.java │ │ ├── AssertFactoryTest.java │ │ └── rule/ │ │ └── AssertRuleParserTest.java │ ├── connector-cassandra/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── cassandra/ │ │ │ ├── client/ │ │ │ │ └── CassandraClient.java │ │ │ ├── config/ │ │ │ │ ├── CassandraBaseOptions.java │ │ │ │ ├── CassandraParameters.java │ │ │ │ ├── CassandraSinkOptions.java │ │ │ │ └── CassandraSourceOptions.java │ │ │ ├── exception/ │ │ │ │ ├── CassandraConnectorErrorCode.java │ │ │ │ └── CassandraConnectorException.java │ │ │ ├── sink/ │ │ │ │ ├── CassandraSink.java │ │ │ │ ├── CassandraSinkFactory.java │ │ │ │ └── CassandraSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── CassandraSource.java │ │ │ │ ├── CassandraSourceFactory.java │ │ │ │ └── CassandraSourceReader.java │ │ │ └── util/ │ │ │ └── TypeConvertUtil.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── cassandra/ │ │ └── CassandraFactoryTest.java │ ├── connector-cdc/ │ │ ├── connector-cdc-base/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ ├── io/ │ │ │ │ │ └── debezium/ │ │ │ │ │ ├── connector/ │ │ │ │ │ │ └── base/ │ │ │ │ │ │ └── ChangeEventQueue.java │ │ │ │ │ ├── heartbeat/ │ │ │ │ │ │ ├── DefaultHeartbeatConnectionProvider.java │ │ │ │ │ │ └── HeartbeatFactory.java │ │ │ │ │ └── relational/ │ │ │ │ │ ├── HistorizedRelationalDatabaseConnectorConfig.java │ │ │ │ │ └── TableId.java │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── cdc/ │ │ │ │ ├── base/ │ │ │ │ │ ├── config/ 
│ │ │ │ │ │ ├── BaseSourceConfig.java │ │ │ │ │ │ ├── JdbcSourceConfig.java │ │ │ │ │ │ ├── JdbcSourceConfigFactory.java │ │ │ │ │ │ ├── JdbcSourceTableConfig.java │ │ │ │ │ │ ├── SourceConfig.java │ │ │ │ │ │ ├── StartupConfig.java │ │ │ │ │ │ └── StopConfig.java │ │ │ │ │ ├── dialect/ │ │ │ │ │ │ ├── DataSourceDialect.java │ │ │ │ │ │ └── JdbcDataSourceDialect.java │ │ │ │ │ ├── option/ │ │ │ │ │ │ ├── JdbcSourceOptions.java │ │ │ │ │ │ ├── SourceOptions.java │ │ │ │ │ │ ├── StartupMode.java │ │ │ │ │ │ └── StopMode.java │ │ │ │ │ ├── relational/ │ │ │ │ │ │ ├── JdbcSourceEventDispatcher.java │ │ │ │ │ │ └── connection/ │ │ │ │ │ │ ├── ConnectionPoolId.java │ │ │ │ │ │ ├── ConnectionPools.java │ │ │ │ │ │ ├── JdbcConnectionFactory.java │ │ │ │ │ │ ├── JdbcConnectionPoolFactory.java │ │ │ │ │ │ └── JdbcConnectionPools.java │ │ │ │ │ ├── schema/ │ │ │ │ │ │ ├── AbstractSchemaChangeResolver.java │ │ │ │ │ │ └── SchemaChangeResolver.java │ │ │ │ │ ├── source/ │ │ │ │ │ │ ├── BaseChangeStreamTableSourceFactory.java │ │ │ │ │ │ ├── IncrementalSource.java │ │ │ │ │ │ ├── enumerator/ │ │ │ │ │ │ │ ├── HybridSplitAssigner.java │ │ │ │ │ │ │ ├── IncrementalSourceEnumerator.java │ │ │ │ │ │ │ ├── IncrementalSplitAssigner.java │ │ │ │ │ │ │ ├── SnapshotSplitAssigner.java │ │ │ │ │ │ │ ├── SplitAssigner.java │ │ │ │ │ │ │ ├── splitter/ │ │ │ │ │ │ │ │ ├── AbstractJdbcSourceChunkSplitter.java │ │ │ │ │ │ │ │ ├── ChunkRange.java │ │ │ │ │ │ │ │ ├── ChunkSplitter.java │ │ │ │ │ │ │ │ └── JdbcSourceChunkSplitter.java │ │ │ │ │ │ │ └── state/ │ │ │ │ │ │ │ ├── HybridPendingSplitsState.java │ │ │ │ │ │ │ ├── IncrementalPhaseState.java │ │ │ │ │ │ │ ├── PendingSplitsState.java │ │ │ │ │ │ │ └── SnapshotPhaseState.java │ │ │ │ │ │ ├── event/ │ │ │ │ │ │ │ ├── CompletedSnapshotPhaseEvent.java │ │ │ │ │ │ │ ├── CompletedSnapshotSplitsAckEvent.java │ │ │ │ │ │ │ ├── CompletedSnapshotSplitsReportEvent.java │ │ │ │ │ │ │ └── SnapshotSplitWatermark.java │ │ │ │ │ │ ├── offset/ │ │ │ │ │ │ 
│ ├── Offset.java │ │ │ │ │ │ │ └── OffsetFactory.java │ │ │ │ │ │ ├── parser/ │ │ │ │ │ │ │ └── SeatunnelDDLParser.java │ │ │ │ │ │ ├── reader/ │ │ │ │ │ │ │ ├── IncrementalSourceReader.java │ │ │ │ │ │ │ ├── IncrementalSourceRecordEmitter.java │ │ │ │ │ │ │ ├── IncrementalSourceSplitReader.java │ │ │ │ │ │ │ └── external/ │ │ │ │ │ │ │ ├── FetchTask.java │ │ │ │ │ │ │ ├── Fetcher.java │ │ │ │ │ │ │ ├── IncrementalSourceScanFetcher.java │ │ │ │ │ │ │ ├── IncrementalSourceStreamFetcher.java │ │ │ │ │ │ │ └── JdbcSourceFetchTaskContext.java │ │ │ │ │ │ └── split/ │ │ │ │ │ │ ├── ChangeEventRecords.java │ │ │ │ │ │ ├── CompletedSnapshotSplitInfo.java │ │ │ │ │ │ ├── IncrementalSplit.java │ │ │ │ │ │ ├── SnapshotSplit.java │ │ │ │ │ │ ├── SourceRecords.java │ │ │ │ │ │ ├── SourceSplitBase.java │ │ │ │ │ │ ├── state/ │ │ │ │ │ │ │ ├── IncrementalSplitState.java │ │ │ │ │ │ │ ├── SnapshotSplitState.java │ │ │ │ │ │ │ └── SourceSplitStateBase.java │ │ │ │ │ │ └── wartermark/ │ │ │ │ │ │ ├── WatermarkEvent.java │ │ │ │ │ │ └── WatermarkKind.java │ │ │ │ │ └── utils/ │ │ │ │ │ ├── CatalogTableUtils.java │ │ │ │ │ ├── MessageDelayedEventLimiter.java │ │ │ │ │ ├── ObjectUtils.java │ │ │ │ │ └── SourceRecordUtils.java │ │ │ │ └── debezium/ │ │ │ │ ├── AbstractDebeziumDeserializationSchema.java │ │ │ │ ├── ConnectTableChangeSerializer.java │ │ │ │ ├── DebeziumDeserializationConverter.java │ │ │ │ ├── DebeziumDeserializationConverterFactory.java │ │ │ │ ├── DebeziumDeserializationSchema.java │ │ │ │ ├── DeserializeFormat.java │ │ │ │ ├── EmbeddedDatabaseHistory.java │ │ │ │ ├── MetadataConverter.java │ │ │ │ ├── row/ │ │ │ │ │ ├── DebeziumJsonDeserializeSchema.java │ │ │ │ │ ├── SeaTunnelRowDebeziumDeserializationConverters.java │ │ │ │ │ └── SeaTunnelRowDebeziumDeserializeSchema.java │ │ │ │ └── utils/ │ │ │ │ └── TemporalConversions.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ ├── jdbc/ │ │ │ │ └── source/ │ │ │ │ └── JdbcSourceChunkSplitterTest.java │ │ │ └── org/ │ │ │ └── 
apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── cdc/ │ │ │ ├── base/ │ │ │ │ ├── schema/ │ │ │ │ │ └── AbstractSchemaChangeResolverTest.java │ │ │ │ ├── source/ │ │ │ │ │ ├── enumerator/ │ │ │ │ │ │ ├── HybridSplitAssignerTest.java │ │ │ │ │ │ └── splitter/ │ │ │ │ │ │ └── AbstractJdbcSourceChunkSplitterTest.java │ │ │ │ │ ├── reader/ │ │ │ │ │ │ ├── IncrementalSourceSplitReaderTest.java │ │ │ │ │ │ └── external/ │ │ │ │ │ │ └── IncrementalSourceStreamFetcherTest.java │ │ │ │ │ └── split/ │ │ │ │ │ └── state/ │ │ │ │ │ └── IncrementalSplitStateTest.java │ │ │ │ └── utils/ │ │ │ │ └── MessageDelayedEventLimiterTest.java │ │ │ └── debezium/ │ │ │ ├── format/ │ │ │ │ └── DebeziumJsonFormatTest.java │ │ │ └── row/ │ │ │ ├── DebeziumJsonDeserializeSchemaTest.java │ │ │ └── SeaTunnelRowDebeziumDeserializationConvertersTest.java │ │ ├── connector-cdc-mongodb/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── cdc/ │ │ │ │ └── mongodb/ │ │ │ │ ├── MongodbIncrementalSource.java │ │ │ │ ├── MongodbIncrementalSourceFactory.java │ │ │ │ ├── config/ │ │ │ │ │ ├── MongodbIncrementalSourceOptions.java │ │ │ │ │ ├── MongodbSourceConfig.java │ │ │ │ │ ├── MongodbSourceConfigProvider.java │ │ │ │ │ └── MongodbSourceConstants.java │ │ │ │ ├── exception/ │ │ │ │ │ └── MongodbConnectorException.java │ │ │ │ ├── internal/ │ │ │ │ │ └── MongodbClientProvider.java │ │ │ │ ├── sender/ │ │ │ │ │ ├── MongoDBConnectorDeserializationSchema.java │ │ │ │ │ └── SerializableFunction.java │ │ │ │ ├── source/ │ │ │ │ │ ├── MongoDBRecordEmitter.java │ │ │ │ │ ├── dialect/ │ │ │ │ │ │ └── MongodbDialect.java │ │ │ │ │ ├── fetch/ │ │ │ │ │ │ ├── MongodbFetchTaskContext.java │ │ │ │ │ │ ├── MongodbScanFetchTask.java │ │ │ │ │ │ └── MongodbStreamFetchTask.java │ │ │ │ │ ├── offset/ │ │ │ │ │ │ ├── ChangeStreamDescriptor.java │ │ │ │ │ │ ├── 
ChangeStreamOffset.java │ │ │ │ │ │ └── ChangeStreamOffsetFactory.java │ │ │ │ │ └── splitters/ │ │ │ │ │ ├── MongodbChunkSplitter.java │ │ │ │ │ ├── SampleBucketSplitStrategy.java │ │ │ │ │ ├── ShardedSplitStrategy.java │ │ │ │ │ ├── SingleSplitStrategy.java │ │ │ │ │ ├── SplitContext.java │ │ │ │ │ ├── SplitStrategy.java │ │ │ │ │ └── SplitVectorSplitStrategy.java │ │ │ │ └── utils/ │ │ │ │ ├── BsonUtils.java │ │ │ │ ├── ChunkUtils.java │ │ │ │ ├── CollectionDiscoveryUtils.java │ │ │ │ ├── MongodbRecordUtils.java │ │ │ │ ├── MongodbUtils.java │ │ │ │ └── ResumeToken.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── mongodb/ │ │ │ ├── sender/ │ │ │ │ └── MongoDBConnectorDeserializationSchemaTest.java │ │ │ ├── source/ │ │ │ │ └── MongodbIncrementalSourceFactoryTest.java │ │ │ └── utils/ │ │ │ └── MongodbRecordUtilsHeartbeatTest.java │ │ ├── connector-cdc-mysql/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ ├── com/ │ │ │ │ │ └── github/ │ │ │ │ │ └── shyiko/ │ │ │ │ │ └── mysql/ │ │ │ │ │ └── binlog/ │ │ │ │ │ └── io/ │ │ │ │ │ └── BufferedSocketInputStream.java │ │ │ │ ├── io/ │ │ │ │ │ └── debezium/ │ │ │ │ │ └── connector/ │ │ │ │ │ └── mysql/ │ │ │ │ │ ├── GtidUtils.java │ │ │ │ │ ├── MySqlConnection.java │ │ │ │ │ ├── MySqlReadOnlyIncrementalSnapshotChangeEventSource.java │ │ │ │ │ ├── MySqlSnapshotChangeEventSource.java │ │ │ │ │ ├── MySqlStreamingChangeEventSource.java │ │ │ │ │ └── legacy/ │ │ │ │ │ ├── MySqlJdbcContext.java │ │ │ │ │ └── SnapshotReader.java │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── cdc/ │ │ │ │ └── mysql/ │ │ │ │ ├── config/ │ │ │ │ │ ├── CustomMySqlConnectionConfiguration.java │ │ │ │ │ ├── MySqlIncrementalSourceOptions.java │ │ │ │ │ ├── MySqlSourceConfig.java │ │ │ │ │ ├── MySqlSourceConfigFactory.java │ │ │ │ │ └── ServerIdRange.java │ │ │ │ ├── source/ │ │ │ │ │ ├── MySqlDialect.java │ │ │ │ │ ├── MySqlIncrementalSource.java │ │ │ │ 
│ ├── MySqlIncrementalSourceFactory.java │ │ │ │ │ ├── MySqlSchemaChangeResolver.java │ │ │ │ │ ├── enumerator/ │ │ │ │ │ │ └── MySqlChunkSplitter.java │ │ │ │ │ ├── offset/ │ │ │ │ │ │ ├── BinlogOffset.java │ │ │ │ │ │ └── BinlogOffsetFactory.java │ │ │ │ │ ├── parser/ │ │ │ │ │ │ ├── CustomAlterTableParserListener.java │ │ │ │ │ │ ├── CustomColumnDefinitionParserListener.java │ │ │ │ │ │ ├── CustomDefaultValueParserListener.java │ │ │ │ │ │ ├── CustomMySqlAntlrDdlParser.java │ │ │ │ │ │ └── CustomMySqlAntlrDdlParserListener.java │ │ │ │ │ └── reader/ │ │ │ │ │ └── fetch/ │ │ │ │ │ ├── MySqlSourceFetchTaskContext.java │ │ │ │ │ ├── binlog/ │ │ │ │ │ │ └── MySqlBinlogFetchTask.java │ │ │ │ │ └── scan/ │ │ │ │ │ ├── MySqlSnapshotFetchTask.java │ │ │ │ │ ├── MySqlSnapshotSplitReadTask.java │ │ │ │ │ └── SnapshotSplitChangeEventSourceContext.java │ │ │ │ └── utils/ │ │ │ │ ├── ErrorMessageUtils.java │ │ │ │ ├── MySqlConnectionUtils.java │ │ │ │ ├── MySqlDdlBuilder.java │ │ │ │ ├── MySqlSchema.java │ │ │ │ ├── MySqlTypeUtils.java │ │ │ │ ├── MySqlUtils.java │ │ │ │ └── TableDiscoveryUtils.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ ├── com/ │ │ │ │ └── github/ │ │ │ │ └── shyiko/ │ │ │ │ └── mysql/ │ │ │ │ └── binlog/ │ │ │ │ └── io/ │ │ │ │ └── BufferedSocketInputStreamTest.java │ │ │ ├── io/ │ │ │ │ └── debezium/ │ │ │ │ └── connector/ │ │ │ │ └── mysql/ │ │ │ │ └── GtidUtilsTest.java │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── cdc/ │ │ │ └── mysql/ │ │ │ ├── source/ │ │ │ │ └── MySqlIncrementalSourceFactoryTest.java │ │ │ ├── testutils/ │ │ │ │ ├── MySqlContainer.java │ │ │ │ ├── MySqlVersion.java │ │ │ │ └── UniqueDatabase.java │ │ │ └── utils/ │ │ │ ├── MySqlSchemaTest.java │ │ │ └── MySqlUtilsTest.java │ │ ├── connector-cdc-opengauss/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── main/ │ │ │ └── java/ │ │ │ ├── io/ │ │ │ │ └── debezium/ │ │ │ │ └── connector/ │ │ │ │ └── postgresql/ │ │ │ │ └── 
connection/ │ │ │ │ ├── PostgresConnection.java │ │ │ │ └── PostgresReplicationConnection.java │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── cdc/ │ │ │ └── opengauss/ │ │ │ └── OpengaussIncrementalSourceFactory.java │ │ ├── connector-cdc-oracle/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ ├── io/ │ │ │ │ │ └── debezium/ │ │ │ │ │ └── connector/ │ │ │ │ │ └── oracle/ │ │ │ │ │ └── logminer/ │ │ │ │ │ ├── LogMinerAdapter.java │ │ │ │ │ ├── LogMinerStreamingChangeEventSource.java │ │ │ │ │ ├── logwriter/ │ │ │ │ │ │ └── ReadOnlyLogWriterFlushStrategy.java │ │ │ │ │ └── processor/ │ │ │ │ │ └── AbstractLogMinerEventProcessor.java │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── cdc/ │ │ │ │ └── oracle/ │ │ │ │ ├── config/ │ │ │ │ │ ├── OracleSourceConfig.java │ │ │ │ │ └── OracleSourceConfigFactory.java │ │ │ │ ├── source/ │ │ │ │ │ ├── OracleDialect.java │ │ │ │ │ ├── OracleIncrementalSource.java │ │ │ │ │ ├── OracleIncrementalSourceFactory.java │ │ │ │ │ ├── OracleIncrementalSourceOptions.java │ │ │ │ │ ├── OracleSchemaChangeResolver.java │ │ │ │ │ ├── enumerator/ │ │ │ │ │ │ └── OracleChunkSplitter.java │ │ │ │ │ ├── offset/ │ │ │ │ │ │ ├── RedoLogOffset.java │ │ │ │ │ │ └── RedoLogOffsetFactory.java │ │ │ │ │ ├── parser/ │ │ │ │ │ │ ├── BaseParserListener.java │ │ │ │ │ │ ├── CustomAlterTableParserListener.java │ │ │ │ │ │ ├── CustomColumnDefinitionParserListener.java │ │ │ │ │ │ ├── CustomOracleAntlrDdlParser.java │ │ │ │ │ │ └── CustomOracleAntlrDdlParserListener.java │ │ │ │ │ └── reader/ │ │ │ │ │ └── fetch/ │ │ │ │ │ ├── OracleSourceFetchTaskContext.java │ │ │ │ │ ├── logminer/ │ │ │ │ │ │ ├── EventProcessorFactory.java │ │ │ │ │ │ └── OracleRedoLogFetchTask.java │ │ │ │ │ └── scan/ │ │ │ │ │ ├── OracleSnapshotFetchTask.java │ │ │ │ │ ├── OracleSnapshotSplitReadTask.java │ │ │ │ │ └── 
SnapshotSplitChangeEventSourceContext.java │ │ │ │ └── utils/ │ │ │ │ ├── OracleConnectionUtils.java │ │ │ │ ├── OracleSchema.java │ │ │ │ ├── OracleTypeUtils.java │ │ │ │ └── OracleUtils.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ ├── io/ │ │ │ │ └── debezium/ │ │ │ │ └── connector/ │ │ │ │ └── oracle/ │ │ │ │ └── logminer/ │ │ │ │ ├── logwriter/ │ │ │ │ │ └── ReadOnlyLogWriterFlushStrategyTest.java │ │ │ │ └── processor/ │ │ │ │ └── AbstractLogMinerEventProcessorTest.java │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── cdc/ │ │ │ └── oracle/ │ │ │ ├── source/ │ │ │ │ ├── OracleIncrementalSourceFactoryTest.java │ │ │ │ └── parser/ │ │ │ │ └── OracleDdlParserTest.java │ │ │ └── utils/ │ │ │ └── OracleUtilsTest.java │ │ ├── connector-cdc-postgres/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ ├── io/ │ │ │ │ │ └── debezium/ │ │ │ │ │ └── connector/ │ │ │ │ │ └── postgresql/ │ │ │ │ │ ├── CustomPostgresValueConverter.java │ │ │ │ │ ├── PostgresObjectUtils.java │ │ │ │ │ ├── PostgresOffsetContext.java │ │ │ │ │ └── TypeRegistry.java │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── cdc/ │ │ │ │ └── postgres/ │ │ │ │ ├── config/ │ │ │ │ │ ├── PostgresIncrementalSourceOptions.java │ │ │ │ │ ├── PostgresSourceConfig.java │ │ │ │ │ └── PostgresSourceConfigFactory.java │ │ │ │ ├── exception/ │ │ │ │ │ └── PostgresConnectorErrorCode.java │ │ │ │ ├── source/ │ │ │ │ │ ├── PostgresDialect.java │ │ │ │ │ ├── PostgresIncrementalSource.java │ │ │ │ │ ├── PostgresIncrementalSourceFactory.java │ │ │ │ │ ├── PostgresSourceOptions.java │ │ │ │ │ ├── enumerator/ │ │ │ │ │ │ └── PostgresChunkSplitter.java │ │ │ │ │ ├── offset/ │ │ │ │ │ │ ├── LsnOffset.java │ │ │ │ │ │ └── LsnOffsetFactory.java │ │ │ │ │ └── reader/ │ │ │ │ │ ├── PostgresSourceFetchTaskContext.java │ │ │ │ │ ├── snapshot/ │ │ │ │ │ │ ├── 
PostgresSnapshotFetchTask.java │ │ │ │ │ │ ├── PostgresSnapshotSplitReadTask.java │ │ │ │ │ │ └── SnapshotSplitChangeEventSourceContext.java │ │ │ │ │ └── wal/ │ │ │ │ │ └── PostgresWalFetchTask.java │ │ │ │ └── utils/ │ │ │ │ ├── PostgresConnectionUtils.java │ │ │ │ ├── PostgresSchema.java │ │ │ │ ├── PostgresTypeUtils.java │ │ │ │ ├── PostgresUtils.java │ │ │ │ └── TableDiscoveryUtils.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── cdc/ │ │ │ └── postgres/ │ │ │ └── utils/ │ │ │ └── PostgresUtilsTest.java │ │ ├── connector-cdc-sqlserver/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ ├── io/ │ │ │ │ │ └── debezium/ │ │ │ │ │ └── connector/ │ │ │ │ │ └── sqlserver/ │ │ │ │ │ ├── SqlServerConnection.java │ │ │ │ │ └── SqlServerStreamingChangeEventSource.java │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── cdc/ │ │ │ │ └── sqlserver/ │ │ │ │ ├── config/ │ │ │ │ │ ├── SqlServerSourceConfig.java │ │ │ │ │ └── SqlServerSourceConfigFactory.java │ │ │ │ ├── source/ │ │ │ │ │ ├── SqlServerDialect.java │ │ │ │ │ ├── SqlServerIncrementalSource.java │ │ │ │ │ ├── SqlServerIncrementalSourceFactory.java │ │ │ │ │ ├── SqlServerIncrementalSourceOptions.java │ │ │ │ │ ├── enumerator/ │ │ │ │ │ │ └── SqlServerChunkSplitter.java │ │ │ │ │ ├── offset/ │ │ │ │ │ │ ├── LsnOffset.java │ │ │ │ │ │ └── LsnOffsetFactory.java │ │ │ │ │ └── reader/ │ │ │ │ │ └── fetch/ │ │ │ │ │ ├── SqlServerSourceFetchTaskContext.java │ │ │ │ │ ├── scan/ │ │ │ │ │ │ ├── SnapshotSplitChangeEventSourceContext.java │ │ │ │ │ │ ├── SqlServerSnapshotFetchTask.java │ │ │ │ │ │ └── SqlServerSnapshotSplitReadTask.java │ │ │ │ │ └── transactionlog/ │ │ │ │ │ └── SqlServerTransactionLogFetchTask.java │ │ │ │ └── utils/ │ │ │ │ ├── SqlServerConnectionUtils.java │ │ │ │ ├── SqlServerSchema.java │ │ │ │ ├── 
SqlServerTypeUtils.java │ │ │ │ ├── SqlServerUtils.java │ │ │ │ └── TableDiscoveryUtils.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ ├── io/ │ │ │ │ └── debezium/ │ │ │ │ └── connector/ │ │ │ │ └── sqlserver/ │ │ │ │ └── SqlServerConnectionTest.java │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── cdc/ │ │ │ └── sqlserver/ │ │ │ ├── source/ │ │ │ │ ├── SqlServerIncrementalSourceFactoryTest.java │ │ │ │ └── offset/ │ │ │ │ └── LsnOffsetTest.java │ │ │ └── utils/ │ │ │ └── SqlServerUtilsTest.java │ │ ├── connector-cdc-tidb/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ ├── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── cdc/ │ │ │ │ │ └── tidb/ │ │ │ │ │ └── source/ │ │ │ │ │ ├── TiDBSource.java │ │ │ │ │ ├── TiDBSourceFactory.java │ │ │ │ │ ├── config/ │ │ │ │ │ │ ├── TiDBSourceConfig.java │ │ │ │ │ │ └── TiDBSourceOptions.java │ │ │ │ │ ├── converter/ │ │ │ │ │ │ ├── DataConverter.java │ │ │ │ │ │ └── DefaultDataConverter.java │ │ │ │ │ ├── deserializer/ │ │ │ │ │ │ ├── AbstractSeaTunnelRowDeserializer.java │ │ │ │ │ │ ├── SeaTunnelRowSnapshotRecordDeserializer.java │ │ │ │ │ │ └── SeaTunnelRowStreamingRecordDeserializer.java │ │ │ │ │ ├── enumerator/ │ │ │ │ │ │ ├── TiDBSourceCheckpointState.java │ │ │ │ │ │ └── TiDBSourceSplitEnumerator.java │ │ │ │ │ ├── reader/ │ │ │ │ │ │ ├── RowKeyWithTs.java │ │ │ │ │ │ └── TiDBSourceReader.java │ │ │ │ │ ├── split/ │ │ │ │ │ │ └── TiDBSourceSplit.java │ │ │ │ │ └── utils/ │ │ │ │ │ └── TableKeyRangeUtils.java │ │ │ │ └── tikv/ │ │ │ │ └── common/ │ │ │ │ └── iterator/ │ │ │ │ └── ScanIterator.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── cdc/ │ │ │ └── tidb/ │ │ │ └── source/ │ │ │ └── SqlServerIncrementalSourceFactoryTest.java │ │ └── pom.xml │ ├── connector-clickhouse/ │ │ ├── 
pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── clickhouse/ │ │ │ ├── catalog/ │ │ │ │ ├── ClickhouseCatalog.java │ │ │ │ ├── ClickhouseCatalogFactory.java │ │ │ │ └── ClickhouseTypeConverter.java │ │ │ ├── config/ │ │ │ │ ├── ClickhouseBaseOptions.java │ │ │ │ ├── ClickhouseFileCopyMethod.java │ │ │ │ ├── ClickhouseFileSinkOptions.java │ │ │ │ ├── ClickhouseSinkOptions.java │ │ │ │ ├── ClickhouseSourceConfig.java │ │ │ │ ├── ClickhouseSourceOptions.java │ │ │ │ ├── ClickhouseTableConfig.java │ │ │ │ ├── ClickhouseType.java │ │ │ │ ├── FileReaderOption.java │ │ │ │ ├── NodePassConfig.java │ │ │ │ └── ReaderOption.java │ │ │ ├── exception/ │ │ │ │ ├── ClickhouseConnectorErrorCode.java │ │ │ │ └── ClickhouseConnectorException.java │ │ │ ├── shard/ │ │ │ │ ├── Shard.java │ │ │ │ └── ShardMetadata.java │ │ │ ├── sink/ │ │ │ │ ├── client/ │ │ │ │ │ ├── ClickhouseBatchStatement.java │ │ │ │ │ ├── ClickhouseSink.java │ │ │ │ │ ├── ClickhouseSinkFactory.java │ │ │ │ │ ├── ClickhouseSinkWriter.java │ │ │ │ │ ├── ShardRouter.java │ │ │ │ │ └── executor/ │ │ │ │ │ ├── BufferedBatchStatementExecutor.java │ │ │ │ │ ├── FieldNamedPreparedStatement.java │ │ │ │ │ ├── InsertOrUpdateBatchStatementExecutor.java │ │ │ │ │ ├── JdbcBatchStatementExecutor.java │ │ │ │ │ ├── JdbcBatchStatementExecutorBuilder.java │ │ │ │ │ ├── JdbcRowConverter.java │ │ │ │ │ ├── ReduceBufferedBatchStatementExecutor.java │ │ │ │ │ ├── SimpleBatchStatementExecutor.java │ │ │ │ │ ├── SqlUtils.java │ │ │ │ │ └── StatementFactory.java │ │ │ │ ├── file/ │ │ │ │ │ ├── ClickhouseFileSink.java │ │ │ │ │ ├── ClickhouseFileSinkAggCommitter.java │ │ │ │ │ ├── ClickhouseFileSinkFactory.java │ │ │ │ │ ├── ClickhouseFileSinkWriter.java │ │ │ │ │ ├── ClickhouseTable.java │ │ │ │ │ ├── FileTransfer.java │ │ │ │ │ ├── FileTransferFactory.java │ │ │ │ │ ├── RsyncFileTransfer.java │ │ │ │ │ └── ScpFileTransfer.java 
│ │ │ │ └── inject/ │ │ │ │ ├── ArrayInjectFunction.java │ │ │ │ ├── BigDecimalInjectFunction.java │ │ │ │ ├── ClickhouseFieldInjectFunction.java │ │ │ │ ├── DateInjectFunction.java │ │ │ │ ├── DateTimeInjectFunction.java │ │ │ │ ├── DoubleInjectFunction.java │ │ │ │ ├── FloatInjectFunction.java │ │ │ │ ├── IntInjectFunction.java │ │ │ │ ├── LongInjectFunction.java │ │ │ │ ├── MapInjectFunction.java │ │ │ │ └── StringInjectFunction.java │ │ │ ├── source/ │ │ │ │ ├── ClickhousePart.java │ │ │ │ ├── ClickhouseSource.java │ │ │ │ ├── ClickhouseSourceFactory.java │ │ │ │ ├── ClickhouseSourceReader.java │ │ │ │ ├── ClickhouseSourceTable.java │ │ │ │ ├── ClickhouseValueReader.java │ │ │ │ └── split/ │ │ │ │ ├── ClickhouseSourceSplit.java │ │ │ │ ├── ClickhouseSourceSplitEnumerator.java │ │ │ │ ├── PartStrategySplitter.java │ │ │ │ ├── Splitter.java │ │ │ │ └── SqlStrategySplitter.java │ │ │ ├── state/ │ │ │ │ ├── CKAggCommitInfo.java │ │ │ │ ├── CKCommitInfo.java │ │ │ │ ├── CKFileAggCommitInfo.java │ │ │ │ ├── CKFileCommitInfo.java │ │ │ │ ├── ClickhouseSinkState.java │ │ │ │ └── ClickhouseSourceState.java │ │ │ └── util/ │ │ │ ├── ClickhouseCatalogUtil.java │ │ │ ├── ClickhouseProxy.java │ │ │ ├── ClickhouseUtil.java │ │ │ ├── CreateTableParser.java │ │ │ ├── DistributedEngine.java │ │ │ ├── IntHolder.java │ │ │ └── TypeConvertUtil.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── clickhouse/ │ │ ├── ClickhouseCreateTableTest.java │ │ ├── ClickhouseFactoryTest.java │ │ ├── ShardRouterTest.java │ │ ├── source/ │ │ │ ├── ClickhouseValueReaderTest.java │ │ │ └── split/ │ │ │ └── PartStrategySplitterTest.java │ │ └── util/ │ │ ├── ClickhouseCatalogUtilTest.java │ │ └── ClickhouseUtilTest.java │ ├── connector-common/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── 
common/ │ │ │ │ ├── sink/ │ │ │ │ │ ├── AbstractSimpleSink.java │ │ │ │ │ └── AbstractSinkWriter.java │ │ │ │ ├── source/ │ │ │ │ │ ├── AbstractSingleSplitReader.java │ │ │ │ │ ├── AbstractSingleSplitSource.java │ │ │ │ │ ├── SingleSplit.java │ │ │ │ │ ├── SingleSplitEnumerator.java │ │ │ │ │ ├── SingleSplitEnumeratorState.java │ │ │ │ │ ├── SingleSplitReaderContext.java │ │ │ │ │ ├── TypeDefineUtils.java │ │ │ │ │ ├── arrow/ │ │ │ │ │ │ ├── converter/ │ │ │ │ │ │ │ ├── Converter.java │ │ │ │ │ │ │ ├── DateMilliConvertor.java │ │ │ │ │ │ │ ├── DefaultConverter.java │ │ │ │ │ │ │ ├── FixedSizeListConverter.java │ │ │ │ │ │ │ ├── LargeListConverter.java │ │ │ │ │ │ │ ├── ListConverter.java │ │ │ │ │ │ │ ├── MapConverter.java │ │ │ │ │ │ │ ├── NullConverter.java │ │ │ │ │ │ │ ├── StructConverter.java │ │ │ │ │ │ │ ├── TimeStampMicroConverter.java │ │ │ │ │ │ │ ├── TimeStampMilliConverter.java │ │ │ │ │ │ │ ├── TimeStampNanoConverter.java │ │ │ │ │ │ │ └── TimeStampSecConverter.java │ │ │ │ │ │ └── reader/ │ │ │ │ │ │ └── ArrowToSeatunnelRowReader.java │ │ │ │ │ └── reader/ │ │ │ │ │ ├── RecordEmitter.java │ │ │ │ │ ├── RecordsBySplits.java │ │ │ │ │ ├── RecordsWithSplitIds.java │ │ │ │ │ ├── SingleThreadMultiplexSourceReaderBase.java │ │ │ │ │ ├── SourceReaderBase.java │ │ │ │ │ ├── SourceReaderOptions.java │ │ │ │ │ ├── fetcher/ │ │ │ │ │ │ ├── AddSplitsTask.java │ │ │ │ │ │ ├── FetchTask.java │ │ │ │ │ │ ├── SingleThreadFetcherManager.java │ │ │ │ │ │ ├── SplitFetcher.java │ │ │ │ │ │ ├── SplitFetcherManager.java │ │ │ │ │ │ └── SplitFetcherTask.java │ │ │ │ │ └── splitreader/ │ │ │ │ │ ├── SplitReader.java │ │ │ │ │ ├── SplitsAddition.java │ │ │ │ │ └── SplitsChange.java │ │ │ │ ├── sql/ │ │ │ │ │ └── template/ │ │ │ │ │ └── SqlTemplate.java │ │ │ │ └── util/ │ │ │ │ ├── CatalogUtil.java │ │ │ │ └── CreateTableParser.java │ │ │ └── resources/ │ │ │ └── META-INF/ │ │ │ └── services/ │ │ │ └── 
org.apache.seatunnel.connectors.seatunnel.common.source.arrow.converter.Converter │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ ├── common/ │ │ │ └── source/ │ │ │ └── arrow/ │ │ │ ├── ArrowToSeatunnelRowReaderTest.java │ │ │ └── SeaTunnelDataTypeHolder.java │ │ ├── sink/ │ │ │ └── SinkFlowTestUtils.java │ │ └── source/ │ │ └── SourceFlowTestUtils.java │ ├── connector-console/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── console/ │ │ │ └── sink/ │ │ │ ├── ConsoleSink.java │ │ │ ├── ConsoleSinkFactory.java │ │ │ ├── ConsoleSinkOptions.java │ │ │ └── ConsoleSinkWriter.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── console/ │ │ ├── ConsoleFactoryTest.java │ │ └── sink/ │ │ └── ConsoleSinkWriterTest.java │ ├── connector-databend/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── databend/ │ │ │ │ ├── catalog/ │ │ │ │ │ ├── DatabendCatalog.java │ │ │ │ │ └── DatabendCatalogFactory.java │ │ │ │ ├── config/ │ │ │ │ │ ├── DatabendOptions.java │ │ │ │ │ ├── DatabendSinkConfig.java │ │ │ │ │ ├── DatabendSinkOptions.java │ │ │ │ │ ├── DatabendSourceConfig.java │ │ │ │ │ └── DatabendSourceOptions.java │ │ │ │ ├── exception/ │ │ │ │ │ ├── DatabendConnectorErrorCode.java │ │ │ │ │ └── DatabendConnectorException.java │ │ │ │ ├── schema/ │ │ │ │ │ └── SchemaChangeManager.java │ │ │ │ ├── sink/ │ │ │ │ │ ├── DatabendSink.java │ │ │ │ │ ├── DatabendSinkAggregatedCommitInfo.java │ │ │ │ │ ├── DatabendSinkAggregatedCommitter.java │ │ │ │ │ ├── DatabendSinkCommitterInfo.java │ │ │ │ │ ├── DatabendSinkFactory.java │ │ │ │ │ └── DatabendSinkWriter.java │ │ │ │ ├── source/ │ │ │ │ │ ├── 
DatabendSource.java │ │ │ │ │ ├── DatabendSourceFactory.java │ │ │ │ │ └── DatabendSourceReader.java │ │ │ │ ├── state/ │ │ │ │ │ ├── DatabendSinkState.java │ │ │ │ │ └── DatabendSourceState.java │ │ │ │ └── util/ │ │ │ │ ├── DatabendTypeConverter.java │ │ │ │ └── DatabendUtil.java │ │ │ └── resources/ │ │ │ ├── databend_sink_example.conf │ │ │ ├── databend_source_example.conf │ │ │ ├── databend_to_databend_example.conf │ │ │ └── mysql_to_databend_example.conf │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── databend/ │ │ ├── DatabendFactoryTest.java │ │ └── sink/ │ │ └── DatabendSinkWriterTest.java │ ├── connector-datahub/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── datahub/ │ │ │ ├── config/ │ │ │ │ └── DataHubSinkOptions.java │ │ │ ├── exception/ │ │ │ │ └── DataHubConnectorException.java │ │ │ └── sink/ │ │ │ ├── DataHubSink.java │ │ │ ├── DataHubSinkFactory.java │ │ │ └── DataHubWriter.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── datahub/ │ │ └── DataHubFactoryTest.java │ ├── connector-dingtalk/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ ├── config/ │ │ │ │ └── DingTalkSinkOptions.java │ │ │ ├── exception/ │ │ │ │ ├── DingTalkConnectorErrorCode.java │ │ │ │ └── DingTalkConnectorException.java │ │ │ └── sink/ │ │ │ ├── DingTalkSink.java │ │ │ ├── DingTalkSinkFactory.java │ │ │ └── DingTalkWriter.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── DingTalkFactoryTest.java │ ├── connector-doris/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── 
seatunnel/ │ │ │ └── connectors/ │ │ │ └── doris/ │ │ │ ├── backend/ │ │ │ │ └── BackendClient.java │ │ │ ├── catalog/ │ │ │ │ ├── DorisCatalog.java │ │ │ │ └── DorisCatalogFactory.java │ │ │ ├── config/ │ │ │ │ ├── DorisBaseOptions.java │ │ │ │ ├── DorisSinkConfig.java │ │ │ │ ├── DorisSinkOptions.java │ │ │ │ ├── DorisSourceConfig.java │ │ │ │ ├── DorisSourceOptions.java │ │ │ │ └── DorisTableConfig.java │ │ │ ├── datatype/ │ │ │ │ ├── AbstractDorisTypeConverter.java │ │ │ │ ├── DorisTypeConverterFactory.java │ │ │ │ ├── DorisTypeConverterV1.java │ │ │ │ └── DorisTypeConverterV2.java │ │ │ ├── exception/ │ │ │ │ ├── DorisConnectorErrorCode.java │ │ │ │ ├── DorisConnectorException.java │ │ │ │ └── DorisSchemaChangeException.java │ │ │ ├── rest/ │ │ │ │ ├── PartitionDefinition.java │ │ │ │ ├── RestService.java │ │ │ │ └── models/ │ │ │ │ ├── Field.java │ │ │ │ ├── QueryPlan.java │ │ │ │ ├── RespContent.java │ │ │ │ ├── Schema.java │ │ │ │ └── Tablet.java │ │ │ ├── schema/ │ │ │ │ └── SchemaChangeManager.java │ │ │ ├── serialize/ │ │ │ │ ├── DorisSerializer.java │ │ │ │ ├── SeaTunnelRowSerializer.java │ │ │ │ └── SeaTunnelRowSerializerFactory.java │ │ │ ├── sink/ │ │ │ │ ├── DorisSink.java │ │ │ │ ├── DorisSinkFactory.java │ │ │ │ ├── HttpPutBuilder.java │ │ │ │ ├── LoadStatus.java │ │ │ │ ├── committer/ │ │ │ │ │ ├── DorisCommitInfo.java │ │ │ │ │ ├── DorisCommitInfoSerializer.java │ │ │ │ │ └── DorisCommitter.java │ │ │ │ └── writer/ │ │ │ │ ├── DorisSinkState.java │ │ │ │ ├── DorisSinkStateSerializer.java │ │ │ │ ├── DorisSinkWriter.java │ │ │ │ ├── DorisStreamLoad.java │ │ │ │ ├── LabelGenerator.java │ │ │ │ ├── LoadConstants.java │ │ │ │ ├── RecordBuffer.java │ │ │ │ └── RecordStream.java │ │ │ ├── source/ │ │ │ │ ├── DorisSource.java │ │ │ │ ├── DorisSourceFactory.java │ │ │ │ ├── DorisSourceState.java │ │ │ │ ├── DorisSourceTable.java │ │ │ │ ├── reader/ │ │ │ │ │ ├── DorisSourceReader.java │ │ │ │ │ └── DorisValueReader.java │ │ │ │ ├── serialization/ │ │ │ 
│ │ └── Routing.java │ │ │ │ └── split/ │ │ │ │ ├── DorisSourceSplit.java │ │ │ │ └── DorisSourceSplitEnumerator.java │ │ │ └── util/ │ │ │ ├── DorisCatalogUtil.java │ │ │ ├── ErrorMessages.java │ │ │ ├── HttpUtil.java │ │ │ ├── ResponseUtil.java │ │ │ ├── SchemaUtils.java │ │ │ └── UnsupportedTypeConverterUtils.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── doris/ │ │ ├── catalog/ │ │ │ ├── DorisCreateTableTest.java │ │ │ └── PreviewActionTest.java │ │ ├── datatype/ │ │ │ ├── DorisTypeConvertorV1Test.java │ │ │ └── DorisTypeConvertorV2Test.java │ │ ├── split/ │ │ │ └── DorisSourceSplitEnumeratorTest.java │ │ └── util/ │ │ └── DorisCatalogUtilTest.java │ ├── connector-druid/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── druid/ │ │ │ ├── config/ │ │ │ │ └── DruidSinkOptions.java │ │ │ ├── exception/ │ │ │ │ └── DruidConnectorException.java │ │ │ └── sink/ │ │ │ ├── DruidSink.java │ │ │ ├── DruidSinkFactory.java │ │ │ └── DruidWriter.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── druid/ │ │ └── DruidFactoryTest.java │ ├── connector-easysearch/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── easysearch/ │ │ │ ├── catalog/ │ │ │ │ ├── EasysearchCatalog.java │ │ │ │ ├── EasysearchCatalogFactory.java │ │ │ │ └── EasysearchDataTypeConvertor.java │ │ │ ├── client/ │ │ │ │ └── EasysearchClient.java │ │ │ ├── config/ │ │ │ │ ├── EasysearchSinkCommonOptions.java │ │ │ │ ├── EasysearchSinkOptions.java │ │ │ │ └── EasysearchSourceOptions.java │ │ │ ├── constant/ │ │ │ │ └── EzsTypeMappingSeaTunnelType.java │ │ │ ├── dto/ │ │ │ │ ├── BulkResponse.java │ │ │ │ ├── EasysearchClusterInfo.java │ │ │ │ ├── IndexInfo.java │ │ │ │ 
└── source/ │ │ │ │ ├── IndexDocsCount.java │ │ │ │ ├── ScrollResult.java │ │ │ │ └── SourceIndexInfo.java │ │ │ ├── exception/ │ │ │ │ ├── EasysearchConnectorErrorCode.java │ │ │ │ └── EasysearchConnectorException.java │ │ │ ├── serialize/ │ │ │ │ ├── EasysearchRowSerializer.java │ │ │ │ ├── KeyExtractor.java │ │ │ │ ├── SeaTunnelRowSerializer.java │ │ │ │ ├── index/ │ │ │ │ │ ├── IndexSerializer.java │ │ │ │ │ ├── IndexSerializerFactory.java │ │ │ │ │ └── impl/ │ │ │ │ │ ├── FixedValueIndexSerializer.java │ │ │ │ │ └── VariableIndexSerializer.java │ │ │ │ └── source/ │ │ │ │ ├── DefaultSeaTunnelRowDeserializer.java │ │ │ │ ├── EasysearchRecord.java │ │ │ │ └── SeaTunnelRowDeserializer.java │ │ │ ├── sink/ │ │ │ │ ├── EasysearchSink.java │ │ │ │ ├── EasysearchSinkFactory.java │ │ │ │ └── EasysearchSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── EasysearchSource.java │ │ │ │ ├── EasysearchSourceFactory.java │ │ │ │ ├── EasysearchSourceReader.java │ │ │ │ ├── EasysearchSourceSplit.java │ │ │ │ ├── EasysearchSourceSplitEnumerator.java │ │ │ │ └── EasysearchSourceState.java │ │ │ ├── state/ │ │ │ │ ├── EasysearchAggregatedCommitInfo.java │ │ │ │ ├── EasysearchCommitInfo.java │ │ │ │ └── EasysearchSinkState.java │ │ │ └── util/ │ │ │ ├── RegexUtils.java │ │ │ └── SSLUtils.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── easysearch/ │ │ ├── EasysearchFactoryTest.java │ │ └── EasysearchSourceTest.java │ ├── connector-elasticsearch/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── elasticsearch/ │ │ │ ├── catalog/ │ │ │ │ ├── ElasticSearchCatalog.java │ │ │ │ ├── ElasticSearchCatalogFactory.java │ │ │ │ ├── ElasticSearchDataTypeConvertor.java │ │ │ │ └── ElasticSearchTypeConverter.java │ │ │ ├── client/ │ │ │ │ ├── EsRestClient.java │ │ │ │ ├── EsType.java │ │ │ │ └── auth/ │ │ │ │ 
├── AbstractAuthenticationProvider.java │ │ │ │ ├── ApiKeyAuthProvider.java │ │ │ │ ├── ApiKeyEncodedAuthProvider.java │ │ │ │ ├── AuthenticationProvider.java │ │ │ │ ├── AuthenticationProviderFactory.java │ │ │ │ └── BasicAuthProvider.java │ │ │ ├── config/ │ │ │ │ ├── AuthTypeEnum.java │ │ │ │ ├── ElasticsearchBaseOptions.java │ │ │ │ ├── ElasticsearchConfig.java │ │ │ │ ├── ElasticsearchSinkOptions.java │ │ │ │ ├── ElasticsearchSourceOptions.java │ │ │ │ ├── SearchApiTypeEnum.java │ │ │ │ └── SearchTypeEnum.java │ │ │ ├── constant/ │ │ │ │ ├── ElasticsearchVersion.java │ │ │ │ └── EsTypeMappingSeaTunnelType.java │ │ │ ├── dto/ │ │ │ │ ├── BulkResponse.java │ │ │ │ ├── ElasticsearchClusterInfo.java │ │ │ │ ├── IndexInfo.java │ │ │ │ └── source/ │ │ │ │ ├── IndexDocsCount.java │ │ │ │ ├── PointInTimeResult.java │ │ │ │ └── ScrollResult.java │ │ │ ├── exception/ │ │ │ │ ├── ElasticsearchConnectorErrorCode.java │ │ │ │ └── ElasticsearchConnectorException.java │ │ │ ├── serialize/ │ │ │ │ ├── ElasticsearchRowSerializer.java │ │ │ │ ├── KeyExtractor.java │ │ │ │ ├── SeaTunnelRowSerializer.java │ │ │ │ ├── index/ │ │ │ │ │ ├── IndexSerializer.java │ │ │ │ │ ├── IndexSerializerFactory.java │ │ │ │ │ └── impl/ │ │ │ │ │ ├── FixedValueIndexSerializer.java │ │ │ │ │ └── VariableIndexSerializer.java │ │ │ │ ├── source/ │ │ │ │ │ ├── DefaultSeaTunnelRowDeserializer.java │ │ │ │ │ ├── ElasticsearchRecord.java │ │ │ │ │ └── SeaTunnelRowDeserializer.java │ │ │ │ └── type/ │ │ │ │ ├── IndexTypeSerializer.java │ │ │ │ ├── IndexTypeSerializerFactory.java │ │ │ │ └── impl/ │ │ │ │ ├── NotIndexTypeSerializer.java │ │ │ │ └── RequiredIndexTypeSerializer.java │ │ │ ├── sink/ │ │ │ │ ├── ElasticsearchSink.java │ │ │ │ ├── ElasticsearchSinkFactory.java │ │ │ │ └── ElasticsearchSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── ElasticsearchSource.java │ │ │ │ ├── ElasticsearchSourceFactory.java │ │ │ │ ├── ElasticsearchSourceReader.java │ │ │ │ ├── ElasticsearchSourceSplit.java │ │ │ │ ├── 
ElasticsearchSourceSplitEnumerator.java │ │ │ │ └── ElasticsearchSourceState.java │ │ │ ├── state/ │ │ │ │ ├── ElasticsearchAggregatedCommitInfo.java │ │ │ │ ├── ElasticsearchCommitInfo.java │ │ │ │ └── ElasticsearchSinkState.java │ │ │ └── util/ │ │ │ ├── RegexUtils.java │ │ │ └── SSLUtils.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── elasticsearch/ │ │ ├── ElasticsearchFactoryTest.java │ │ ├── ElasticsearchSourceTest.java │ │ ├── catalog/ │ │ │ └── PreviewActionTest.java │ │ └── serialize/ │ │ └── ElasticsearchRowSerializerTest.java │ ├── connector-email/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── email/ │ │ │ │ ├── config/ │ │ │ │ │ ├── EmailSinkConfig.java │ │ │ │ │ └── EmailSinkOptions.java │ │ │ │ ├── exception/ │ │ │ │ │ ├── EmailConnectorErrorCode.java │ │ │ │ │ └── EmailConnectorException.java │ │ │ │ └── sink/ │ │ │ │ ├── EmailSink.java │ │ │ │ ├── EmailSinkFactory.java │ │ │ │ └── EmailSinkWriter.java │ │ │ └── resources/ │ │ │ └── fake_to_emailsink_flink.conf │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── email/ │ │ ├── EmailFactoryTest.java │ │ └── EmailSinkWriterTest.java │ ├── connector-fake/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── fake/ │ │ │ ├── config/ │ │ │ │ ├── FakeConfig.java │ │ │ │ ├── FakeSourceOptions.java │ │ │ │ └── MultipleTableFakeSourceConfig.java │ │ │ ├── exception/ │ │ │ │ └── FakeConnectorException.java │ │ │ ├── source/ │ │ │ │ ├── FakeDataGenerator.java │ │ │ │ ├── FakeSource.java │ │ │ │ ├── FakeSourceFactory.java │ │ │ │ ├── FakeSourceReader.java │ │ │ │ ├── FakeSourceSplit.java │ │ │ │ └── 
FakeSourceSplitEnumerator.java │ │ │ ├── state/ │ │ │ │ └── FakeSourceState.java │ │ │ └── utils/ │ │ │ ├── AutoIncrementIdGenerator.java │ │ │ ├── FakeDataRandomUtils.java │ │ │ └── IdGeneratorUtils.java │ │ └── test/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── fake/ │ │ │ ├── config/ │ │ │ │ └── MultipleTableFakeSourceConfigTest.java │ │ │ └── source/ │ │ │ ├── FakeDataGeneratorTest.java │ │ │ ├── FakeFactoryTest.java │ │ │ └── FakeSourceSplitEnumeratorTest.java │ │ └── resources/ │ │ ├── complex.schema.conf │ │ ├── fake-auto-increment-id.conf │ │ ├── fake-data.column.conf │ │ ├── fake-data.schema.conf │ │ ├── fake-data.schema.default.conf │ │ ├── fake-vector.conf │ │ ├── multiple_table.conf │ │ └── simple.schema.conf │ ├── connector-file/ │ │ ├── connector-file-base/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── file/ │ │ │ │ ├── catalog/ │ │ │ │ │ └── AbstractFileCatalog.java │ │ │ │ ├── config/ │ │ │ │ │ ├── ArchiveCompressFormat.java │ │ │ │ │ ├── BaseFileSinkConfig.java │ │ │ │ │ ├── BaseFileSourceConfig.java │ │ │ │ │ ├── BaseMultipleTableFileSourceConfig.java │ │ │ │ │ ├── CompressFormat.java │ │ │ │ │ ├── DelimiterConfig.java │ │ │ │ │ ├── ExcelEngine.java │ │ │ │ │ ├── FileBaseOptions.java │ │ │ │ │ ├── FileBaseSinkOptions.java │ │ │ │ │ ├── FileBaseSourceOptions.java │ │ │ │ │ ├── FileCompareMode.java │ │ │ │ │ ├── FileFormat.java │ │ │ │ │ ├── FileSyncMode.java │ │ │ │ │ ├── FileSystemType.java │ │ │ │ │ ├── FileUpdateStrategy.java │ │ │ │ │ ├── HadoopConf.java │ │ │ │ │ └── PartitionConfig.java │ │ │ │ ├── excel/ │ │ │ │ │ ├── ExcelCellUtils.java │ │ │ │ │ └── ExcelReaderListener.java │ │ │ │ ├── exception/ │ │ │ │ │ ├── FileConnectorErrorCode.java │ │ │ │ │ └── FileConnectorException.java │ │ │ │ ├── factory/ │ │ │ │ │ └── 
BaseMultipleTableFileSinkFactory.java │ │ │ │ ├── hadoop/ │ │ │ │ │ ├── HadoopFileSystemProxy.java │ │ │ │ │ └── HadoopLoginFactory.java │ │ │ │ ├── sink/ │ │ │ │ │ ├── BaseFileSink.java │ │ │ │ │ ├── BaseFileSinkWriter.java │ │ │ │ │ ├── BaseMultipleTableFileSink.java │ │ │ │ │ ├── commit/ │ │ │ │ │ │ ├── FileAggregatedCommitInfo.java │ │ │ │ │ │ ├── FileCommitInfo.java │ │ │ │ │ │ └── FileSinkAggregatedCommitter.java │ │ │ │ │ ├── config/ │ │ │ │ │ │ ├── FileSinkConfig.java │ │ │ │ │ │ └── SaveMode.java │ │ │ │ │ ├── state/ │ │ │ │ │ │ └── FileSinkState.java │ │ │ │ │ ├── util/ │ │ │ │ │ │ ├── ExcelGenerator.java │ │ │ │ │ │ └── XmlWriter.java │ │ │ │ │ └── writer/ │ │ │ │ │ ├── AbstractWriteStrategy.java │ │ │ │ │ ├── BinaryWriteStrategy.java │ │ │ │ │ ├── CanalJsonWriteStrategy.java │ │ │ │ │ ├── CsvWriteStrategy.java │ │ │ │ │ ├── DebeziumJsonWriteStrategy.java │ │ │ │ │ ├── ExcelWriteStrategy.java │ │ │ │ │ ├── JsonWriteStrategy.java │ │ │ │ │ ├── MaxWellJsonWriteStrategy.java │ │ │ │ │ ├── OrcWriteStrategy.java │ │ │ │ │ ├── ParquetWriteStrategy.java │ │ │ │ │ ├── TextWriteStrategy.java │ │ │ │ │ ├── Transaction.java │ │ │ │ │ ├── WriteStrategy.java │ │ │ │ │ ├── WriteStrategyFactory.java │ │ │ │ │ └── XmlWriteStrategy.java │ │ │ │ └── source/ │ │ │ │ ├── BaseFileSource.java │ │ │ │ ├── BaseFileSourceReader.java │ │ │ │ ├── BaseMultipleTableFileSource.java │ │ │ │ ├── reader/ │ │ │ │ │ ├── AbstractReadStrategy.java │ │ │ │ │ ├── BinaryReadStrategy.java │ │ │ │ │ ├── CsvReadStrategy.java │ │ │ │ │ ├── ExcelReadStrategy.java │ │ │ │ │ ├── JsonReadStrategy.java │ │ │ │ │ ├── MarkdownReadStrategy.java │ │ │ │ │ ├── MultipleTableFileSourceReader.java │ │ │ │ │ ├── OrcReadStrategy.java │ │ │ │ │ ├── ParquetReadStrategy.java │ │ │ │ │ ├── ReadStrategy.java │ │ │ │ │ ├── ReadStrategyFactory.java │ │ │ │ │ ├── TextReadStrategy.java │ │ │ │ │ └── XmlReadStrategy.java │ │ │ │ ├── split/ │ │ │ │ │ ├── AccordingToSplitSizeSplitStrategy.java │ │ │ │ │ ├── 
DefaultFileSplitStrategy.java │ │ │ │ │ ├── FileSourceSplit.java │ │ │ │ │ ├── FileSourceSplitEnumerator.java │ │ │ │ │ ├── FileSplitStrategy.java │ │ │ │ │ ├── FileSplitStrategyFactory.java │ │ │ │ │ ├── MultipleTableFileSourceSplitEnumerator.java │ │ │ │ │ ├── MultipleTableFileSplitStrategy.java │ │ │ │ │ └── ParquetFileSplitStrategy.java │ │ │ │ └── state/ │ │ │ │ └── FileSourceState.java │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── file/ │ │ │ │ ├── hadoop/ │ │ │ │ │ ├── HadoopFileSystemProxyKerberosRenewTest.java │ │ │ │ │ └── HadoopLoginFactoryTest.java │ │ │ │ ├── reader/ │ │ │ │ │ ├── BinaryReadStrategyTest.java │ │ │ │ │ ├── ExcelReadStrategyTest.java │ │ │ │ │ ├── FileFilterPatternTest.java │ │ │ │ │ └── StreamLineSplitterTest.java │ │ │ │ ├── source/ │ │ │ │ │ ├── reader/ │ │ │ │ │ │ ├── AbstractReadStrategyTest.java │ │ │ │ │ │ ├── CsvReadStrategyTest.java │ │ │ │ │ │ ├── MarkdownReadStrategyTest.java │ │ │ │ │ │ ├── ReadStrategySplitFallbackTest.java │ │ │ │ │ │ ├── TempCollector.java │ │ │ │ │ │ └── UpdateSyncModeTest.java │ │ │ │ │ └── split/ │ │ │ │ │ ├── FileSourceSplitCompatibilityTest.java │ │ │ │ │ ├── FileSourceSplitEnumeratorTest.java │ │ │ │ │ ├── FileSplitStrategyFactoryTest.java │ │ │ │ │ ├── MultipleTableFileSourceSplitEnumeratorTest.java │ │ │ │ │ └── ParquetFileSplitStrategyTest.java │ │ │ │ ├── util/ │ │ │ │ │ └── FileSystemUtilsTest.java │ │ │ │ └── writer/ │ │ │ │ ├── CsvWriteStrategyTest.java │ │ │ │ ├── ExcelGeneratorTest.java │ │ │ │ ├── FileSinkConfigTest.java │ │ │ │ ├── OrcReadStrategyTest.java │ │ │ │ ├── OrcWriteStrategyTest.java │ │ │ │ ├── ParquetReadStrategyTest.java │ │ │ │ ├── ParquetWriteStrategyTest.java │ │ │ │ ├── ReadStrategyEncodingTest.java │ │ │ │ └── XmlReadStrategyTest.java │ │ │ └── resources/ │ │ │ ├── csv/ │ │ │ │ ├── special_quote_char_break_line.csv │ │ │ │ ├── utf8_bom_with_header.csv │ │ │ │ └── 
utf8_bom_without_header.csv │ │ │ ├── encoding/ │ │ │ │ ├── gbk.json │ │ │ │ ├── gbk.txt │ │ │ │ ├── gbk.xml │ │ │ │ ├── gbk_use_attr_format.xml │ │ │ │ ├── test_read_json.conf │ │ │ │ ├── test_read_text.conf │ │ │ │ ├── test_read_xml.conf │ │ │ │ └── test_read_xml_use_attr_format.conf │ │ │ ├── excel/ │ │ │ │ ├── e2e.xls │ │ │ │ ├── e2e.xlsx │ │ │ │ ├── e2exls.conf │ │ │ │ ├── test_read_excel.conf │ │ │ │ ├── test_read_excel.xlsx │ │ │ │ ├── test_read_excel_data_string.conf │ │ │ │ ├── test_read_excel_date_string.xlsx │ │ │ │ ├── test_read_excel_formula.xlsx │ │ │ │ ├── test_read_excel_large.conf │ │ │ │ └── test_read_formula.xls │ │ │ ├── filter-pattern/ │ │ │ │ └── json/ │ │ │ │ ├── json2024/ │ │ │ │ │ └── 202401.json │ │ │ │ ├── json2025/ │ │ │ │ │ ├── 202501.json │ │ │ │ │ └── test_read_json.conf │ │ │ │ └── people.json │ │ │ ├── hive.parquet │ │ │ ├── test-csv.csv │ │ │ ├── test.csv │ │ │ ├── test.md │ │ │ ├── test.orc │ │ │ ├── test_read_orc.conf │ │ │ ├── test_read_parquet.conf │ │ │ ├── test_read_parquet2.conf │ │ │ ├── test_user_config_read_parquet.conf │ │ │ ├── test_write_hdfs.conf │ │ │ ├── test_write_hdfs_default_format.conf │ │ │ ├── test_write_hive.conf │ │ │ ├── timestamp_as_int64.parquet │ │ │ ├── timestamp_as_int96.parquet │ │ │ └── xml/ │ │ │ ├── name=xmlTest/ │ │ │ │ └── test_read.xml │ │ │ └── test_read_xml.conf │ │ ├── connector-file-base-hadoop/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── file/ │ │ │ └── hdfs/ │ │ │ ├── sink/ │ │ │ │ └── BaseHdfsFileSink.java │ │ │ └── source/ │ │ │ ├── BaseHdfsFileSource.java │ │ │ └── config/ │ │ │ └── HdfsSourceConfigOptions.java │ │ ├── connector-file-cos/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── file/ │ │ │ │ │ └── cos/ │ │ 
│ │ │ ├── config/ │ │ │ │ │ │ ├── CosConf.java │ │ │ │ │ │ ├── CosFileBaseOptions.java │ │ │ │ │ │ ├── CosFileSinkOptions.java │ │ │ │ │ │ └── CosFileSourceOptions.java │ │ │ │ │ ├── sink/ │ │ │ │ │ │ ├── CosFileSink.java │ │ │ │ │ │ └── CosFileSinkFactory.java │ │ │ │ │ └── source/ │ │ │ │ │ ├── CosFileSource.java │ │ │ │ │ └── CosFileSourceFactory.java │ │ │ │ └── resources/ │ │ │ │ └── META-INF/ │ │ │ │ └── services/ │ │ │ │ └── org.apache.hadoop.fs.FileSystem │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── file/ │ │ │ └── cos/ │ │ │ └── CosFileFactoryTest.java │ │ ├── connector-file-ftp/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── file/ │ │ │ │ │ └── ftp/ │ │ │ │ │ ├── catalog/ │ │ │ │ │ │ ├── FtpFileCatalog.java │ │ │ │ │ │ └── FtpFileCatalogFactory.java │ │ │ │ │ ├── config/ │ │ │ │ │ │ ├── FTPFileSourceConfig.java │ │ │ │ │ │ ├── FtpConf.java │ │ │ │ │ │ ├── FtpFileBaseOptions.java │ │ │ │ │ │ ├── FtpFileSinkOptions.java │ │ │ │ │ │ ├── FtpFileSourceOptions.java │ │ │ │ │ │ └── MultipleTableFTPFileSourceConfig.java │ │ │ │ │ ├── sink/ │ │ │ │ │ │ ├── FtpFileSink.java │ │ │ │ │ │ └── FtpFileSinkFactory.java │ │ │ │ │ ├── source/ │ │ │ │ │ │ ├── FtpFileSource.java │ │ │ │ │ │ └── FtpFileSourceFactory.java │ │ │ │ │ └── system/ │ │ │ │ │ ├── FtpConnectionMode.java │ │ │ │ │ └── SeaTunnelFTPFileSystem.java │ │ │ │ └── resources/ │ │ │ │ └── META-INF/ │ │ │ │ └── services/ │ │ │ │ └── org.apache.hadoop.fs.FileSystem │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── file/ │ │ │ └── ftp/ │ │ │ ├── FtpFileFactoryTest.java │ │ │ └── system/ │ │ │ └── SeaTunnelFTPFileSystemTest.java │ │ ├── connector-file-hadoop/ │ │ │ ├── pom.xml │ │ │ └── 
src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── file/ │ │ │ │ └── hdfs/ │ │ │ │ ├── catalog/ │ │ │ │ │ ├── HdfsFileCatalog.java │ │ │ │ │ └── HdfsFileCatalogFactory.java │ │ │ │ ├── config/ │ │ │ │ │ ├── HdfsFileHadoopConfig.java │ │ │ │ │ ├── HdfsFileSinkOptions.java │ │ │ │ │ ├── HdfsFileSourceConfig.java │ │ │ │ │ └── MultipleTableHdfsFileSourceConfig.java │ │ │ │ ├── sink/ │ │ │ │ │ ├── HdfsFileSink.java │ │ │ │ │ └── HdfsFileSinkFactory.java │ │ │ │ └── source/ │ │ │ │ ├── HdfsFileSource.java │ │ │ │ └── HdfsFileSourceFactory.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── file/ │ │ │ └── hdfs/ │ │ │ ├── HdfsFileFactoryTest.java │ │ │ ├── HdfsFileSinkTest.java │ │ │ ├── HdfsFileSourceConfigTest.java │ │ │ └── source/ │ │ │ └── split/ │ │ │ ├── HdfsFileAccordingToSplitSizeSplitStrategyTest.java │ │ │ └── HdfsFileSplitStrategyFactoryTest.java │ │ ├── connector-file-jindo-oss/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── file/ │ │ │ │ │ └── oss/ │ │ │ │ │ └── jindo/ │ │ │ │ │ ├── config/ │ │ │ │ │ │ ├── OssConf.java │ │ │ │ │ │ ├── OssFileBaseOptions.java │ │ │ │ │ │ ├── OssFileSinkOptions.java │ │ │ │ │ │ └── OssFileSourceOptions.java │ │ │ │ │ ├── exception/ │ │ │ │ │ │ └── OssJindoConnectorException.java │ │ │ │ │ ├── sink/ │ │ │ │ │ │ ├── OssFileSink.java │ │ │ │ │ │ └── OssFileSinkFactory.java │ │ │ │ │ └── source/ │ │ │ │ │ ├── OssFileSource.java │ │ │ │ │ └── OssFileSourceFactory.java │ │ │ │ └── resources/ │ │ │ │ └── META-INF/ │ │ │ │ └── services/ │ │ │ │ └── org.apache.hadoop.fs.FileSystem │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ 
│ └── test/ │ │ │ └── OssJindoFactoryTest.java │ │ ├── connector-file-local/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── file/ │ │ │ │ └── local/ │ │ │ │ ├── catalog/ │ │ │ │ │ ├── LocalFileCatalog.java │ │ │ │ │ └── LocalFileCatalogFactory.java │ │ │ │ ├── config/ │ │ │ │ │ ├── LocalFileHadoopConf.java │ │ │ │ │ ├── LocalFileSinkOptions.java │ │ │ │ │ └── LocalFileSourceOptions.java │ │ │ │ ├── sink/ │ │ │ │ │ ├── LocalFileSink.java │ │ │ │ │ └── LocalFileSinkFactory.java │ │ │ │ └── source/ │ │ │ │ ├── LocalFileSource.java │ │ │ │ ├── LocalFileSourceFactory.java │ │ │ │ ├── config/ │ │ │ │ │ ├── LocalFileSourceConfig.java │ │ │ │ │ └── MultipleTableLocalFileSourceConfig.java │ │ │ │ └── split/ │ │ │ │ └── LocalFileAccordingToSplitSizeSplitStrategy.java │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── file/ │ │ │ │ └── local/ │ │ │ │ ├── LocalFileFactoryTest.java │ │ │ │ ├── LocalFileSourceTest.java │ │ │ │ ├── LocalFileTest.java │ │ │ │ └── SplitFileStrategyTest.java │ │ │ └── resources/ │ │ │ ├── test_data.txt │ │ │ ├── test_split_csv_data.csv │ │ │ ├── test_split_empty_data.csv │ │ │ ├── test_split_special_row_delimiter_data.txt │ │ │ └── utf8_bom_split.csv │ │ ├── connector-file-obs/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── file/ │ │ │ │ │ └── obs/ │ │ │ │ │ ├── config/ │ │ │ │ │ │ ├── ObsConf.java │ │ │ │ │ │ ├── ObsFileBaseOptions.java │ │ │ │ │ │ ├── ObsFileSinkOptions.java │ │ │ │ │ │ └── ObsFileSourceOptions.java │ │ │ │ │ ├── sink/ │ │ │ │ │ │ ├── ObsFileSink.java │ │ │ │ │ │ └── ObsFileSinkFactory.java │ │ │ │ │ └── source/ │ │ │ │ │ ├── ObsFileSource.java │ │ 
│ │ │ └── ObsFileSourceFactory.java │ │ │ │ └── resources/ │ │ │ │ └── META-INF/ │ │ │ │ └── services/ │ │ │ │ └── org.apache.hadoop.fs.FileSystem │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── file/ │ │ │ └── obs/ │ │ │ └── ObsFileFactoryTest.java │ │ ├── connector-file-oss/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── file/ │ │ │ │ │ └── oss/ │ │ │ │ │ ├── catalog/ │ │ │ │ │ │ ├── OssFileCatalog.java │ │ │ │ │ │ └── OssFileCatalogFactory.java │ │ │ │ │ ├── config/ │ │ │ │ │ │ ├── OssFileBaseOptions.java │ │ │ │ │ │ ├── OssFileSinkOptions.java │ │ │ │ │ │ ├── OssFileSourceOptions.java │ │ │ │ │ │ └── OssHadoopConf.java │ │ │ │ │ ├── sink/ │ │ │ │ │ │ ├── OssFileSink.java │ │ │ │ │ │ └── OssFileSinkFactory.java │ │ │ │ │ └── source/ │ │ │ │ │ ├── OssFileSource.java │ │ │ │ │ ├── OssFileSourceFactory.java │ │ │ │ │ └── config/ │ │ │ │ │ ├── MultipleTableOssFileSourceConfig.java │ │ │ │ │ └── OssFileSourceConfig.java │ │ │ │ └── resources/ │ │ │ │ └── META-INF/ │ │ │ │ └── services/ │ │ │ │ └── org.apache.hadoop.fs.FileSystem │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── file/ │ │ │ └── oss/ │ │ │ └── OssFileFactoryTest.java │ │ ├── connector-file-s3/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── file/ │ │ │ │ │ └── s3/ │ │ │ │ │ ├── catalog/ │ │ │ │ │ │ ├── S3FileCatalog.java │ │ │ │ │ │ └── S3FileCatalogFactory.java │ │ │ │ │ ├── config/ │ │ │ │ │ │ ├── S3FileBaseOptions.java │ │ │ │ │ │ ├── S3FileSinkOptions.java │ │ │ │ │ │ ├── S3FileSourceOptions.java │ │ │ │ │ │ └── 
S3HadoopConf.java │ │ │ │ │ ├── sink/ │ │ │ │ │ │ ├── S3FileSink.java │ │ │ │ │ │ └── S3FileSinkFactory.java │ │ │ │ │ └── source/ │ │ │ │ │ ├── S3FileSource.java │ │ │ │ │ ├── S3FileSourceFactory.java │ │ │ │ │ └── config/ │ │ │ │ │ ├── MultipleTableS3FileSourceConfig.java │ │ │ │ │ └── S3FileSourceConfig.java │ │ │ │ └── resources/ │ │ │ │ └── META-INF/ │ │ │ │ └── services/ │ │ │ │ └── org.apache.hadoop.fs.FileSystem │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── file/ │ │ │ └── s3/ │ │ │ ├── S3FileFactoryTest.java │ │ │ └── config/ │ │ │ └── S3HadoopConfTest.java │ │ ├── connector-file-sftp/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── file/ │ │ │ │ │ └── sftp/ │ │ │ │ │ ├── catalog/ │ │ │ │ │ │ ├── SftpFileCatalog.java │ │ │ │ │ │ └── SftpFileCatalogFactory.java │ │ │ │ │ ├── config/ │ │ │ │ │ │ ├── MultipleTableSFTPFileSourceConfig.java │ │ │ │ │ │ ├── SFTPFileSourceConfig.java │ │ │ │ │ │ ├── SftpConf.java │ │ │ │ │ │ ├── SftpFileBaseOptions.java │ │ │ │ │ │ ├── SftpFileSinkOptions.java │ │ │ │ │ │ └── SftpFileSourceOptions.java │ │ │ │ │ ├── sink/ │ │ │ │ │ │ ├── SftpFileSink.java │ │ │ │ │ │ └── SftpFileSinkFactory.java │ │ │ │ │ ├── source/ │ │ │ │ │ │ ├── SftpFileSource.java │ │ │ │ │ │ └── SftpFileSourceFactory.java │ │ │ │ │ └── system/ │ │ │ │ │ ├── SFTPConnectionPool.java │ │ │ │ │ ├── SFTPFileSystem.java │ │ │ │ │ └── SFTPInputStream.java │ │ │ │ └── resources/ │ │ │ │ └── META-INF/ │ │ │ │ └── services/ │ │ │ │ └── org.apache.hadoop.fs.FileSystem │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── file/ │ │ │ └── sftp/ │ │ │ ├── SftpFileFactoryTest.java │ │ │ └── system/ │ │ │ └── SftpFileSystemTest.java │ │ └── pom.xml │ 
├── connector-fluss/ │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── fluss/ │ │ ├── config/ │ │ │ ├── FlussBaseOptions.java │ │ │ └── FlussSinkOptions.java │ │ └── sink/ │ │ ├── FlussSink.java │ │ ├── FlussSinkFactory.java │ │ └── FlussSinkWriter.java │ ├── connector-google-firestore/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── google/ │ │ │ └── firestore/ │ │ │ ├── config/ │ │ │ │ ├── FirestoreParameters.java │ │ │ │ └── FirestoreSinkOptions.java │ │ │ ├── exception/ │ │ │ │ ├── FirestoreConnectorErrorCode.java │ │ │ │ └── FirestoreConnectorException.java │ │ │ ├── serialize/ │ │ │ │ ├── DefaultSeaTunnelRowSerializer.java │ │ │ │ └── SeaTunnelRowSerializer.java │ │ │ └── sink/ │ │ │ ├── FirestoreSink.java │ │ │ ├── FirestoreSinkFactory.java │ │ │ └── FirestoreSinkWriter.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── google/ │ │ └── firestore/ │ │ └── FirestoreFactoryTest.java │ ├── connector-google-sheets/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── google/ │ │ │ └── sheets/ │ │ │ ├── config/ │ │ │ │ ├── SheetsParameters.java │ │ │ │ └── SheetsSourceOptions.java │ │ │ ├── deserialize/ │ │ │ │ ├── GoogleSheetsDeserializer.java │ │ │ │ └── SeaTunnelRowDeserializer.java │ │ │ ├── exception/ │ │ │ │ ├── GoogleSheetsConnectorException.java │ │ │ │ ├── GoogleSheetsError.java │ │ │ │ └── GoogleSheetsErrorCode.java │ │ │ └── source/ │ │ │ ├── SheetsSource.java │ │ │ ├── SheetsSourceFactory.java │ │ │ └── SheetsSourceReader.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── 
google/ │ │ └── sheets/ │ │ ├── SheetsFactoryTest.java │ │ ├── deserialize/ │ │ │ └── GoogleSheetsDeserializerTest.java │ │ └── exception/ │ │ └── GoogleSheetsErrorTest.java │ ├── connector-graphql/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── graphql/ │ │ │ ├── Exception/ │ │ │ │ ├── GraphQLConnectorErrorCode.java │ │ │ │ └── GraphQLConnectorException.java │ │ │ ├── config/ │ │ │ │ ├── GraphQLSinkOptions.java │ │ │ │ ├── GraphQLSinkParameter.java │ │ │ │ ├── GraphQLSourceOptions.java │ │ │ │ └── GraphQLSourceParameter.java │ │ │ ├── sink/ │ │ │ │ ├── GraphQLSink.java │ │ │ │ ├── GraphQLSinkFactory.java │ │ │ │ └── GraphQLSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── GraphQLSource.java │ │ │ │ ├── GraphQLSourceFactory.java │ │ │ │ └── reader/ │ │ │ │ ├── GraphQLSourceHttpReader.java │ │ │ │ ├── GraphQLSourceSocketReader.java │ │ │ │ └── GraphQLWebSocket.java │ │ │ └── util/ │ │ │ └── GraphQLUtil.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── graphql/ │ │ └── GraphQLFactoryTest.java │ ├── connector-hbase/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── hbase/ │ │ │ ├── catalog/ │ │ │ │ ├── HbaseCatalog.java │ │ │ │ └── HbaseCatalogFactory.java │ │ │ ├── client/ │ │ │ │ └── HbaseClient.java │ │ │ ├── config/ │ │ │ │ ├── HbaseBaseOptions.java │ │ │ │ ├── HbaseParameters.java │ │ │ │ ├── HbaseSinkOptions.java │ │ │ │ └── HbaseSourceOptions.java │ │ │ ├── constant/ │ │ │ │ └── HbaseIdentifier.java │ │ │ ├── exception/ │ │ │ │ ├── HbaseConnectorErrorCode.java │ │ │ │ └── HbaseConnectorException.java │ │ │ ├── format/ │ │ │ │ └── HBaseDeserializationFormat.java │ │ │ ├── sink/ │ │ │ │ ├── HbaseSink.java │ │ │ │ ├── HbaseSinkFactory.java │ │ │ │ └── 
HbaseSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── HbaseSource.java │ │ │ │ ├── HbaseSourceFactory.java │ │ │ │ ├── HbaseSourceReader.java │ │ │ │ ├── HbaseSourceSplit.java │ │ │ │ ├── HbaseSourceSplitEnumerator.java │ │ │ │ └── HbaseSourceState.java │ │ │ ├── state/ │ │ │ │ ├── HbaseAggregatedCommitInfo.java │ │ │ │ ├── HbaseCommitInfo.java │ │ │ │ └── HbaseSinkState.java │ │ │ └── util/ │ │ │ └── HBaseUtil.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── hbase/ │ │ ├── HbaseCatalogTest.java │ │ ├── HbaseFactoryTest.java │ │ ├── client/ │ │ │ └── HbaseClientTest.java │ │ ├── config/ │ │ │ └── HbaseParametersTest.java │ │ ├── sink/ │ │ │ ├── HbaseSinkWriterTest.java │ │ │ └── HbaseSinkWriterTypeConvertTest.java │ │ └── source/ │ │ ├── HbaseSourceReaderTest.java │ │ └── HbaseSourceSplitEnumeratorTest.java │ ├── connector-hive/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── hive/ │ │ │ ├── commit/ │ │ │ │ └── HiveSinkAggregatedCommitter.java │ │ │ ├── config/ │ │ │ │ ├── HiveConfig.java │ │ │ │ ├── HiveConstants.java │ │ │ │ ├── HiveOnS3Conf.java │ │ │ │ └── HiveOptions.java │ │ │ ├── exception/ │ │ │ │ ├── HiveConnectorErrorCode.java │ │ │ │ └── HiveConnectorException.java │ │ │ ├── sink/ │ │ │ │ ├── HiveSaveModeHandler.java │ │ │ │ ├── HiveSink.java │ │ │ │ ├── HiveSinkFactory.java │ │ │ │ ├── HiveSinkOptions.java │ │ │ │ └── writter/ │ │ │ │ └── HiveSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── HiveSource.java │ │ │ │ ├── HiveSourceFactory.java │ │ │ │ ├── config/ │ │ │ │ │ ├── HiveSourceConfig.java │ │ │ │ │ ├── HiveSourceTableDiscovery.java │ │ │ │ │ ├── HiveTableNamePattern.java │ │ │ │ │ └── MultipleTableHiveSourceConfig.java │ │ │ │ ├── reader/ │ │ │ │ │ └── MultipleTableHiveSourceReader.java │ │ │ │ ├── split/ │ │ │ │ │ ├── HiveSourceSplit.java │ │ │ │ │ └── 
MultipleTableHiveSourceSplitEnumerator.java │ │ │ │ └── state/ │ │ │ │ └── HiveSourceState.java │ │ │ ├── storage/ │ │ │ │ ├── AbstractStorage.java │ │ │ │ ├── COSStorage.java │ │ │ │ ├── HDFSStorage.java │ │ │ │ ├── OSSStorage.java │ │ │ │ ├── S3Storage.java │ │ │ │ ├── Storage.java │ │ │ │ ├── StorageFactory.java │ │ │ │ └── StorageType.java │ │ │ └── utils/ │ │ │ ├── HiveFormatUtils.java │ │ │ ├── HiveLocationUtils.java │ │ │ ├── HiveMetaStoreCatalog.java │ │ │ ├── HiveMetaStoreProxy.java │ │ │ ├── HiveMetaStoreProxyUtils.java │ │ │ ├── HiveTableTemplateUtils.java │ │ │ ├── HiveTableUtils.java │ │ │ └── HiveTypeConvertor.java │ │ └── test/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── hive/ │ │ │ ├── HiveFactoryTest.java │ │ │ ├── commit/ │ │ │ │ └── HiveSinkAggregatedCommitterOverwriteStreamingTest.java │ │ │ ├── sink/ │ │ │ │ ├── HiveSaveModeHandlerTest.java │ │ │ │ ├── HiveSinkConfigTest.java │ │ │ │ ├── HiveSinkFactoryTest.java │ │ │ │ └── HiveSinkOptionsTest.java │ │ │ ├── source/ │ │ │ │ └── config/ │ │ │ │ ├── HiveSourceConfigEmptyFilesTest.java │ │ │ │ └── HiveSourceTableDiscoveryTest.java │ │ │ ├── split/ │ │ │ │ └── MultipleTableHiveSourceSplitEnumeratorTest.java │ │ │ ├── storage/ │ │ │ │ ├── CosStorageTest.java │ │ │ │ ├── HDFSStorageTest.java │ │ │ │ ├── OSSStorageTest.java │ │ │ │ ├── S3StorageTest.java │ │ │ │ └── StorageFactoryTest.java │ │ │ └── utils/ │ │ │ ├── HiveMetaStoreCatalogKerberosRenewTest.java │ │ │ ├── HiveMetaStoreCatalogMetastoreUrisTest.java │ │ │ ├── HiveMetaStoreProxyUtilsTest.java │ │ │ ├── HiveTableTemplateUtilsTest.java │ │ │ └── HiveTypeConvertorTest.java │ │ └── resources/ │ │ ├── cos/ │ │ │ └── core-site.xml │ │ ├── fakesource_to_hive.conf │ │ ├── hive_with_kerberos.conf │ │ ├── hive_with_remoteuser.conf │ │ ├── hive_without_kerberos.conf │ │ ├── oss/ │ │ │ └── core-site.xml │ │ └── s3/ │ │ └── core-site.xml │ ├── connector-http/ │ │ ├── 
connector-http-airtable/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── airtable/ │ │ │ │ ├── config/ │ │ │ │ │ └── AirtableConfig.java │ │ │ │ ├── sink/ │ │ │ │ │ ├── AirtableSink.java │ │ │ │ │ ├── AirtableSinkFactory.java │ │ │ │ │ ├── AirtableSinkWriter.java │ │ │ │ │ └── config/ │ │ │ │ │ └── AirtableSinkOptions.java │ │ │ │ └── source/ │ │ │ │ ├── AirtableSource.java │ │ │ │ ├── AirtableSourceFactory.java │ │ │ │ ├── AirtableSourceReader.java │ │ │ │ └── config/ │ │ │ │ ├── AirtableSourceOptions.java │ │ │ │ └── AirtableSourceParameter.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── airtable/ │ │ │ ├── AirtableFactoryTest.java │ │ │ ├── sink/ │ │ │ │ └── AirtableSinkWriterTest.java │ │ │ └── source/ │ │ │ └── AirtableSourceReaderTest.java │ │ ├── connector-http-base/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── http/ │ │ │ │ ├── client/ │ │ │ │ │ ├── HttpClientProvider.java │ │ │ │ │ └── HttpResponse.java │ │ │ │ ├── config/ │ │ │ │ │ ├── HttpCommonOptions.java │ │ │ │ │ ├── HttpConfig.java │ │ │ │ │ ├── HttpPaginationType.java │ │ │ │ │ ├── HttpParameter.java │ │ │ │ │ ├── HttpRequestMethod.java │ │ │ │ │ ├── HttpSinkOptions.java │ │ │ │ │ ├── HttpSourceOptions.java │ │ │ │ │ ├── JsonField.java │ │ │ │ │ └── PageInfo.java │ │ │ │ ├── exception/ │ │ │ │ │ ├── HttpConnectorErrorCode.java │ │ │ │ │ └── HttpConnectorException.java │ │ │ │ ├── sink/ │ │ │ │ │ ├── HttpSink.java │ │ │ │ │ ├── HttpSinkFactory.java │ │ │ │ │ └── HttpSinkWriter.java │ │ │ │ ├── source/ │ │ │ │ │ ├── DeserializationCollector.java │ │ │ │ │ ├── HttpSource.java │ │ │ │ │ ├── HttpSourceFactory.java │ │ │ │ │ ├── 
HttpSourceReader.java │ │ │ │ │ └── SimpleTextDeserializationSchema.java │ │ │ │ └── util/ │ │ │ │ ├── ArrayJsonPathProcessor.java │ │ │ │ ├── AuthorizationUtil.java │ │ │ │ ├── JsonPathProcessor.java │ │ │ │ ├── JsonPathProcessorFactory.java │ │ │ │ ├── JsonPathProcessorImpl.java │ │ │ │ └── JsonPathUtils.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── http/ │ │ │ ├── HttpFactoryTest.java │ │ │ ├── HttpSourceReaderInternalPollNextTest.java │ │ │ ├── HttpSourceReaderUpdateRequestParamTest.java │ │ │ ├── JsonFieldMissedReturnNullComplexTest.java │ │ │ ├── JsonFieldMissedReturnNullTest.java │ │ │ ├── JsonFieldMissedReturnNullTreeFeatureTest.java │ │ │ ├── client/ │ │ │ │ └── HttpClientProviderTest.java │ │ │ └── sink/ │ │ │ └── HttpSinkBatchWriterTest.java │ │ ├── connector-http-feishu/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── feishu/ │ │ │ └── sink/ │ │ │ ├── FeishuSink.java │ │ │ ├── FeishuSinkFactory.java │ │ │ └── FeishuSinkOptions.java │ │ ├── connector-http-github/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── github/ │ │ │ │ ├── config/ │ │ │ │ │ ├── GithubSourceOptions.java │ │ │ │ │ └── GithubSourceParameter.java │ │ │ │ ├── exception/ │ │ │ │ │ └── GithubConnectorException.java │ │ │ │ └── source/ │ │ │ │ ├── GithubSource.java │ │ │ │ └── GithubSourceFactory.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── github/ │ │ │ └── GithubFactoryTest.java │ │ ├── connector-http-gitlab/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ 
│ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── gitlab/ │ │ │ │ └── source/ │ │ │ │ ├── GitlabSource.java │ │ │ │ ├── GitlabSourceFactory.java │ │ │ │ ├── config/ │ │ │ │ │ ├── GitlabSourceOptions.java │ │ │ │ │ └── GitlabSourceParameter.java │ │ │ │ └── exception/ │ │ │ │ └── GitlabConnectorException.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── gitlab/ │ │ │ └── GitlabFactoryTest.java │ │ ├── connector-http-jira/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── jira/ │ │ │ │ └── source/ │ │ │ │ ├── JiraSource.java │ │ │ │ ├── JiraSourceFactory.java │ │ │ │ └── config/ │ │ │ │ ├── JiraSourceOptions.java │ │ │ │ └── JiraSourceParameter.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── jira/ │ │ │ └── JiraFactoryTest.java │ │ ├── connector-http-klaviyo/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── klaviyo/ │ │ │ │ └── source/ │ │ │ │ ├── KlaviyoSource.java │ │ │ │ ├── KlaviyoSourceFactory.java │ │ │ │ └── config/ │ │ │ │ ├── KlaviyoSourceOptions.java │ │ │ │ ├── KlaviyoSourceParameter.java │ │ │ │ └── exception/ │ │ │ │ └── KlaviyoConnectorException.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── klaviyo/ │ │ │ └── KlaviyoFactoryTest.java │ │ ├── connector-http-lemlist/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── lemlist/ │ │ │ │ └── source/ │ │ │ │ ├── 
LemlistSource.java │ │ │ │ ├── LemlistSourceFactory.java │ │ │ │ ├── config/ │ │ │ │ │ ├── LemlistSourceOptions.java │ │ │ │ │ └── LemlistSourceParameter.java │ │ │ │ └── exception/ │ │ │ │ └── LemlistConnectorException.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── lemlist/ │ │ │ └── LemlistFactoryTest.java │ │ ├── connector-http-myhours/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── myhours/ │ │ │ │ └── source/ │ │ │ │ ├── MyHoursSource.java │ │ │ │ ├── MyHoursSourceFactory.java │ │ │ │ ├── config/ │ │ │ │ │ ├── MyHoursSourceOptions.java │ │ │ │ │ └── MyHoursSourceParameter.java │ │ │ │ └── exception/ │ │ │ │ ├── MyHoursConnectorErrorCode.java │ │ │ │ └── MyHoursConnectorException.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── myhours/ │ │ │ └── MyHoursFactoryTest.java │ │ ├── connector-http-notion/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── notion/ │ │ │ │ └── source/ │ │ │ │ ├── NotionSource.java │ │ │ │ ├── NotionSourceFactory.java │ │ │ │ ├── config/ │ │ │ │ │ ├── NotionSourceOptions.java │ │ │ │ │ └── NotionSourceParameter.java │ │ │ │ └── exception/ │ │ │ │ └── NotionConnectorException.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── notion/ │ │ │ └── NotionFactoryTest.java │ │ ├── connector-http-onesignal/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── 
onesignal/ │ │ │ │ └── source/ │ │ │ │ ├── OneSignalSource.java │ │ │ │ ├── OneSignalSourceFactory.java │ │ │ │ └── config/ │ │ │ │ ├── OneSignalSourceOptions.java │ │ │ │ ├── OneSignalSourceParameter.java │ │ │ │ └── exception/ │ │ │ │ └── OneSignalConnectorException.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── onesignal/ │ │ │ └── OneSignalFactoryTest.java │ │ ├── connector-http-persistiq/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── persistiq/ │ │ │ │ └── source/ │ │ │ │ ├── PersistiqSource.java │ │ │ │ ├── PersistiqSourceFactory.java │ │ │ │ └── config/ │ │ │ │ ├── PersistiqSourceOptions.java │ │ │ │ └── PersistiqSourceParameter.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── persistiq/ │ │ │ └── PersistiqFactoryTest.java │ │ ├── connector-http-wechat/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── wechat/ │ │ │ │ └── sink/ │ │ │ │ ├── WeChatBotMessageSerializationSchema.java │ │ │ │ ├── WeChatSink.java │ │ │ │ ├── WeChatSinkFactory.java │ │ │ │ └── config/ │ │ │ │ ├── WeChatSinkConfig.java │ │ │ │ └── WeChatSinkOptions.java │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── wechat/ │ │ │ └── WeChatFactoryTest.java │ │ └── pom.xml │ ├── connector-hudi/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── hudi/ │ │ │ ├── catalog/ │ │ │ │ ├── HudiCatalog.java │ │ │ │ └── HudiCatalogFactory.java │ │ │ 
├── config/ │ │ │ │ ├── HudiSinkConfig.java │ │ │ │ ├── HudiSinkOptions.java │ │ │ │ └── HudiTableConfig.java │ │ │ ├── exception/ │ │ │ │ ├── HudiConnectorException.java │ │ │ │ ├── HudiError.java │ │ │ │ └── HudiErrorCode.java │ │ │ ├── sink/ │ │ │ │ ├── HudiClientManager.java │ │ │ │ ├── HudiMultiTableResourceManager.java │ │ │ │ ├── HudiSink.java │ │ │ │ ├── HudiSinkFactory.java │ │ │ │ ├── client/ │ │ │ │ │ ├── HudiWriteClientProvider.java │ │ │ │ │ ├── HudiWriteClientProviderProxy.java │ │ │ │ │ └── WriteClientProvider.java │ │ │ │ ├── convert/ │ │ │ │ │ ├── AvroSchemaConverter.java │ │ │ │ │ ├── HudiRecordConverter.java │ │ │ │ │ └── RowDataToAvroConverters.java │ │ │ │ ├── state/ │ │ │ │ │ ├── HudiAggregatedCommitInfo.java │ │ │ │ │ ├── HudiCommitInfo.java │ │ │ │ │ └── HudiSinkState.java │ │ │ │ └── writer/ │ │ │ │ ├── HudiRecordWriter.java │ │ │ │ └── HudiSinkWriter.java │ │ │ ├── state/ │ │ │ │ ├── HudiAggregatedCommitInfo.java │ │ │ │ ├── HudiCommitInfo.java │ │ │ │ └── HudiSinkState.java │ │ │ └── util/ │ │ │ ├── HudiCatalogUtil.java │ │ │ ├── HudiUtil.java │ │ │ └── SchemaUtil.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── hudi/ │ │ ├── HudiErrorTest.java │ │ ├── HudiTest.java │ │ └── catalog/ │ │ └── HudiCatalogTest.java │ ├── connector-hugegraph/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── hugegraph/ │ │ │ ├── buffer/ │ │ │ │ └── BatchBuffer.java │ │ │ ├── client/ │ │ │ │ └── HugeGraphClient.java │ │ │ ├── config/ │ │ │ │ ├── HugeGraphOptions.java │ │ │ │ ├── HugeGraphSinkConfig.java │ │ │ │ ├── HugeGraphSinkOptions.java │ │ │ │ ├── MappingConfig.java │ │ │ │ └── SchemaConfig.java │ │ │ ├── exception/ │ │ │ │ ├── HugeGraphConnectorErrorCode.java │ │ │ │ └── HugeGraphConnectorException.java │ │ │ ├── mapper/ │ │ │ │ ├── EdgeMapper.java │ │ │ │ ├── 
GraphDataMapper.java │ │ │ │ └── VertexMapper.java │ │ │ ├── sink/ │ │ │ │ ├── HugeGraphSink.java │ │ │ │ ├── HugeGraphSinkFactory.java │ │ │ │ └── HugeGraphSinkWriter.java │ │ │ └── utils/ │ │ │ ├── DataTypeUtil.java │ │ │ ├── E.java │ │ │ └── SchemaValidator.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── hugegraph/ │ │ └── config/ │ │ └── HugeGraphSinkConfigTest.java │ ├── connector-iceberg/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── iceberg/ │ │ │ ├── IcebergCatalogLoader.java │ │ │ ├── IcebergTableLoader.java │ │ │ ├── catalog/ │ │ │ │ ├── IcebergCatalog.java │ │ │ │ └── IcebergCatalogFactory.java │ │ │ ├── config/ │ │ │ │ ├── IcebergCatalogType.java │ │ │ │ ├── IcebergCommonConfig.java │ │ │ │ ├── IcebergCommonOptions.java │ │ │ │ ├── IcebergSinkConfig.java │ │ │ │ ├── IcebergSinkOptions.java │ │ │ │ ├── IcebergSourceConfig.java │ │ │ │ ├── IcebergSourceOptions.java │ │ │ │ └── SourceTableConfig.java │ │ │ ├── data/ │ │ │ │ ├── DefaultDeserializer.java │ │ │ │ ├── Deserializer.java │ │ │ │ ├── IcebergRecordProjection.java │ │ │ │ ├── IcebergTypeMapper.java │ │ │ │ └── RowConverter.java │ │ │ ├── exception/ │ │ │ │ ├── IcebergConnectorErrorCode.java │ │ │ │ └── IcebergConnectorException.java │ │ │ ├── sink/ │ │ │ │ ├── IcebergSink.java │ │ │ │ ├── IcebergSinkFactory.java │ │ │ │ ├── IcebergSinkWriter.java │ │ │ │ ├── commit/ │ │ │ │ │ ├── IcebergAggregatedCommitInfo.java │ │ │ │ │ ├── IcebergAggregatedCommitter.java │ │ │ │ │ ├── IcebergCommitInfo.java │ │ │ │ │ └── IcebergFilesCommitter.java │ │ │ │ ├── schema/ │ │ │ │ │ ├── ISchemaChange.java │ │ │ │ │ ├── SchemaAddColumn.java │ │ │ │ │ ├── SchemaChangeColumn.java │ │ │ │ │ ├── SchemaChangeWrapper.java │ │ │ │ │ ├── SchemaDeleteColumn.java │ │ │ │ │ └── SchemaModifyColumn.java │ │ │ │ ├── state/ │ │ │ │ │ └── 
IcebergSinkState.java │ │ │ │ └── writer/ │ │ │ │ ├── BaseDeltaTaskWriter.java │ │ │ │ ├── IcebergRecord.java │ │ │ │ ├── IcebergRecordWriter.java │ │ │ │ ├── IcebergWriterFactory.java │ │ │ │ ├── PartitionedAppendWriter.java │ │ │ │ ├── PartitionedDeltaWriter.java │ │ │ │ ├── RecordProjection.java │ │ │ │ ├── RecordWriter.java │ │ │ │ ├── UnpartitionedDeltaWriter.java │ │ │ │ └── WriteResult.java │ │ │ ├── source/ │ │ │ │ ├── IcebergSource.java │ │ │ │ ├── IcebergSourceFactory.java │ │ │ │ ├── enumerator/ │ │ │ │ │ ├── AbstractSplitEnumerator.java │ │ │ │ │ ├── IcebergBatchSplitEnumerator.java │ │ │ │ │ ├── IcebergEnumerationResult.java │ │ │ │ │ ├── IcebergEnumeratorPosition.java │ │ │ │ │ ├── IcebergSplitEnumeratorState.java │ │ │ │ │ ├── IcebergStreamSplitEnumerator.java │ │ │ │ │ └── scan/ │ │ │ │ │ ├── IcebergScanContext.java │ │ │ │ │ ├── IcebergScanSplitPlanner.java │ │ │ │ │ └── IcebergStreamScanStrategy.java │ │ │ │ ├── reader/ │ │ │ │ │ ├── IcebergFileScanTaskReader.java │ │ │ │ │ ├── IcebergFileScanTaskSplitReader.java │ │ │ │ │ └── IcebergSourceReader.java │ │ │ │ └── split/ │ │ │ │ └── IcebergFileScanTaskSplit.java │ │ │ └── utils/ │ │ │ ├── ExpressionUtils.java │ │ │ └── SchemaUtils.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── iceberg/ │ │ ├── IcebergFactoryTest.java │ │ ├── TestIcebergMetastore.java │ │ ├── TypeConvertTest.java │ │ ├── catalog/ │ │ │ ├── IcebergCatalogTest.java │ │ │ └── PreviewActionTest.java │ │ ├── config/ │ │ │ └── IcebergSinkConfigTest.java │ │ ├── data/ │ │ │ ├── IcebergTypeMapperTest.java │ │ │ └── RowConverterTest.java │ │ ├── source/ │ │ │ └── enumerator/ │ │ │ └── IcebergStreamSplitEnumeratorTest.java │ │ └── utils/ │ │ ├── ExpressionUtilsTest.java │ │ └── SchemaUtilsTest.java │ ├── connector-influxdb/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ 
└── seatunnel/ │ │ │ └── influxdb/ │ │ │ ├── client/ │ │ │ │ └── InfluxDBClient.java │ │ │ ├── config/ │ │ │ │ ├── InfluxDBCommonOptions.java │ │ │ │ ├── InfluxDBConfig.java │ │ │ │ ├── InfluxDBSinkOptions.java │ │ │ │ ├── InfluxDBSourceOptions.java │ │ │ │ ├── SinkConfig.java │ │ │ │ ├── SourceConfig.java │ │ │ │ └── TimePrecision.java │ │ │ ├── converter/ │ │ │ │ └── InfluxDBRowConverter.java │ │ │ ├── exception/ │ │ │ │ ├── InfluxdbConnectorErrorCode.java │ │ │ │ └── InfluxdbConnectorException.java │ │ │ ├── serialize/ │ │ │ │ ├── DefaultSerializer.java │ │ │ │ └── Serializer.java │ │ │ ├── sink/ │ │ │ │ ├── InfluxDBSink.java │ │ │ │ ├── InfluxDBSinkFactory.java │ │ │ │ └── InfluxDBSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── InfluxDBSource.java │ │ │ │ ├── InfluxDBSourceFactory.java │ │ │ │ ├── InfluxDBSourceSplit.java │ │ │ │ ├── InfluxDBSourceSplitEnumerator.java │ │ │ │ └── InfluxdbSourceReader.java │ │ │ └── state/ │ │ │ └── InfluxDBSourceState.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── influxdb/ │ │ └── InfluxDBFactoryTest.java │ ├── connector-iotdb/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── iotdb/ │ │ │ ├── config/ │ │ │ │ ├── CommonConfig.java │ │ │ │ ├── IoTDBCommonOptions.java │ │ │ │ ├── IoTDBSinkOptions.java │ │ │ │ ├── IoTDBSourceOptions.java │ │ │ │ └── SinkConfig.java │ │ │ ├── constant/ │ │ │ │ └── SourceConstants.java │ │ │ ├── exception/ │ │ │ │ ├── IotdbConnectorErrorCode.java │ │ │ │ └── IotdbConnectorException.java │ │ │ ├── serialize/ │ │ │ │ ├── DefaultSeaTunnelRowDeserializer.java │ │ │ │ ├── DefaultSeaTunnelRowSerializer.java │ │ │ │ ├── IoTDBRecord.java │ │ │ │ ├── SeaTunnelRowDeserializer.java │ │ │ │ └── SeaTunnelRowSerializer.java │ │ │ ├── sink/ │ │ │ │ ├── IoTDBSink.java │ │ │ │ ├── IoTDBSinkClient.java │ │ │ │ ├── 
IoTDBSinkFactory.java │ │ │ │ └── IoTDBSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── IoTDBSource.java │ │ │ │ ├── IoTDBSourceFactory.java │ │ │ │ ├── IoTDBSourceReader.java │ │ │ │ ├── IoTDBSourceSplit.java │ │ │ │ └── IoTDBSourceSplitEnumerator.java │ │ │ └── state/ │ │ │ └── IoTDBSourceState.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── iotdb/ │ │ └── IoTDBFactoryTest.java │ ├── connector-iotdb-v2/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── iotdbv2/ │ │ │ ├── config/ │ │ │ │ ├── CommonConfig.java │ │ │ │ ├── IoTDBv2CommonOptions.java │ │ │ │ ├── IoTDBv2SinkOptions.java │ │ │ │ ├── IoTDBv2SourceOptions.java │ │ │ │ └── SinkConfig.java │ │ │ ├── constant/ │ │ │ │ ├── SinkConstants.java │ │ │ │ └── SourceConstants.java │ │ │ ├── exception/ │ │ │ │ ├── IotdbConnectorErrorCode.java │ │ │ │ └── IotdbConnectorException.java │ │ │ ├── serialize/ │ │ │ │ ├── DefaultSeaTunnelRowDeserializer.java │ │ │ │ ├── DefaultSeaTunnelRowSerializer.java │ │ │ │ ├── IoTDBv2Record.java │ │ │ │ ├── SeaTunnelRowDeserializer.java │ │ │ │ ├── SeaTunnelRowSerializer.java │ │ │ │ └── relational/ │ │ │ │ ├── IoTDBv2RelationalRecord.java │ │ │ │ └── RelationalSeaTunnelRowSerializer.java │ │ │ ├── sink/ │ │ │ │ ├── IoTDBv2Sink.java │ │ │ │ ├── IoTDBv2SinkClient.java │ │ │ │ ├── IoTDBv2SinkFactory.java │ │ │ │ ├── IoTDBv2SinkWriter.java │ │ │ │ └── relational/ │ │ │ │ ├── IoTDBv2RelationalSinkClient.java │ │ │ │ └── IoTDBv2RelationalSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── IoTDBv2AbstractSourceReader.java │ │ │ │ ├── IoTDBv2Source.java │ │ │ │ ├── IoTDBv2SourceFactory.java │ │ │ │ ├── IoTDBv2SourceReader.java │ │ │ │ ├── IoTDBv2SourceSplit.java │ │ │ │ ├── IoTDBv2SourceSplitEnumerator.java │ │ │ │ └── relational/ │ │ │ │ └── IoTDBv2RelationalSourceReader.java │ │ │ └── state/ │ │ │ └── 
IoTDBv2SourceState.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── iotdbv2/ │ │ └── IoTDBFactoryTest.java │ ├── connector-jdbc/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── jdbc/ │ │ │ ├── catalog/ │ │ │ │ ├── AbstractJdbcCatalog.java │ │ │ │ ├── AbstractJdbcCreateTableSqlBuilder.java │ │ │ │ ├── dm/ │ │ │ │ │ ├── DamengCatalog.java │ │ │ │ │ ├── DamengCatalogFactory.java │ │ │ │ │ ├── DamengCreateTableSqlBuilder.java │ │ │ │ │ └── DamengDataTypeConvertor.java │ │ │ │ ├── duckdb/ │ │ │ │ │ ├── DuckDBCatalog.java │ │ │ │ │ ├── DuckDBCatalogFactory.java │ │ │ │ │ ├── DuckDBCreateTableSqlBuilder.java │ │ │ │ │ └── DuckDBURLParser.java │ │ │ │ ├── highgo/ │ │ │ │ │ ├── HighGoCatalog.java │ │ │ │ │ └── HighGoCatalogFactory.java │ │ │ │ ├── iris/ │ │ │ │ │ ├── IrisCatalog.java │ │ │ │ │ ├── IrisCatalogFactory.java │ │ │ │ │ ├── IrisCreateTableSqlBuilder.java │ │ │ │ │ └── savemode/ │ │ │ │ │ └── IrisSaveModeHandler.java │ │ │ │ ├── kingbase/ │ │ │ │ │ ├── KingbaseCatalog.java │ │ │ │ │ ├── KingbaseCatalogFactory.java │ │ │ │ │ └── KingbaseCreateTableSqlBuilder.java │ │ │ │ ├── mysql/ │ │ │ │ │ ├── MySqlCatalog.java │ │ │ │ │ ├── MySqlCatalogFactory.java │ │ │ │ │ ├── MysqlCreateTableSqlBuilder.java │ │ │ │ │ └── MysqlDataTypeConvertor.java │ │ │ │ ├── oceanbase/ │ │ │ │ │ ├── OceanBaseCatalogFactory.java │ │ │ │ │ ├── OceanBaseMySqlCatalog.java │ │ │ │ │ ├── OceanBaseMysqlCreateTableSqlBuilder.java │ │ │ │ │ ├── OceanBaseOracleCatalog.java │ │ │ │ │ └── OceanBaseOracleCreateTableSqlBuilder.java │ │ │ │ ├── opengauss/ │ │ │ │ │ ├── OpenGaussCatalog.java │ │ │ │ │ └── OpenGaussCatalogFactory.java │ │ │ │ ├── oracle/ │ │ │ │ │ ├── OracleCatalog.java │ │ │ │ │ ├── OracleCatalogFactory.java │ │ │ │ │ ├── OracleCreateTableSqlBuilder.java │ │ │ │ │ ├── OracleDataTypeConvertor.java 
│ │ │ │ │ └── OracleURLParser.java │ │ │ │ ├── psql/ │ │ │ │ │ ├── PostgresCatalog.java │ │ │ │ │ ├── PostgresCatalogFactory.java │ │ │ │ │ ├── PostgresCreateTableSqlBuilder.java │ │ │ │ │ └── PostgresDataTypeConvertor.java │ │ │ │ ├── redshift/ │ │ │ │ │ ├── RedshiftCatalog.java │ │ │ │ │ ├── RedshiftCatalogFactory.java │ │ │ │ │ ├── RedshiftCreateTableSqlBuilder.java │ │ │ │ │ └── RedshiftDataTypeConvertor.java │ │ │ │ ├── saphana/ │ │ │ │ │ ├── SapHanaCatalog.java │ │ │ │ │ ├── SapHanaCatalogFactory.java │ │ │ │ │ ├── SapHanaCreateTableSqlBuilder.java │ │ │ │ │ └── SapHanaURLParser.java │ │ │ │ ├── snowflake/ │ │ │ │ │ └── SnowflakeDataTypeConvertor.java │ │ │ │ ├── sqlserver/ │ │ │ │ │ ├── SqlServerCatalog.java │ │ │ │ │ ├── SqlServerCatalogFactory.java │ │ │ │ │ ├── SqlServerCreateTableSqlBuilder.java │ │ │ │ │ ├── SqlServerDataTypeConvertor.java │ │ │ │ │ ├── SqlServerType.java │ │ │ │ │ └── SqlServerURLParser.java │ │ │ │ ├── tidb/ │ │ │ │ │ ├── TiDBCatalog.java │ │ │ │ │ ├── TiDBCatalogFactory.java │ │ │ │ │ └── TiDBDataTypeConvertor.java │ │ │ │ ├── utils/ │ │ │ │ │ ├── CatalogUtils.java │ │ │ │ │ ├── JdbcColumnConverter.java │ │ │ │ │ └── JdbcIdentifierUtils.java │ │ │ │ └── xugu/ │ │ │ │ ├── XuguCatalog.java │ │ │ │ ├── XuguCatalogFactory.java │ │ │ │ └── XuguCreateTableSqlBuilder.java │ │ │ ├── config/ │ │ │ │ ├── JdbcCommonOptions.java │ │ │ │ ├── JdbcConnectionConfig.java │ │ │ │ ├── JdbcSinkConfig.java │ │ │ │ ├── JdbcSinkOptions.java │ │ │ │ ├── JdbcSourceConfig.java │ │ │ │ ├── JdbcSourceOptions.java │ │ │ │ └── JdbcSourceTableConfig.java │ │ │ ├── exception/ │ │ │ │ ├── JdbcConnectorErrorCode.java │ │ │ │ └── JdbcConnectorException.java │ │ │ ├── internal/ │ │ │ │ ├── JdbcInputFormat.java │ │ │ │ ├── JdbcOutputFormat.java │ │ │ │ ├── JdbcOutputFormatBuilder.java │ │ │ │ ├── connection/ │ │ │ │ │ ├── DataSourceUtils.java │ │ │ │ │ ├── JdbcConnectionProvider.java │ │ │ │ │ ├── SimpleJdbcConnectionPoolProviderProxy.java │ │ │ │ │ └── 
SimpleJdbcConnectionProvider.java │ │ │ │ ├── converter/ │ │ │ │ │ ├── AbstractJdbcRowConverter.java │ │ │ │ │ └── JdbcRowConverter.java │ │ │ │ ├── dialect/ │ │ │ │ │ ├── DatabaseIdentifier.java │ │ │ │ │ ├── GenericDialect.java │ │ │ │ │ ├── GenericDialectFactory.java │ │ │ │ │ ├── GenericTypeConverter.java │ │ │ │ │ ├── GenericTypeMapper.java │ │ │ │ │ ├── JdbcDialect.java │ │ │ │ │ ├── JdbcDialectFactory.java │ │ │ │ │ ├── JdbcDialectLoader.java │ │ │ │ │ ├── JdbcDialectTypeMapper.java │ │ │ │ │ ├── SQLUtils.java │ │ │ │ │ ├── db2/ │ │ │ │ │ │ ├── DB2Dialect.java │ │ │ │ │ │ ├── DB2DialectFactory.java │ │ │ │ │ │ ├── DB2JdbcRowConverter.java │ │ │ │ │ │ ├── DB2TypeConverter.java │ │ │ │ │ │ └── DB2TypeMapper.java │ │ │ │ │ ├── dialectenum/ │ │ │ │ │ │ └── FieldIdeEnum.java │ │ │ │ │ ├── dm/ │ │ │ │ │ │ ├── DmdbDialect.java │ │ │ │ │ │ ├── DmdbDialectFactory.java │ │ │ │ │ │ ├── DmdbJdbcRowConverter.java │ │ │ │ │ │ ├── DmdbTypeConverter.java │ │ │ │ │ │ └── DmdbTypeMapper.java │ │ │ │ │ ├── dsql/ │ │ │ │ │ │ ├── DdsqlJdbcConnectionPoolProviderProxy.java │ │ │ │ │ │ ├── DsqlConnectionPoolManager.java │ │ │ │ │ │ ├── DsqlDialect.java │ │ │ │ │ │ ├── DsqlDialectFactory.java │ │ │ │ │ │ ├── DsqlJdbcConnectionProvider.java │ │ │ │ │ │ └── DsqlJdbcRowConverter.java │ │ │ │ │ ├── duckdb/ │ │ │ │ │ │ ├── DuckDBDialect.java │ │ │ │ │ │ ├── DuckDBDialectFactory.java │ │ │ │ │ │ ├── DuckDBJdbcRowConverter.java │ │ │ │ │ │ ├── DuckDBTypeConverter.java │ │ │ │ │ │ └── DuckDBTypeMapper.java │ │ │ │ │ ├── gbase8a/ │ │ │ │ │ │ ├── Gbase8aDialect.java │ │ │ │ │ │ ├── Gbase8aDialectFactory.java │ │ │ │ │ │ ├── Gbase8aJdbcRowConverter.java │ │ │ │ │ │ └── Gbase8aTypeMapper.java │ │ │ │ │ ├── greenplum/ │ │ │ │ │ │ └── GreenplumDialectFactory.java │ │ │ │ │ ├── highgo/ │ │ │ │ │ │ └── HighGoDialectFactory.java │ │ │ │ │ ├── hive/ │ │ │ │ │ │ ├── HadoopLoginFactory.java │ │ │ │ │ │ ├── HiveDialect.java │ │ │ │ │ │ ├── HiveDialectFactory.java │ │ │ │ │ │ ├── 
HiveJdbcConnectionProvider.java │ │ │ │ │ │ ├── HiveJdbcRowConverter.java │ │ │ │ │ │ └── HiveTypeMapper.java │ │ │ │ │ ├── inceptor/ │ │ │ │ │ │ ├── InceptorDialect.java │ │ │ │ │ │ └── InceptorJdbcRowConverter.java │ │ │ │ │ ├── iris/ │ │ │ │ │ │ ├── IrisDialect.java │ │ │ │ │ │ ├── IrisDialectFactory.java │ │ │ │ │ │ ├── IrisJdbcRowConverter.java │ │ │ │ │ │ ├── IrisTypeConverter.java │ │ │ │ │ │ └── IrisTypeMapper.java │ │ │ │ │ ├── kingbase/ │ │ │ │ │ │ ├── KingbaseDialect.java │ │ │ │ │ │ ├── KingbaseDialectFactory.java │ │ │ │ │ │ ├── KingbaseJdbcRowConverter.java │ │ │ │ │ │ ├── KingbaseTypeConverter.java │ │ │ │ │ │ └── KingbaseTypeMapper.java │ │ │ │ │ ├── mysql/ │ │ │ │ │ │ ├── MySqlDialectFactory.java │ │ │ │ │ │ ├── MySqlTypeConverter.java │ │ │ │ │ │ ├── MySqlTypeMapper.java │ │ │ │ │ │ ├── MySqlVersion.java │ │ │ │ │ │ ├── MysqlDialect.java │ │ │ │ │ │ └── MysqlJdbcRowConverter.java │ │ │ │ │ ├── oceanbase/ │ │ │ │ │ │ ├── OceanBaseDialectFactory.java │ │ │ │ │ │ ├── OceanBaseMySqlTypeConverter.java │ │ │ │ │ │ ├── OceanBaseMySqlTypeMapper.java │ │ │ │ │ │ ├── OceanBaseMysqlDialect.java │ │ │ │ │ │ ├── OceanBaseMysqlJdbcRowConverter.java │ │ │ │ │ │ └── OceanBaseMysqlType.java │ │ │ │ │ ├── opengauss/ │ │ │ │ │ │ ├── OpenGaussDialect.java │ │ │ │ │ │ └── OpenGaussDialectFactory.java │ │ │ │ │ ├── oracle/ │ │ │ │ │ │ ├── OracleDialect.java │ │ │ │ │ │ ├── OracleDialectFactory.java │ │ │ │ │ │ ├── OracleJdbcRowConverter.java │ │ │ │ │ │ ├── OracleTypeConverter.java │ │ │ │ │ │ └── OracleTypeMapper.java │ │ │ │ │ ├── phoenix/ │ │ │ │ │ │ ├── PhoenixDialect.java │ │ │ │ │ │ ├── PhoenixDialectFactory.java │ │ │ │ │ │ ├── PhoenixJdbcRowConverter.java │ │ │ │ │ │ ├── PhoenixTypeConverter.java │ │ │ │ │ │ └── PhoenixTypeMapper.java │ │ │ │ │ ├── presto/ │ │ │ │ │ │ ├── PrestoDialect.java │ │ │ │ │ │ ├── PrestoDialectFactory.java │ │ │ │ │ │ ├── PrestoJdbcRowConverter.java │ │ │ │ │ │ └── PrestoTypeMapper.java │ │ │ │ │ ├── psql/ │ │ │ │ │ │ ├── 
PostgresDialect.java │ │ │ │ │ │ ├── PostgresDialectFactory.java │ │ │ │ │ │ ├── PostgresJdbcRowConverter.java │ │ │ │ │ │ ├── PostgresTypeConverter.java │ │ │ │ │ │ └── PostgresTypeMapper.java │ │ │ │ │ ├── psqllow/ │ │ │ │ │ │ └── PostgresLowDialect.java │ │ │ │ │ ├── redshift/ │ │ │ │ │ │ ├── RedshiftDialect.java │ │ │ │ │ │ ├── RedshiftDialectFactory.java │ │ │ │ │ │ ├── RedshiftJdbcRowConverter.java │ │ │ │ │ │ ├── RedshiftTypeConverter.java │ │ │ │ │ │ └── RedshiftTypeMapper.java │ │ │ │ │ ├── saphana/ │ │ │ │ │ │ ├── SapHanaDialect.java │ │ │ │ │ │ ├── SapHanaDialectFactory.java │ │ │ │ │ │ ├── SapHanaJdbcRowConverter.java │ │ │ │ │ │ ├── SapHanaTypeConverter.java │ │ │ │ │ │ └── SapHanaTypeMapper.java │ │ │ │ │ ├── snowflake/ │ │ │ │ │ │ ├── SnowflakeDialect.java │ │ │ │ │ │ ├── SnowflakeDialectFactory.java │ │ │ │ │ │ ├── SnowflakeJdbcRowConverter.java │ │ │ │ │ │ ├── SnowflakeTypeConverter.java │ │ │ │ │ │ └── SnowflakeTypeMapper.java │ │ │ │ │ ├── sqlite/ │ │ │ │ │ │ ├── SqliteDialect.java │ │ │ │ │ │ ├── SqliteDialectFactory.java │ │ │ │ │ │ ├── SqliteJdbcRowConverter.java │ │ │ │ │ │ └── SqliteTypeMapper.java │ │ │ │ │ ├── sqlserver/ │ │ │ │ │ │ ├── SqlServerDialect.java │ │ │ │ │ │ ├── SqlServerDialectFactory.java │ │ │ │ │ │ ├── SqlServerTypeConverter.java │ │ │ │ │ │ ├── SqlserverJdbcRowConverter.java │ │ │ │ │ │ └── SqlserverTypeMapper.java │ │ │ │ │ ├── starrocks/ │ │ │ │ │ │ └── StarRocksDialect.java │ │ │ │ │ ├── tablestore/ │ │ │ │ │ │ ├── TablestoreDialect.java │ │ │ │ │ │ ├── TablestoreDialectFactory.java │ │ │ │ │ │ ├── TablestoreJdbcRowConverter.java │ │ │ │ │ │ └── TablestoreTypeMapper.java │ │ │ │ │ ├── teradata/ │ │ │ │ │ │ ├── TeradataDialect.java │ │ │ │ │ │ ├── TeradataDialectFactory.java │ │ │ │ │ │ ├── TeradataJdbcRowConverter.java │ │ │ │ │ │ └── TeradataTypeMapper.java │ │ │ │ │ ├── vertica/ │ │ │ │ │ │ ├── VerticaDialect.java │ │ │ │ │ │ ├── VerticaDialectFactory.java │ │ │ │ │ │ ├── VerticaJdbcRowConverter.java │ │ │ │ │ │ └── 
VerticaTypeMapper.java │ │ │ │ │ └── xugu/ │ │ │ │ │ ├── XuguDialect.java │ │ │ │ │ ├── XuguDialectFactory.java │ │ │ │ │ ├── XuguJdbcRowConverter.java │ │ │ │ │ ├── XuguTypeConverter.java │ │ │ │ │ └── XuguTypeMapper.java │ │ │ │ ├── executor/ │ │ │ │ │ ├── BufferReducedBatchStatementExecutor.java │ │ │ │ │ ├── BufferedBatchStatementExecutor.java │ │ │ │ │ ├── CopyManagerBatchStatementExecutor.java │ │ │ │ │ ├── CopyManagerProxy.java │ │ │ │ │ ├── FieldNamedPreparedStatement.java │ │ │ │ │ ├── InsertOrUpdateBatchStatementExecutor.java │ │ │ │ │ ├── JdbcBatchStatementExecutor.java │ │ │ │ │ ├── SimpleBatchStatementExecutor.java │ │ │ │ │ └── StatementFactory.java │ │ │ │ ├── split/ │ │ │ │ │ ├── JdbcGenericParameterValuesProvider.java │ │ │ │ │ ├── JdbcNumericBetweenParametersProvider.java │ │ │ │ │ └── JdbcParameterValuesProvider.java │ │ │ │ └── xa/ │ │ │ │ ├── GroupXaOperationResult.java │ │ │ │ ├── SemanticXidGenerator.java │ │ │ │ ├── XaFacade.java │ │ │ │ ├── XaFacadeImplAutoLoad.java │ │ │ │ ├── XaGroupOps.java │ │ │ │ ├── XaGroupOpsImpl.java │ │ │ │ ├── XidGenerator.java │ │ │ │ └── XidImpl.java │ │ │ ├── sink/ │ │ │ │ ├── AbstractJdbcSinkWriter.java │ │ │ │ ├── ConnectionPoolManager.java │ │ │ │ ├── JdbcExactlyOnceSinkWriter.java │ │ │ │ ├── JdbcMultiTableResourceManager.java │ │ │ │ ├── JdbcSink.java │ │ │ │ ├── JdbcSinkAggregatedCommitter.java │ │ │ │ ├── JdbcSinkCommitter.java │ │ │ │ ├── JdbcSinkFactory.java │ │ │ │ ├── JdbcSinkWriter.java │ │ │ │ └── savemode/ │ │ │ │ └── JdbcSaveModeHandler.java │ │ │ ├── source/ │ │ │ │ ├── ChunkSplitter.java │ │ │ │ ├── CollationBasedSplitter.java │ │ │ │ ├── DynamicChunkSplitter.java │ │ │ │ ├── FixedChunkSplitter.java │ │ │ │ ├── JdbcSource.java │ │ │ │ ├── JdbcSourceFactory.java │ │ │ │ ├── JdbcSourceReader.java │ │ │ │ ├── JdbcSourceSplit.java │ │ │ │ ├── JdbcSourceSplitEnumerator.java │ │ │ │ ├── JdbcSourceTable.java │ │ │ │ └── StringSplitMode.java │ │ │ ├── state/ │ │ │ │ ├── JdbcAggregatedCommitInfo.java │ 
│ │ │ ├── JdbcSinkState.java │ │ │ │ ├── JdbcSourceState.java │ │ │ │ └── XidInfo.java │ │ │ └── utils/ │ │ │ ├── DefaultValueUtils.java │ │ │ ├── HiveJdbcUtils.java │ │ │ ├── JdbcCatalogUtils.java │ │ │ ├── JdbcFieldTypeUtils.java │ │ │ ├── ObjectUtils.java │ │ │ └── ThrowingRunnable.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── jdbc/ │ │ ├── JdbcFactoryTest.java │ │ ├── catalog/ │ │ │ ├── DataTypeConvertorTest.java │ │ │ ├── MysqlDataTypeConvertorTest.java │ │ │ ├── PreviewActionTest.java │ │ │ ├── SnowflakeDataTypeConvertorTest.java │ │ │ ├── dm/ │ │ │ │ ├── DamengCreateTableSqlBuilderTest.java │ │ │ │ └── DamengJdbcTest.java │ │ │ ├── driver/ │ │ │ │ ├── DriverSelectionTest.java │ │ │ │ ├── ExpectedDriver.java │ │ │ │ └── OtherDriver.java │ │ │ ├── duckdb/ │ │ │ │ └── DuckDBCatalogTest.java │ │ │ ├── iris/ │ │ │ │ └── IrisCreateTableSqlBuilderTest.java │ │ │ ├── kingbase/ │ │ │ │ ├── KingbaseCatalogTest.java │ │ │ │ └── KingbaseCreateTableSqlBuilderTest.java │ │ │ ├── mysql/ │ │ │ │ ├── MySqlCatalogTest.java │ │ │ │ └── MysqlCreateTableSqlBuilderTest.java │ │ │ ├── oceanbase/ │ │ │ │ └── OceanBaseOracleCreateTableSqlBuilderTest.java │ │ │ ├── oracle/ │ │ │ │ ├── OracleCatalogTest.java │ │ │ │ └── OracleCreateTableSqlBuilderTest.java │ │ │ ├── psql/ │ │ │ │ ├── PostgresCatalogTest.java │ │ │ │ └── PostgresCreateTableSqlBuilderTest.java │ │ │ ├── redshift/ │ │ │ │ ├── RedshiftCatalogTest.java │ │ │ │ └── RedshiftCreateTableSqlBuilderTest.java │ │ │ ├── saphana/ │ │ │ │ └── SapHanaCreateTableSqlBuilderTest.java │ │ │ ├── sqlserver/ │ │ │ │ ├── SqlServerCatalogTest.java │ │ │ │ ├── SqlServerCreateTableSqlBuilderTest.java │ │ │ │ └── SqlServerURLParserTest.java │ │ │ ├── utils/ │ │ │ │ ├── CatalogUtilsTest.java │ │ │ │ ├── TestConnection.java │ │ │ │ ├── TestDatabaseMetaData.java │ │ │ │ └── TestResultSet.java │ │ │ └── xugu/ │ │ │ └── XuguCreateTableSqlBuilderTest.java │ │ ├── 
internal/ │ │ │ ├── JdbcOutputFormatBuilderTest.java │ │ │ ├── dialect/ │ │ │ │ ├── JdbcDialectLoaderTest.java │ │ │ │ ├── PostgresDialectFactoryTest.java │ │ │ │ ├── db2/ │ │ │ │ │ ├── DB2DialectTest.java │ │ │ │ │ └── Db2TypeConverterTest.java │ │ │ │ ├── dm/ │ │ │ │ │ ├── DmdbDialectTest.java │ │ │ │ │ └── DmdbTypeConverterTest.java │ │ │ │ ├── duckdb/ │ │ │ │ │ ├── DuckDBDialectTest.java │ │ │ │ │ ├── DuckDBSourceAndSinkTest.java │ │ │ │ │ └── DuckDBTypeConverterTest.java │ │ │ │ ├── hive/ │ │ │ │ │ └── HiveDialectFactoryTest.java │ │ │ │ ├── iris/ │ │ │ │ │ └── IrisTypeConverterTest.java │ │ │ │ ├── kingbase/ │ │ │ │ │ ├── KingbaseTypeConverterTest.java │ │ │ │ │ └── container/ │ │ │ │ │ ├── AbstractKingbaseContainerTest.java │ │ │ │ │ ├── KingbaseCatalogContainerTest.java │ │ │ │ │ └── KingbaseDialectContainerTest.java │ │ │ │ ├── mysql/ │ │ │ │ │ ├── MySqlTypeConverterTest.java │ │ │ │ │ ├── MySqlTypeMapperTest.java │ │ │ │ │ ├── MysqlDialectTest.java │ │ │ │ │ └── MysqlVersionTest.java │ │ │ │ ├── oceanbase/ │ │ │ │ │ └── OceanBaseMySqlTypeMapperTest.java │ │ │ │ ├── opengauss/ │ │ │ │ │ └── OpenGaussDialectTest.java │ │ │ │ ├── oracle/ │ │ │ │ │ └── OracleTypeConverterTest.java │ │ │ │ ├── psql/ │ │ │ │ │ ├── PostgresDialectTest.java │ │ │ │ │ ├── PostgresJdbcRowConverterTest.java │ │ │ │ │ └── PostgresTypeConverterTest.java │ │ │ │ ├── redshift/ │ │ │ │ │ └── RedshiftTypeConverterTest.java │ │ │ │ ├── saphana/ │ │ │ │ │ └── SapHanaTypeConverterTest.java │ │ │ │ ├── sqlserver/ │ │ │ │ │ └── SqlServerTypeConverterTest.java │ │ │ │ ├── vertica/ │ │ │ │ │ └── VerticaDialectTest.java │ │ │ │ └── xugu/ │ │ │ │ └── XuguTypeConverterTest.java │ │ │ ├── executor/ │ │ │ │ ├── BufferExecutorTest.java │ │ │ │ ├── BufferReducedBatchStatementExecutorTest.java │ │ │ │ ├── BufferedBatchStatementExecutorTest.java │ │ │ │ └── FieldNamedPreparedStatementTest.java │ │ │ └── xa/ │ │ │ └── SemanticXidGeneratorTest.java │ │ ├── sink/ │ │ │ └── JdbcExactlyOnceSinkWriterTest.java 
│ │ ├── source/ │ │ │ ├── CharsetBasedSplitterTest.java │ │ │ ├── DynamicChunkSplitterTest.java │ │ │ ├── FixedChunkSplitterTest.java │ │ │ └── JdbcSourceSplitEnumeratorTest.java │ │ └── utils/ │ │ ├── JdbcCatalogUtilsTest.java │ │ ├── JdbcFieldTypeUtilsTest.java │ │ └── ObjectUtilsTest.java │ ├── connector-kafka/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── kafka/ │ │ │ ├── config/ │ │ │ │ ├── KafkaBaseConstants.java │ │ │ │ ├── KafkaBaseOptions.java │ │ │ │ ├── KafkaSemantics.java │ │ │ │ ├── KafkaSinkOptions.java │ │ │ │ ├── KafkaSourceOptions.java │ │ │ │ ├── MessageFormat.java │ │ │ │ ├── MessageFormatErrorHandleWay.java │ │ │ │ ├── StartMode.java │ │ │ │ └── TableIdentifierConfig.java │ │ │ ├── exception/ │ │ │ │ ├── KafkaConnectorErrorCode.java │ │ │ │ └── KafkaConnectorException.java │ │ │ ├── serialize/ │ │ │ │ ├── DefaultSeaTunnelRowSerializer.java │ │ │ │ └── SeaTunnelRowSerializer.java │ │ │ ├── sink/ │ │ │ │ ├── KafkaInternalProducer.java │ │ │ │ ├── KafkaNoTransactionSender.java │ │ │ │ ├── KafkaProduceSender.java │ │ │ │ ├── KafkaSink.java │ │ │ │ ├── KafkaSinkCommitter.java │ │ │ │ ├── KafkaSinkFactory.java │ │ │ │ ├── KafkaSinkWriter.java │ │ │ │ ├── KafkaTransactionSender.java │ │ │ │ └── MessageContentPartitioner.java │ │ │ ├── source/ │ │ │ │ ├── ConsumerMetadata.java │ │ │ │ ├── KafkaEventTimeDeserializationSchema.java │ │ │ │ ├── KafkaPartitionSplitReader.java │ │ │ │ ├── KafkaRecordEmitter.java │ │ │ │ ├── KafkaSource.java │ │ │ │ ├── KafkaSourceConfig.java │ │ │ │ ├── KafkaSourceFactory.java │ │ │ │ ├── KafkaSourceReader.java │ │ │ │ ├── KafkaSourceSplit.java │ │ │ │ ├── KafkaSourceSplitEnumerator.java │ │ │ │ ├── KafkaSourceSplitState.java │ │ │ │ └── fetch/ │ │ │ │ └── KafkaSourceFetcherManager.java │ │ │ └── state/ │ │ │ ├── KafkaAggregatedCommitInfo.java │ │ │ ├── KafkaCommitInfo.java │ │ │ ├── KafkaSinkState.java │ │ 
│ └── KafkaSourceState.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ ├── kafka/ │ │ │ └── clients/ │ │ │ └── admin/ │ │ │ └── KafkaSourceSplitEnumeratorTest.java │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── kafka/ │ │ ├── KafkaFactoryTest.java │ │ ├── KafkaStartOffsetTest.java │ │ ├── serialize/ │ │ │ └── DefaultSeaTunnelRowSerializerTest.java │ │ └── source/ │ │ ├── KafkaRecordEmitterTest.java │ │ └── KafkaSourceConfigTest.java │ ├── connector-kudu/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── kudu/ │ │ │ │ ├── catalog/ │ │ │ │ │ ├── KuduCatalog.java │ │ │ │ │ └── KuduCatalogFactory.java │ │ │ │ ├── config/ │ │ │ │ │ ├── CommonConfig.java │ │ │ │ │ ├── KuduBaseOptions.java │ │ │ │ │ ├── KuduSinkConfig.java │ │ │ │ │ ├── KuduSinkOptions.java │ │ │ │ │ ├── KuduSourceConfig.java │ │ │ │ │ ├── KuduSourceOptions.java │ │ │ │ │ └── KuduSourceTableConfig.java │ │ │ │ ├── exception/ │ │ │ │ │ ├── KuduConnectorErrorCode.java │ │ │ │ │ └── KuduConnectorException.java │ │ │ │ ├── kuduclient/ │ │ │ │ │ ├── KuduInputFormat.java │ │ │ │ │ ├── KuduOutputFormat.java │ │ │ │ │ └── KuduTypeMapper.java │ │ │ │ ├── serialize/ │ │ │ │ │ ├── KuduRowSerializer.java │ │ │ │ │ └── SeaTunnelRowSerializer.java │ │ │ │ ├── sink/ │ │ │ │ │ ├── KuduSink.java │ │ │ │ │ ├── KuduSinkFactory.java │ │ │ │ │ └── KuduSinkWriter.java │ │ │ │ ├── source/ │ │ │ │ │ ├── KuduSource.java │ │ │ │ │ ├── KuduSourceFactory.java │ │ │ │ │ ├── KuduSourceReader.java │ │ │ │ │ ├── KuduSourceSplit.java │ │ │ │ │ └── KuduSourceSplitEnumerator.java │ │ │ │ ├── state/ │ │ │ │ │ ├── KuduAggregatedCommitInfo.java │ │ │ │ │ ├── KuduCommitInfo.java │ │ │ │ │ ├── KuduSinkState.java │ │ │ │ │ └── KuduSourceState.java │ │ │ │ └── util/ │ │ │ │ └── KuduUtil.java │ │ │ └── resources/ │ │ │ ├── kudu_to_kudu_flink.conf │ │ │ └── kudu_to_kudu_spark.conf │ │ 
└── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── kudu/ │ │ ├── KuduFactoryTest.java │ │ ├── catalog/ │ │ │ └── KuduCatalogTest.java │ │ └── config/ │ │ └── KuduSourceTableConfigTest.java │ ├── connector-lance/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── lance/ │ │ │ ├── catalog/ │ │ │ │ ├── LanceCatalog.java │ │ │ │ ├── LanceCatalogFactory.java │ │ │ │ ├── LanceCatalogLoader.java │ │ │ │ └── LanceNamespaceType.java │ │ │ ├── config/ │ │ │ │ ├── LanceCommonConfig.java │ │ │ │ ├── LanceCommonOptions.java │ │ │ │ ├── LanceSinkConfig.java │ │ │ │ └── LanceSinkOptions.java │ │ │ ├── data/ │ │ │ │ └── LanceTypeMapper.java │ │ │ ├── exception/ │ │ │ │ ├── LanceConnectorErrorCode.java │ │ │ │ └── LanceConnectorException.java │ │ │ ├── sink/ │ │ │ │ ├── LanceSink.java │ │ │ │ ├── LanceSinkFactory.java │ │ │ │ ├── LanceSinkWriter.java │ │ │ │ ├── commit/ │ │ │ │ │ ├── LanceAggregatedCommitInfo.java │ │ │ │ │ └── LanceCommitInfo.java │ │ │ │ └── writers/ │ │ │ │ ├── BaseTypeWriter.java │ │ │ │ ├── BinaryTypeWriter.java │ │ │ │ ├── BoolTypeWriter.java │ │ │ │ ├── DateTypeWriter.java │ │ │ │ ├── DecimalTypeWriter.java │ │ │ │ ├── FloatingPointTypeWriter.java │ │ │ │ ├── IntTypeWriter.java │ │ │ │ ├── ListTypeWriter.java │ │ │ │ ├── MapTypeWriter.java │ │ │ │ ├── TimestampTypeWriter.java │ │ │ │ ├── TypeWriter.java │ │ │ │ ├── TypeWriterFactory.java │ │ │ │ └── Utf8TypeWriter.java │ │ │ ├── state/ │ │ │ │ └── LanceSinkState.java │ │ │ └── utils/ │ │ │ ├── FragmentConverter.java │ │ │ └── SchemaUtils.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── lance/ │ │ ├── LanceFactoryTest.java │ │ ├── namespace/ │ │ │ └── LanceCatalogTest.java │ │ └── sink/ │ │ └── LanceSinkTest.java │ ├── connector-maxcompute/ │ │ ├── 
pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── maxcompute/ │ │ │ │ ├── catalog/ │ │ │ │ │ ├── MaxComputeCatalog.java │ │ │ │ │ ├── MaxComputeCatalogFactory.java │ │ │ │ │ ├── MaxComputeCatalogUtil.java │ │ │ │ │ └── MaxComputeDataTypeConvertor.java │ │ │ │ ├── config/ │ │ │ │ │ ├── MaxcomputeBaseOptions.java │ │ │ │ │ ├── MaxcomputeSinkOptions.java │ │ │ │ │ └── MaxcomputeSourceOptions.java │ │ │ │ ├── datatype/ │ │ │ │ │ └── MaxComputeTypeConverter.java │ │ │ │ ├── exception/ │ │ │ │ │ └── MaxcomputeConnectorException.java │ │ │ │ ├── sink/ │ │ │ │ │ ├── MaxComputeSaveModeHandler.java │ │ │ │ │ ├── MaxcomputeSink.java │ │ │ │ │ ├── MaxcomputeSinkFactory.java │ │ │ │ │ └── MaxcomputeWriter.java │ │ │ │ ├── source/ │ │ │ │ │ ├── MaxcomputeSource.java │ │ │ │ │ ├── MaxcomputeSourceFactory.java │ │ │ │ │ ├── MaxcomputeSourceReader.java │ │ │ │ │ ├── MaxcomputeSourceSplit.java │ │ │ │ │ ├── MaxcomputeSourceSplitEnumerator.java │ │ │ │ │ ├── MaxcomputeSourceState.java │ │ │ │ │ └── SourceTableInfo.java │ │ │ │ └── util/ │ │ │ │ ├── CreateTableParser.java │ │ │ │ ├── FormatterContext.java │ │ │ │ ├── MaxcomputeOutputFormat.java │ │ │ │ ├── MaxcomputeTypeMapper.java │ │ │ │ └── MaxcomputeUtil.java │ │ │ └── resources/ │ │ │ └── maxcompute_to_maxcompute.conf │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── maxcompute/ │ │ ├── BasicTypeToOdpsTypeTest.java │ │ ├── MaxcomputeSourceFactoryTest.java │ │ ├── catalog/ │ │ │ ├── MaxComputeCatalogUtilTest.java │ │ │ ├── MaxComputeCreateTableTest.java │ │ │ ├── MaxComputeDataTypeConvertorTest.java │ │ │ └── PreviewActionTest.java │ │ ├── datatype/ │ │ │ └── MaxComputeTypeConvertorTest.java │ │ └── source/ │ │ └── MaxcomputeSourceTest.java │ ├── connector-milvus/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ 
│ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── milvus/ │ │ │ ├── catalog/ │ │ │ │ ├── MilvusCatalog.java │ │ │ │ ├── MilvusCatalogFactory.java │ │ │ │ └── MilvusOptions.java │ │ │ ├── config/ │ │ │ │ ├── MilvusBaseOptions.java │ │ │ │ ├── MilvusSinkOptions.java │ │ │ │ └── MilvusSourceOptions.java │ │ │ ├── exception/ │ │ │ │ ├── MilvusConnectionErrorCode.java │ │ │ │ └── MilvusConnectorException.java │ │ │ ├── sink/ │ │ │ │ ├── MilvusBufferBatchWriter.java │ │ │ │ ├── MilvusSink.java │ │ │ │ ├── MilvusSinkCommitter.java │ │ │ │ ├── MilvusSinkFactory.java │ │ │ │ └── MilvusSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── MilvusSource.java │ │ │ │ ├── MilvusSourceFactory.java │ │ │ │ ├── MilvusSourceReader.java │ │ │ │ ├── MilvusSourceSplit.java │ │ │ │ ├── MilvusSourceSplitEnumerator.java │ │ │ │ └── MilvusSourceState.java │ │ │ ├── state/ │ │ │ │ ├── MilvusAggregatedCommitInfo.java │ │ │ │ ├── MilvusCommitInfo.java │ │ │ │ └── MilvusSinkState.java │ │ │ └── utils/ │ │ │ ├── MilvusConnectorUtils.java │ │ │ ├── MilvusConvertUtils.java │ │ │ ├── sink/ │ │ │ │ └── MilvusSinkConverter.java │ │ │ └── source/ │ │ │ └── MilvusSourceConverter.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── milvus/ │ │ ├── catalog/ │ │ │ └── MilvusCatalogTest.java │ │ └── utils/ │ │ ├── MilvusConvertUtilsTest.java │ │ └── sink/ │ │ └── MilvusSinkConverterTest.java │ ├── connector-mongodb/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── mongodb/ │ │ │ ├── catalog/ │ │ │ │ ├── MongodbCatalog.java │ │ │ │ └── MongodbCatalogFactory.java │ │ │ ├── config/ │ │ │ │ ├── MongodbBaseOptions.java │ │ │ │ ├── MongodbSinkOptions.java │ │ │ │ └── MongodbSourceOptions.java │ │ │ ├── exception/ │ │ │ │ └── MongodbConnectorException.java │ │ │ ├── internal/ │ │ │ │ ├── 
MongodbClientProvider.java │ │ │ │ ├── MongodbCollectionProvider.java │ │ │ │ └── MongodbSingleCollectionProvider.java │ │ │ ├── serde/ │ │ │ │ ├── BsonToRowDataConverters.java │ │ │ │ ├── DocumentDeserializer.java │ │ │ │ ├── DocumentRowDataDeserializer.java │ │ │ │ ├── DocumentSerializer.java │ │ │ │ ├── RowDataDocumentSerializer.java │ │ │ │ ├── RowDataToBsonConverters.java │ │ │ │ └── SerializableFunction.java │ │ │ ├── sink/ │ │ │ │ ├── MongoKeyExtractor.java │ │ │ │ ├── MongodbSink.java │ │ │ │ ├── MongodbSinkFactory.java │ │ │ │ ├── MongodbWriter.java │ │ │ │ ├── MongodbWriterOptions.java │ │ │ │ ├── commit/ │ │ │ │ │ ├── CommittableTransaction.java │ │ │ │ │ ├── CommittableUpsertTransaction.java │ │ │ │ │ └── MongodbSinkAggregatedCommitter.java │ │ │ │ ├── savemode/ │ │ │ │ │ └── MongodbSaveModeHandler.java │ │ │ │ └── state/ │ │ │ │ ├── DocumentBulk.java │ │ │ │ ├── MongodbAggregatedCommitInfo.java │ │ │ │ └── MongodbCommitInfo.java │ │ │ └── source/ │ │ │ ├── MongodbSource.java │ │ │ ├── MongodbSourceFactory.java │ │ │ ├── config/ │ │ │ │ └── MongodbReadOptions.java │ │ │ ├── enumerator/ │ │ │ │ └── MongodbSplitEnumerator.java │ │ │ ├── reader/ │ │ │ │ └── MongodbReader.java │ │ │ └── split/ │ │ │ ├── MongoSplit.java │ │ │ ├── MongoSplitStrategy.java │ │ │ ├── MongoSplitUtils.java │ │ │ └── SamplingSplitStrategy.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── mongodb/ │ │ ├── MongodbFactoryTest.java │ │ ├── serde/ │ │ │ └── BsonToRowDataConvertersTest.java │ │ └── source/ │ │ └── split/ │ │ └── SamplingSplitStrategyTest.java │ ├── connector-neo4j/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── neo4j/ │ │ │ ├── config/ │ │ │ │ ├── DriverBuilder.java │ │ │ │ ├── Neo4jBaseOptions.java │ │ │ │ ├── Neo4jQueryInfo.java │ │ │ │ ├── Neo4jSinkOptions.java │ │ │ │ ├── 
Neo4jSinkQueryInfo.java │ │ │ │ ├── Neo4jSourceOptions.java │ │ │ │ └── Neo4jSourceQueryInfo.java │ │ │ ├── constants/ │ │ │ │ ├── CypherEnum.java │ │ │ │ └── SinkWriteMode.java │ │ │ ├── exception/ │ │ │ │ ├── Neo4jConnectorErrorCode.java │ │ │ │ └── Neo4jConnectorException.java │ │ │ ├── internal/ │ │ │ │ └── SeaTunnelRowNeo4jValue.java │ │ │ ├── sink/ │ │ │ │ ├── Neo4jSink.java │ │ │ │ ├── Neo4jSinkFactory.java │ │ │ │ └── Neo4jSinkWriter.java │ │ │ └── source/ │ │ │ ├── Neo4jSource.java │ │ │ ├── Neo4jSourceFactory.java │ │ │ └── Neo4jSourceReader.java │ │ └── test/ │ │ └── java/ │ │ ├── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── neo4j/ │ │ │ └── Neo4jFactoryTest.java │ │ └── org.apache.seatunnel.connectors.seatunnel.neo4j.source/ │ │ └── Neo4jSourceReaderTest.java │ ├── connector-openmldb/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── openmldb/ │ │ │ ├── config/ │ │ │ │ ├── OpenMldbParameters.java │ │ │ │ ├── OpenMldbSourceOptions.java │ │ │ │ └── OpenMldbSqlExecutor.java │ │ │ ├── exception/ │ │ │ │ └── OpenMldbConnectorException.java │ │ │ └── source/ │ │ │ ├── OpenMldbSource.java │ │ │ ├── OpenMldbSourceFactory.java │ │ │ └── OpenMldbSourceReader.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── openmldb/ │ │ └── OpenMldbFactoryTest.java │ ├── connector-paimon/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── paimon/ │ │ │ │ ├── catalog/ │ │ │ │ │ ├── PaimonCatalog.java │ │ │ │ │ ├── PaimonCatalogEnum.java │ │ │ │ │ ├── PaimonCatalogFactory.java │ │ │ │ │ ├── PaimonCatalogLoader.java │ │ │ │ │ └── PaimonTable.java │ │ │ │ ├── config/ │ │ │ │ │ ├── PaimonBaseOptions.java │ │ │ │ │ ├── 
PaimonConfig.java │ │ │ │ │ ├── PaimonHadoopConfiguration.java │ │ │ │ │ ├── PaimonSinkConfig.java │ │ │ │ │ ├── PaimonSinkOptions.java │ │ │ │ │ ├── PaimonSourceConfig.java │ │ │ │ │ ├── PaimonSourceOptions.java │ │ │ │ │ └── PaimonSourceTableConfig.java │ │ │ │ ├── data/ │ │ │ │ │ └── PaimonTypeMapper.java │ │ │ │ ├── exception/ │ │ │ │ │ ├── PaimonConnectorErrorCode.java │ │ │ │ │ └── PaimonConnectorException.java │ │ │ │ ├── filesystem/ │ │ │ │ │ └── S3Loader.java │ │ │ │ ├── handler/ │ │ │ │ │ └── PaimonSaveModeHandler.java │ │ │ │ ├── security/ │ │ │ │ │ └── PaimonSecurityContext.java │ │ │ │ ├── sink/ │ │ │ │ │ ├── PaimonSink.java │ │ │ │ │ ├── PaimonSinkFactory.java │ │ │ │ │ ├── PaimonSinkWriter.java │ │ │ │ │ ├── SupportLoadTable.java │ │ │ │ │ ├── bucket/ │ │ │ │ │ │ ├── PaimonBucketAssigner.java │ │ │ │ │ │ ├── PaimonBucketAssignerFactory.java │ │ │ │ │ │ └── RowAssignerChannelComputer.java │ │ │ │ │ ├── commit/ │ │ │ │ │ │ ├── PaimonAggregatedCommitInfo.java │ │ │ │ │ │ ├── PaimonAggregatedCommitter.java │ │ │ │ │ │ └── PaimonCommitInfo.java │ │ │ │ │ ├── schema/ │ │ │ │ │ │ ├── UpdatedDataFields.java │ │ │ │ │ │ └── handler/ │ │ │ │ │ │ └── AlterPaimonTableSchemaEventHandler.java │ │ │ │ │ └── state/ │ │ │ │ │ └── PaimonSinkState.java │ │ │ │ ├── source/ │ │ │ │ │ ├── PaimonSource.java │ │ │ │ │ ├── PaimonSourceFactory.java │ │ │ │ │ ├── PaimonSourceReader.java │ │ │ │ │ ├── PaimonSourceSplit.java │ │ │ │ │ ├── PaimonSourceSplitGenerator.java │ │ │ │ │ ├── PaimonSourceState.java │ │ │ │ │ ├── converter/ │ │ │ │ │ │ └── SqlToPaimonPredicateConverter.java │ │ │ │ │ └── enumerator/ │ │ │ │ │ ├── AbstractSplitEnumerator.java │ │ │ │ │ ├── PaimonBatchSourceSplitEnumerator.java │ │ │ │ │ └── PaimonStreamSourceSplitEnumerator.java │ │ │ │ └── utils/ │ │ │ │ ├── RowConverter.java │ │ │ │ ├── RowKindConverter.java │ │ │ │ ├── RowTypeConverter.java │ │ │ │ └── SchemaUtil.java │ │ │ └── resources/ │ │ │ └── META-INF/ │ │ │ └── services/ │ │ │ └── 
org.apache.paimon.fs.FileIOLoader │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── paimon/ │ │ ├── catalog/ │ │ │ ├── PaimonCatalogPrimaryTest.java │ │ │ ├── PaimonCatalogTest.java │ │ │ ├── PaimonPrivilegeCatalogTest.java │ │ │ └── PaimonWithCommentTest.java │ │ ├── config/ │ │ │ └── PaimonSourceTableConfigTest.java │ │ ├── sink/ │ │ │ ├── bucket/ │ │ │ │ └── PaimonBucketAssignerTest.java │ │ │ ├── schema/ │ │ │ │ └── UpdatedDataFieldsTest.java │ │ │ └── writer/ │ │ │ └── PaimonWriteTest.java │ │ ├── source/ │ │ │ ├── PaimonDynamicOptionsTest.java │ │ │ └── converter/ │ │ │ └── SqlToPaimonConverterTest.java │ │ └── utils/ │ │ ├── RowConverterTest.java │ │ ├── RowTypeConverterTest.java │ │ └── SchemaUtilTest.java │ ├── connector-prometheus/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── prometheus/ │ │ │ ├── Exception/ │ │ │ │ └── PrometheusConnectorException.java │ │ │ ├── config/ │ │ │ │ ├── PrometheusQueryType.java │ │ │ │ ├── PrometheusSinkConfig.java │ │ │ │ ├── PrometheusSinkOptions.java │ │ │ │ ├── PrometheusSourceConfig.java │ │ │ │ ├── PrometheusSourceOptions.java │ │ │ │ └── PrometheusSourceParameter.java │ │ │ ├── pojo/ │ │ │ │ ├── InstantPoint.java │ │ │ │ └── RangePoint.java │ │ │ ├── serialize/ │ │ │ │ ├── PrometheusSerializer.java │ │ │ │ └── Serializer.java │ │ │ ├── sink/ │ │ │ │ ├── Point.java │ │ │ │ ├── PrometheusSink.java │ │ │ │ ├── PrometheusSinkFactory.java │ │ │ │ ├── PrometheusWriter.java │ │ │ │ └── proto/ │ │ │ │ ├── GoGoProtos.java │ │ │ │ ├── Remote.java │ │ │ │ └── Types.java │ │ │ └── source/ │ │ │ ├── PrometheusSource.java │ │ │ ├── PrometheusSourceFactory.java │ │ │ └── PrometheusSourceReader.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── prometheus/ │ │ ├── 
PrometheusFactoryTest.java │ │ └── PrometheusParamCheckTest.java │ ├── connector-pulsar/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── pulsar/ │ │ │ ├── config/ │ │ │ │ ├── BasePulsarConfig.java │ │ │ │ ├── PulsarAdminConfig.java │ │ │ │ ├── PulsarBaseOptions.java │ │ │ │ ├── PulsarClientConfig.java │ │ │ │ ├── PulsarConfigUtil.java │ │ │ │ ├── PulsarConsumerConfig.java │ │ │ │ ├── PulsarSemantics.java │ │ │ │ ├── PulsarSinkOptions.java │ │ │ │ └── PulsarSourceOptions.java │ │ │ ├── exception/ │ │ │ │ ├── PulsarConnectorErrorCode.java │ │ │ │ └── PulsarConnectorException.java │ │ │ ├── sink/ │ │ │ │ ├── PulsarSink.java │ │ │ │ ├── PulsarSinkCommitter.java │ │ │ │ ├── PulsarSinkFactory.java │ │ │ │ └── PulsarSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── PulsarSource.java │ │ │ │ ├── PulsarSourceFactory.java │ │ │ │ ├── enumerator/ │ │ │ │ │ ├── PulsarSplitEnumerator.java │ │ │ │ │ ├── PulsarSplitEnumeratorState.java │ │ │ │ │ ├── cursor/ │ │ │ │ │ │ ├── start/ │ │ │ │ │ │ │ ├── MessageIdStartCursor.java │ │ │ │ │ │ │ ├── StartCursor.java │ │ │ │ │ │ │ ├── SubscriptionStartCursor.java │ │ │ │ │ │ │ └── TimestampStartCursor.java │ │ │ │ │ │ └── stop/ │ │ │ │ │ │ ├── LatestMessageStopCursor.java │ │ │ │ │ │ ├── MessageIdStopCursor.java │ │ │ │ │ │ ├── NeverStopCursor.java │ │ │ │ │ │ ├── StopCursor.java │ │ │ │ │ │ └── TimestampStopCursor.java │ │ │ │ │ ├── discoverer/ │ │ │ │ │ │ ├── PulsarDiscoverer.java │ │ │ │ │ │ ├── TopicListDiscoverer.java │ │ │ │ │ │ └── TopicPatternDiscoverer.java │ │ │ │ │ └── topic/ │ │ │ │ │ └── TopicPartition.java │ │ │ │ ├── format/ │ │ │ │ │ └── PulsarCanalDecorator.java │ │ │ │ ├── reader/ │ │ │ │ │ ├── PulsarSourceReader.java │ │ │ │ │ ├── PulsarSplitReaderThread.java │ │ │ │ │ └── RecordWithSplitId.java │ │ │ │ └── split/ │ │ │ │ └── PulsarPartitionSplit.java │ │ │ └── state/ │ │ │ ├── PulsarAggregatedCommitInfo.java 
│ │ │ ├── PulsarCommitInfo.java │ │ │ └── PulsarSinkState.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── pulsar/ │ │ └── source/ │ │ ├── PulsarCanalDecoratorTest.java │ │ └── PulsarSourceFactoryTest.java │ ├── connector-qdrant/ │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── qdrant/ │ │ ├── config/ │ │ │ ├── QdrantBaseOptions.java │ │ │ ├── QdrantParameters.java │ │ │ ├── QdrantSinkOptions.java │ │ │ └── QdrantSourceOptions.java │ │ ├── exception/ │ │ │ └── QdrantConnectorException.java │ │ ├── sink/ │ │ │ ├── QdrantBatchWriter.java │ │ │ ├── QdrantSink.java │ │ │ ├── QdrantSinkFactory.java │ │ │ └── QdrantSinkWriter.java │ │ └── source/ │ │ ├── QdrantSource.java │ │ ├── QdrantSourceFactory.java │ │ └── QdrantSourceReader.java │ ├── connector-rabbitmq/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── rabbitmq/ │ │ │ ├── client/ │ │ │ │ ├── QueueingConsumer.java │ │ │ │ └── RabbitmqClient.java │ │ │ ├── config/ │ │ │ │ ├── RabbitmqBaseOptions.java │ │ │ │ ├── RabbitmqConfig.java │ │ │ │ ├── RabbitmqSinkOptions.java │ │ │ │ └── RabbitmqSourceOptions.java │ │ │ ├── exception/ │ │ │ │ ├── RabbitmqConnectorErrorCode.java │ │ │ │ └── RabbitmqConnectorException.java │ │ │ ├── sink/ │ │ │ │ ├── RabbitmqSink.java │ │ │ │ ├── RabbitmqSinkFactory.java │ │ │ │ └── RabbitmqSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── DeliveryMessage.java │ │ │ │ ├── RabbitmqSource.java │ │ │ │ ├── RabbitmqSourceFactory.java │ │ │ │ ├── RabbitmqSourceReader.java │ │ │ │ ├── RabbitmqSourceState.java │ │ │ │ └── RabbitmqSplitEnumerator.java │ │ │ └── split/ │ │ │ ├── RabbitmqSplit.java │ │ │ └── RabbitmqSplitEnumeratorState.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── 
seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── rabbitmq/ │ │ └── RabbitmqFactoryTest.java │ ├── connector-redis/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── redis/ │ │ │ ├── client/ │ │ │ │ ├── RedisClient.java │ │ │ │ ├── RedisClusterClient.java │ │ │ │ └── RedisSingleClient.java │ │ │ ├── config/ │ │ │ │ ├── JedisWrapper.java │ │ │ │ ├── RedisBaseOptions.java │ │ │ │ ├── RedisContainerInfo.java │ │ │ │ ├── RedisDataType.java │ │ │ │ ├── RedisParameters.java │ │ │ │ ├── RedisSinkOptions.java │ │ │ │ └── RedisSourceOptions.java │ │ │ ├── exception/ │ │ │ │ ├── RedisConnectorException.java │ │ │ │ └── RedisErrorCode.java │ │ │ ├── sink/ │ │ │ │ ├── RedisSink.java │ │ │ │ ├── RedisSinkFactory.java │ │ │ │ └── RedisSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── KeyedRecordReader.java │ │ │ │ ├── RedisRecordReader.java │ │ │ │ ├── RedisSource.java │ │ │ │ ├── RedisSourceFactory.java │ │ │ │ ├── RedisSourceReader.java │ │ │ │ └── UnKeyedRecordReader.java │ │ │ └── util/ │ │ │ ├── JsonKeyValueMerger.java │ │ │ ├── KeyValueMerger.java │ │ │ └── KeyValueMergerFactory.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── redis/ │ │ ├── Redis5Test.java │ │ ├── Redis7Test.java │ │ ├── RedisFactoryTest.java │ │ ├── RedisTemplateTest.java │ │ ├── row/ │ │ │ ├── TestForDeleteRows.java │ │ │ └── TestKeyOrValueIsNullRows.java │ │ └── sink/ │ │ └── RedisSinkWriterTest.java │ ├── connector-rocketmq/ │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── rocketmq/ │ │ ├── common/ │ │ │ ├── RocketMqAdminUtil.java │ │ │ ├── RocketMqBaseConfiguration.java │ │ │ ├── SchemaFormat.java │ │ │ └── StartMode.java │ │ ├── config/ │ │ │ ├── RocketMqBaseOptions.java │ │ │ ├── 
RocketMqSinkOptions.java │ │ │ └── RocketMqSourceOptions.java │ │ ├── exception/ │ │ │ ├── RocketMqConnectorErrorCode.java │ │ │ └── RocketMqConnectorException.java │ │ ├── serialize/ │ │ │ ├── DefaultSeaTunnelRowSerializer.java │ │ │ └── SeaTunnelRowSerializer.java │ │ ├── sink/ │ │ │ ├── ProducerMetadata.java │ │ │ ├── RocketMqNoTransactionSender.java │ │ │ ├── RocketMqProducerSender.java │ │ │ ├── RocketMqSink.java │ │ │ ├── RocketMqSinkFactory.java │ │ │ ├── RocketMqSinkWriter.java │ │ │ └── RocketMqTransactionSender.java │ │ └── source/ │ │ ├── ConsumerMetadata.java │ │ ├── RocketMqConsumerThread.java │ │ ├── RocketMqSource.java │ │ ├── RocketMqSourceFactory.java │ │ ├── RocketMqSourceReader.java │ │ ├── RocketMqSourceSplit.java │ │ ├── RocketMqSourceSplitEnumerator.java │ │ └── RocketMqSourceState.java │ ├── connector-s3-redshift/ │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── redshift/ │ │ ├── RedshiftJdbcClient.java │ │ ├── commit/ │ │ │ └── S3RedshiftSinkAggregatedCommitter.java │ │ ├── config/ │ │ │ └── S3RedshiftConfigOptions.java │ │ ├── exception/ │ │ │ ├── S3RedshiftConnectorErrorCode.java │ │ │ └── S3RedshiftJdbcConnectorException.java │ │ └── sink/ │ │ ├── S3RedshiftFactory.java │ │ └── S3RedshiftSink.java │ ├── connector-selectdb-cloud/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── selectdb/ │ │ │ ├── config/ │ │ │ │ ├── SelectDBConfig.java │ │ │ │ └── SelectDBSinkOptions.java │ │ │ ├── exception/ │ │ │ │ ├── SelectDBConnectorErrorCode.java │ │ │ │ └── SelectDBConnectorException.java │ │ │ ├── rest/ │ │ │ │ ├── BaseResponse.java │ │ │ │ ├── CopyIntoResp.java │ │ │ │ └── CopySQLUtil.java │ │ │ ├── serialize/ │ │ │ │ ├── SeaTunnelRowConverter.java │ │ │ │ ├── SeaTunnelRowSerializer.java │ │ │ │ └── SelectDBSerializer.java │ │ │ ├── sink/ │ │ │ │ ├── 
EscapeHandler.java │ │ │ │ ├── SelectDBSink.java │ │ │ │ ├── SelectDBSinkFactory.java │ │ │ │ ├── committer/ │ │ │ │ │ ├── SelectDBCommitInfo.java │ │ │ │ │ ├── SelectDBCommitInfoSerializer.java │ │ │ │ │ └── SelectDBCommitter.java │ │ │ │ └── writer/ │ │ │ │ ├── CopySQLBuilder.java │ │ │ │ ├── LabelGenerator.java │ │ │ │ ├── LoadConstants.java │ │ │ │ ├── LoadStatus.java │ │ │ │ ├── RecordBuffer.java │ │ │ │ ├── SelectDBSinkState.java │ │ │ │ ├── SelectDBSinkStateSerializer.java │ │ │ │ ├── SelectDBSinkWriter.java │ │ │ │ └── SelectDBStageLoad.java │ │ │ └── util/ │ │ │ ├── HttpPostBuilder.java │ │ │ ├── HttpPutBuilder.java │ │ │ ├── HttpUtil.java │ │ │ └── ResponseUtil.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── selectdb/ │ │ └── serialize/ │ │ ├── SeaTunnelRowConverterTest.java │ │ └── SelectDBConfigSerializableTest.java │ ├── connector-sensorsdata/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── sensorsdata/ │ │ │ ├── format/ │ │ │ │ ├── SensorsDataTypes.java │ │ │ │ ├── config/ │ │ │ │ │ ├── SensorsDataBaseOptionRules.java │ │ │ │ │ ├── SensorsDataConfigBase.java │ │ │ │ │ ├── SensorsDataOptions.java │ │ │ │ │ └── TargetColumnConfig.java │ │ │ │ ├── exception/ │ │ │ │ │ ├── SensorsDataErrorCode.java │ │ │ │ │ └── SensorsDataException.java │ │ │ │ ├── record/ │ │ │ │ │ ├── RowAccessor.java │ │ │ │ │ ├── SensorsDataJsonKeys.java │ │ │ │ │ ├── SensorsDataLibInfo.java │ │ │ │ │ ├── SensorsDataRecord.java │ │ │ │ │ ├── SensorsDataRecordBuilder.java │ │ │ │ │ ├── SensorsDataRecordType.java │ │ │ │ │ ├── SpecialItemRecord.java │ │ │ │ │ ├── UserDetailRecord.java │ │ │ │ │ ├── UserEventRecord.java │ │ │ │ │ ├── UserRecord.java │ │ │ │ │ └── UserRecordBase.java │ │ │ │ └── utils/ │ │ │ │ ├── TypeUtil.java │ │ │ │ └── UserSchemaUtil.java │ │ │ └── sdk/ │ │ │ ├── config/ │ │ │ │ ├── SensorsDataSDKSinkConfig.java 
│ │ │ │ └── SensorsDataSDKSinkOptions.java │ │ │ ├── exception/ │ │ │ │ ├── SensorsDataConnectorErrorCode.java │ │ │ │ └── SensorsDataConnectorException.java │ │ │ ├── sink/ │ │ │ │ ├── SensorsDataSDKSink.java │ │ │ │ ├── SensorsDataSDKSinkFactory.java │ │ │ │ └── SensorsDataSDKWriter.java │ │ │ └── state/ │ │ │ ├── SensorsDataAggregatedCommitInfo.java │ │ │ ├── SensorsDataCommitInfo.java │ │ │ └── SensorsDataSinkState.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── sensorsdata/ │ │ ├── format/ │ │ │ ├── SensorsDataTypesTest.java │ │ │ ├── record/ │ │ │ │ ├── SensorsDataSpecialItemRecordTest.java │ │ │ │ └── SensorsDataUserRecordTest.java │ │ │ └── utils/ │ │ │ └── TypeUtilTest.java │ │ └── sdk/ │ │ └── SensorsDataSDKFactoryTest.java │ ├── connector-sentry/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── sentry/ │ │ │ ├── config/ │ │ │ │ └── SentrySinkOptions.java │ │ │ ├── exception/ │ │ │ │ └── SentryConnectorException.java │ │ │ └── sink/ │ │ │ ├── SentrySink.java │ │ │ ├── SentrySinkFactory.java │ │ │ └── SentrySinkWriter.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── sentry/ │ │ └── SentryFactoryTest.java │ ├── connector-slack/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── slack/ │ │ │ ├── client/ │ │ │ │ └── SlackClient.java │ │ │ ├── config/ │ │ │ │ └── SlackSinkOptions.java │ │ │ ├── exception/ │ │ │ │ ├── SlackConnectorErrorCode.java │ │ │ │ └── SlackConnectorException.java │ │ │ └── sink/ │ │ │ ├── SlackSink.java │ │ │ ├── SlackSinkFactory.java │ │ │ └── SlackWriter.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── 
seatunnel/ │ │ └── slack/ │ │ └── SlackFactoryTest.java │ ├── connector-sls/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── sls/ │ │ │ ├── config/ │ │ │ │ ├── SlsBaseOptions.java │ │ │ │ ├── SlsSinkOptions.java │ │ │ │ ├── SlsSourceOptions.java │ │ │ │ └── StartMode.java │ │ │ ├── serialization/ │ │ │ │ ├── FastLogDeserialization.java │ │ │ │ ├── FastLogDeserializationContent.java │ │ │ │ ├── FastLogDeserializationSchema.java │ │ │ │ └── SeatunnelRowSerialization.java │ │ │ ├── sink/ │ │ │ │ ├── SlsSink.java │ │ │ │ ├── SlsSinkCommitter.java │ │ │ │ ├── SlsSinkFactory.java │ │ │ │ └── SlsSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── ConsumerMetaData.java │ │ │ │ ├── SlsConsumerThread.java │ │ │ │ ├── SlsSource.java │ │ │ │ ├── SlsSourceConfig.java │ │ │ │ ├── SlsSourceFactory.java │ │ │ │ ├── SlsSourceReader.java │ │ │ │ ├── SlsSourceSplit.java │ │ │ │ └── SlsSourceSplitEnumerator.java │ │ │ └── state/ │ │ │ ├── SlsAggregatedCommitInfo.java │ │ │ ├── SlsCommitInfo.java │ │ │ ├── SlsSinkState.java │ │ │ └── SlsSourceState.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── sls/ │ │ └── SlsFactoryTest.java │ ├── connector-socket/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── socket/ │ │ │ ├── config/ │ │ │ │ ├── SocketCommonOptions.java │ │ │ │ ├── SocketConfig.java │ │ │ │ ├── SocketSinkOptions.java │ │ │ │ └── SocketSourceOptions.java │ │ │ ├── exception/ │ │ │ │ ├── SocketConnectorErrorCode.java │ │ │ │ └── SocketConnectorException.java │ │ │ ├── sink/ │ │ │ │ ├── SocketClient.java │ │ │ │ ├── SocketSink.java │ │ │ │ ├── SocketSinkFactory.java │ │ │ │ └── SocketSinkWriter.java │ │ │ └── source/ │ │ │ ├── SocketSource.java │ │ │ ├── 
SocketSourceFactory.java │ │ │ └── SocketSourceReader.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── socket/ │ │ └── SocketFactoryTest.java │ ├── connector-starrocks/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── starrocks/ │ │ │ ├── catalog/ │ │ │ │ ├── StarRocksCatalog.java │ │ │ │ ├── StarRocksCatalogFactory.java │ │ │ │ └── StarRocksDataTypeConvertor.java │ │ │ ├── client/ │ │ │ │ ├── HttpHelper.java │ │ │ │ ├── StarRocksFlushTuple.java │ │ │ │ ├── StarRocksSinkManager.java │ │ │ │ ├── StarRocksStreamLoadVisitor.java │ │ │ │ └── source/ │ │ │ │ ├── StarRocksBeReadClient.java │ │ │ │ ├── StarRocksQueryPlanReadClient.java │ │ │ │ └── model/ │ │ │ │ ├── Column.java │ │ │ │ ├── QueryBeXTablets.java │ │ │ │ ├── QueryInfo.java │ │ │ │ ├── QueryPartition.java │ │ │ │ ├── QueryPlan.java │ │ │ │ └── Tablet.java │ │ │ ├── config/ │ │ │ │ ├── SinkConfig.java │ │ │ │ ├── SourceConfig.java │ │ │ │ ├── StarRocksBaseOptions.java │ │ │ │ ├── StarRocksConfig.java │ │ │ │ ├── StarRocksSinkOptions.java │ │ │ │ ├── StarRocksSourceOptions.java │ │ │ │ └── StarRocksSourceTableConfig.java │ │ │ ├── datatypes/ │ │ │ │ ├── StarRocksType.java │ │ │ │ └── StarRocksTypeConverter.java │ │ │ ├── exception/ │ │ │ │ ├── StarRocksConnectorErrorCode.java │ │ │ │ └── StarRocksConnectorException.java │ │ │ ├── serialize/ │ │ │ │ ├── StarRocksBaseSerializer.java │ │ │ │ ├── StarRocksCsvSerializer.java │ │ │ │ ├── StarRocksDelimiterParser.java │ │ │ │ ├── StarRocksISerializer.java │ │ │ │ ├── StarRocksJsonSerializer.java │ │ │ │ └── StarRocksSinkOP.java │ │ │ ├── sink/ │ │ │ │ ├── StarRocksSaveModeUtil.java │ │ │ │ ├── StarRocksSink.java │ │ │ │ ├── StarRocksSinkFactory.java │ │ │ │ └── StarRocksSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── StarRocksSource.java │ │ │ │ ├── StarRocksSourceFactory.java 
│ │ │ │ ├── StarRocksSourceReader.java │ │ │ │ ├── StarRocksSourceSplit.java │ │ │ │ ├── StarRocksSourceState.java │ │ │ │ └── StartRocksSourceSplitEnumerator.java │ │ │ └── util/ │ │ │ └── SchemaUtils.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── starrocks/ │ │ ├── StarRocksFactoryTest.java │ │ ├── catalog/ │ │ │ ├── DataTypeConvertorTest.java │ │ │ ├── PreviewActionTest.java │ │ │ ├── StarRocksCreateTableTest.java │ │ │ └── StarRocksTypeConverterTest.java │ │ ├── client/ │ │ │ ├── StarRocksSinkManagerTest.java │ │ │ └── StarRocksStreamLoadVisitorTest.java │ │ ├── serialize/ │ │ │ └── StarRocksJsonSerializerTest.java │ │ └── sink/ │ │ └── StarRocksSaveModeUtilTest.java │ ├── connector-tablestore/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── tablestore/ │ │ │ ├── config/ │ │ │ │ ├── TableStoreCommonOptions.java │ │ │ │ ├── TableStoreConfig.java │ │ │ │ ├── TableStoreSinkOptions.java │ │ │ │ └── TableStoreSourceOptions.java │ │ │ ├── exception/ │ │ │ │ ├── TablestoreConnectorErrorCode.java │ │ │ │ └── TablestoreConnectorException.java │ │ │ ├── serialize/ │ │ │ │ ├── DefaultSeaTunnelRowDeserializer.java │ │ │ │ ├── DefaultSeaTunnelRowSerializer.java │ │ │ │ ├── SeaTunnelRowDeserializer.java │ │ │ │ └── SeaTunnelRowSerializer.java │ │ │ ├── sink/ │ │ │ │ ├── TableStoreSink.java │ │ │ │ ├── TableStoreSinkClient.java │ │ │ │ ├── TableStoreSinkFactory.java │ │ │ │ └── TableStoreWriter.java │ │ │ └── source/ │ │ │ ├── TableStoreProcessor.java │ │ │ ├── TableStoreSource.java │ │ │ ├── TableStoreSourceFactory.java │ │ │ ├── TableStoreSourceReader.java │ │ │ ├── TableStoreSourceSplit.java │ │ │ ├── TableStoreSourceSplitEnumerator.java │ │ │ └── TableStoreSourceState.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── 
seatunnel/ │ │ └── tablestore/ │ │ └── TableStoreFactoryTest.java │ ├── connector-tdengine/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── tdengine/ │ │ │ ├── config/ │ │ │ │ ├── TDengineCommonOptions.java │ │ │ │ ├── TDengineSinkConfig.java │ │ │ │ ├── TDengineSinkOptions.java │ │ │ │ ├── TDengineSourceConfig.java │ │ │ │ └── TDengineSourceOptions.java │ │ │ ├── exception/ │ │ │ │ ├── TDengineConnectorErrorCode.java │ │ │ │ └── TDengineConnectorException.java │ │ │ ├── sink/ │ │ │ │ ├── TDengineSink.java │ │ │ │ ├── TDengineSinkFactory.java │ │ │ │ └── TDengineSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── StableMetadata.java │ │ │ │ ├── TDengineSource.java │ │ │ │ ├── TDengineSourceFactory.java │ │ │ │ ├── TDengineSourceReader.java │ │ │ │ ├── TDengineSourceSplit.java │ │ │ │ └── TDengineSourceSplitEnumerator.java │ │ │ ├── state/ │ │ │ │ └── TDengineSourceState.java │ │ │ ├── typemapper/ │ │ │ │ └── TDengineTypeMapper.java │ │ │ └── utils/ │ │ │ └── TDengineUtil.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── tdengine/ │ │ ├── TDengineTest.java │ │ ├── sink/ │ │ │ └── TDengineSinkWriterTest.java │ │ ├── source/ │ │ │ └── TDengineSourceReaderTest.java │ │ └── typemapper/ │ │ └── TDengineTypeMapperTest.java │ ├── connector-typesense/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── connectors/ │ │ │ └── seatunnel/ │ │ │ └── typesense/ │ │ │ ├── catalog/ │ │ │ │ ├── TypesenseCatalog.java │ │ │ │ ├── TypesenseCatalogFactory.java │ │ │ │ └── TypesenseTypeConverter.java │ │ │ ├── client/ │ │ │ │ ├── TypesenseClient.java │ │ │ │ └── TypesenseType.java │ │ │ ├── config/ │ │ │ │ ├── TypesenseBaseOptions.java │ │ │ │ ├── TypesenseSinkOptions.java │ │ │ │ └── TypesenseSourceOptions.java │ │ │ ├── dto/ │ │ 
│ │ ├── CollectionInfo.java │ │ │ │ └── SourceCollectionInfo.java │ │ │ ├── exception/ │ │ │ │ ├── TypesenseConnectorErrorCode.java │ │ │ │ └── TypesenseConnectorException.java │ │ │ ├── serialize/ │ │ │ │ ├── KeyExtractor.java │ │ │ │ ├── sink/ │ │ │ │ │ ├── SeaTunnelRowSerializer.java │ │ │ │ │ ├── TypesenseRowSerializer.java │ │ │ │ │ └── collection/ │ │ │ │ │ ├── CollectionSerializer.java │ │ │ │ │ └── FixedValueCollectionSerializer.java │ │ │ │ └── source/ │ │ │ │ ├── DefaultSeaTunnelRowDeserializer.java │ │ │ │ ├── SeaTunnelRowDeserializer.java │ │ │ │ └── TypesenseRecord.java │ │ │ ├── sink/ │ │ │ │ ├── TypesenseSink.java │ │ │ │ ├── TypesenseSinkFactory.java │ │ │ │ └── TypesenseSinkWriter.java │ │ │ ├── source/ │ │ │ │ ├── TypesenseSource.java │ │ │ │ ├── TypesenseSourceFactory.java │ │ │ │ ├── TypesenseSourceReader.java │ │ │ │ ├── TypesenseSourceSplit.java │ │ │ │ ├── TypesenseSourceSplitEnumerator.java │ │ │ │ └── TypesenseSourceState.java │ │ │ ├── state/ │ │ │ │ ├── TypesenseAggregatedCommitInfo.java │ │ │ │ ├── TypesenseCommitInfo.java │ │ │ │ └── TypesenseSinkState.java │ │ │ └── util/ │ │ │ └── URLParamsConverter.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ └── typesense/ │ │ ├── serializer/ │ │ │ └── TypesenseRowSerializerTest.java │ │ ├── sink/ │ │ │ └── TypesenseFactoryTest.java │ │ └── util/ │ │ └── URLParamsConverterTest.java │ ├── connector-web3j/ │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── connectors/ │ │ └── seatunnel/ │ │ ├── config/ │ │ │ └── Web3jSourceOptions.java │ │ └── source/ │ │ ├── Web3jSource.java │ │ ├── Web3jSourceFactory.java │ │ ├── Web3jSourceParameter.java │ │ └── Web3jSourceReader.java │ └── pom.xml ├── seatunnel-core/ │ ├── README.md │ ├── pom.xml │ ├── seatunnel-core-starter/ │ │ ├── README.md │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ 
└── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── core/ │ │ │ │ └── starter/ │ │ │ │ ├── SeaTunnel.java │ │ │ │ ├── Starter.java │ │ │ │ ├── command/ │ │ │ │ │ ├── AbstractCommandArgs.java │ │ │ │ │ ├── Command.java │ │ │ │ │ ├── CommandArgs.java │ │ │ │ │ ├── ConfDecryptCommand.java │ │ │ │ │ ├── ConfEncryptCommand.java │ │ │ │ │ ├── ParameterSplitter.java │ │ │ │ │ └── UsageFormatter.java │ │ │ │ ├── constants/ │ │ │ │ │ └── SeaTunnelStarterConstants.java │ │ │ │ ├── enums/ │ │ │ │ │ └── MasterType.java │ │ │ │ ├── exception/ │ │ │ │ │ ├── CommandException.java │ │ │ │ │ ├── CommandExecuteException.java │ │ │ │ │ ├── ConfigCheckException.java │ │ │ │ │ └── TaskExecuteException.java │ │ │ │ ├── execution/ │ │ │ │ │ ├── PluginExecuteProcessor.java │ │ │ │ │ ├── RuntimeEnvironment.java │ │ │ │ │ ├── SourceTableInfo.java │ │ │ │ │ └── TaskExecution.java │ │ │ │ ├── flowcontrol/ │ │ │ │ │ ├── FlowControlGate.java │ │ │ │ │ └── FlowControlStrategy.java │ │ │ │ └── utils/ │ │ │ │ ├── CommandLineUtils.java │ │ │ │ ├── CompressionUtils.java │ │ │ │ ├── ConfigAdapterUtils.java │ │ │ │ ├── ConfigBuilder.java │ │ │ │ ├── ConfigShadeUtils.java │ │ │ │ └── FileUtils.java │ │ │ └── resources/ │ │ │ └── META-INF/ │ │ │ └── services/ │ │ │ └── org.apache.seatunnel.api.configuration.ConfigShade │ │ └── test/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── core/ │ │ │ └── starter/ │ │ │ ├── command/ │ │ │ │ ├── ConfDecryptCommandTest.java │ │ │ │ └── ConfEncryptCommandTest.java │ │ │ ├── execution/ │ │ │ │ └── RuntimeEnvironmentTest.java │ │ │ ├── flowcontrol/ │ │ │ │ └── FlowControlGateTest.java │ │ │ └── utils/ │ │ │ ├── CompressionUtilsTest.java │ │ │ ├── ConfigBuilderTest.java │ │ │ ├── ConfigShadeTest.java │ │ │ └── FileUtilsTest.java │ │ └── resources/ │ │ ├── META-INF/ │ │ │ └── services/ │ │ │ └── org.apache.seatunnel.api.configuration.ConfigShade │ │ ├── config.shade.conf │ │ ├── config.shade.json │ │ ├── config.shade_caseNull.conf │ │ ├── 
config.shade_with_props.json │ │ ├── config.shade_with_transform.json │ │ ├── config.variables.conf │ │ ├── config_table_list_variables.conf │ │ ├── config_variables_with_default_value.conf │ │ ├── config_variables_with_reserved_placeholder.conf │ │ ├── flink.batch.conf │ │ ├── log4j2.properties │ │ ├── origin.conf │ │ └── shade.conf │ ├── seatunnel-flink-starter/ │ │ ├── pom.xml │ │ ├── seatunnel-flink-13-starter/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── main/ │ │ │ ├── bin/ │ │ │ │ ├── start-seatunnel-flink-13-connector-v2.cmd │ │ │ │ └── start-seatunnel-flink-13-connector-v2.sh │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── core/ │ │ │ └── starter/ │ │ │ └── flink/ │ │ │ ├── FlinkStarter.java │ │ │ ├── SeaTunnelFlink.java │ │ │ └── execution/ │ │ │ ├── FlinkRuntimeEnvironment.java │ │ │ └── SinkExecuteProcessor.java │ │ ├── seatunnel-flink-15-starter/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── bin/ │ │ │ │ ├── start-seatunnel-flink-15-connector-v2.cmd │ │ │ │ └── start-seatunnel-flink-15-connector-v2.sh │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── core/ │ │ │ │ └── starter/ │ │ │ │ └── flink/ │ │ │ │ ├── FlinkCommandArgsTest.java │ │ │ │ └── multitable/ │ │ │ │ └── MultiTableSinkTest.java │ │ │ └── resources/ │ │ │ └── config/ │ │ │ ├── fake_to_inmemory.json │ │ │ ├── fake_to_inmemory_without_pluginname.json │ │ │ └── inmemory_to_inmemory_multi_table.conf │ │ ├── seatunnel-flink-20-starter/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── main/ │ │ │ ├── bin/ │ │ │ │ ├── start-seatunnel-flink-20-connector-v2.cmd │ │ │ │ └── start-seatunnel-flink-20-connector-v2.sh │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── core/ │ │ │ └── starter/ │ │ │ └── flink/ │ │ │ ├── FlinkStarter.java │ │ │ ├── SeaTunnelFlink.java │ │ │ └── execution/ │ │ │ └── SinkExecuteProcessor.java │ │ └── seatunnel-flink-starter-common/ │ │ ├── pom.xml │ │ 
└── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── core/ │ │ │ └── starter/ │ │ │ └── flink/ │ │ │ ├── AbstractFlinkStarter.java │ │ │ ├── AbstractSeaTunnelFlink.java │ │ │ ├── FlinkStarter.java │ │ │ ├── SeaTunnelFlink.java │ │ │ ├── args/ │ │ │ │ └── FlinkCommandArgs.java │ │ │ ├── command/ │ │ │ │ ├── FlinkConfValidateCommand.java │ │ │ │ └── FlinkTaskExecuteCommand.java │ │ │ ├── execution/ │ │ │ │ ├── AbstractFlinkRuntimeEnvironment.java │ │ │ │ ├── AbstractSinkExecuteProcessor.java │ │ │ │ ├── DataStreamTableInfo.java │ │ │ │ ├── FlinkAbstractPluginExecuteProcessor.java │ │ │ │ ├── FlinkExecution.java │ │ │ │ ├── FlinkRuntimeEnvironment.java │ │ │ │ ├── SinkExecuteProcessor.java │ │ │ │ ├── SourceExecuteProcessor.java │ │ │ │ └── TransformExecuteProcessor.java │ │ │ └── utils/ │ │ │ ├── ConfigKeyName.java │ │ │ ├── EnvironmentUtil.java │ │ │ └── TableUtil.java │ │ └── test/ │ │ └── java/ │ │ ├── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── core/ │ │ │ └── starter/ │ │ │ └── flink/ │ │ │ └── TestFlinkParameter.java │ │ └── resources/ │ │ └── test_flink_run_parameter.conf │ ├── seatunnel-spark-starter/ │ │ ├── pom.xml │ │ ├── seatunnel-spark-2-starter/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ ├── bin/ │ │ │ │ │ ├── start-seatunnel-spark-2-connector-v2.cmd │ │ │ │ │ └── start-seatunnel-spark-2-connector-v2.sh │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── core/ │ │ │ │ │ └── starter/ │ │ │ │ │ └── spark/ │ │ │ │ │ ├── SeaTunnelSpark.java │ │ │ │ │ ├── SparkStarter.java │ │ │ │ │ └── execution/ │ │ │ │ │ └── SinkExecuteProcessor.java │ │ │ │ └── resources/ │ │ │ │ └── spark_application.conf │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── core/ │ │ │ └── starter/ │ │ │ └── spark/ │ │ │ ├── SparkStarterTest.java │ │ │ ├── args/ │ │ │ │ └── SparkCommandArgsTest.java │ │ │ └── utils/ │ │ │ └── 
CommandLineUtilsTest.java │ │ ├── seatunnel-spark-3-starter/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── bin/ │ │ │ │ ├── start-seatunnel-spark-3-connector-v2.cmd │ │ │ │ └── start-seatunnel-spark-3-connector-v2.sh │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── core/ │ │ │ │ └── starter/ │ │ │ │ └── spark/ │ │ │ │ ├── SparkCommandArgsTest.java │ │ │ │ └── multitable/ │ │ │ │ └── MultiTableSinkTest.java │ │ │ └── resources/ │ │ │ └── config/ │ │ │ ├── fake_to_inmemory.json │ │ │ ├── fake_to_inmemory_without_pluginname.json │ │ │ ├── inmemory_to_inmemory_multi_table.conf │ │ │ └── source_parallelism_set_2.conf │ │ └── seatunnel-spark-starter-common/ │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── core/ │ │ └── starter/ │ │ └── spark/ │ │ ├── SeaTunnelSpark.java │ │ ├── SparkStarter.java │ │ ├── args/ │ │ │ └── SparkCommandArgs.java │ │ ├── command/ │ │ │ ├── SparkConfValidateCommand.java │ │ │ └── SparkTaskExecuteCommand.java │ │ └── execution/ │ │ ├── SinkExecuteProcessor.java │ │ ├── SourceExecuteProcessor.java │ │ ├── SparkAbstractPluginExecuteProcessor.java │ │ ├── SparkExecution.java │ │ ├── SparkRuntimeEnvironment.java │ │ └── TransformExecuteProcessor.java │ └── seatunnel-starter/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ ├── bin/ │ │ │ ├── seatunnel-cluster.cmd │ │ │ ├── seatunnel-cluster.sh │ │ │ ├── seatunnel-connector.cmd │ │ │ ├── seatunnel-connector.sh │ │ │ ├── seatunnel.cmd │ │ │ ├── seatunnel.sh │ │ │ ├── stop-seatunnel-cluster.cmd │ │ │ └── stop-seatunnel-cluster.sh │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── core/ │ │ │ └── starter/ │ │ │ └── seatunnel/ │ │ │ ├── SeaTunnelClient.java │ │ │ ├── SeaTunnelConnector.java │ │ │ ├── SeaTunnelServer.java │ │ │ ├── args/ │ │ │ │ ├── ClientCommandArgs.java │ │ │ │ ├── ConnectorCheckCommandArgs.java │ │ │ │ └── ServerCommandArgs.java │ │ │ 
└── command/ │ │ │ ├── ClientExecuteCommand.java │ │ │ ├── ConnectorCheckCommand.java │ │ │ ├── SeaTunnelConfValidateCommand.java │ │ │ └── ServerExecuteCommand.java │ │ └── resources/ │ │ └── log4j2.properties │ └── test/ │ ├── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── core/ │ │ └── starter/ │ │ └── seatunnel/ │ │ ├── SeaTunnelClientOOMTest.java │ │ ├── args/ │ │ │ ├── ClientCommandArgsTest.java │ │ │ └── ConnectorCheckCommandArgsTest.java │ │ ├── command/ │ │ │ ├── ConnectorCheckCommandTest.java │ │ │ └── ServerExecuteCommandTest.java │ │ └── multitable/ │ │ └── MultiTableSinkTest.java │ └── resources/ │ ├── args/ │ │ └── user_defined_params.conf │ └── config/ │ ├── fake_to_inmemory.json │ ├── fake_to_inmemory_oom.json │ ├── fake_to_inmemory_without_pluginname.json │ └── inmemory_to_inmemory_multi_table.conf ├── seatunnel-dist/ │ ├── pom.xml │ ├── release-docs/ │ │ ├── LICENSE │ │ ├── NOTICE │ │ └── licenses/ │ │ ├── LICENSE-accessors-smart.txt │ │ ├── LICENSE-animal-sniffer-annotations.txt │ │ ├── LICENSE-asm.txt │ │ ├── LICENSE-avro.txt │ │ ├── LICENSE-checker-qual.txt │ │ ├── LICENSE-codec-commons-codec.txt │ │ ├── LICENSE-commons-beanutils.txt │ │ ├── LICENSE-commons-cli.txt │ │ ├── LICENSE-commons-collections.txt │ │ ├── LICENSE-commons-compress.txt │ │ ├── LICENSE-commons-configuration2.txt │ │ ├── LICENSE-commons-io.txt │ │ ├── LICENSE-commons-lang.txt │ │ ├── LICENSE-commons-lang3.txt │ │ ├── LICENSE-commons-math3.txt │ │ ├── LICENSE-commons-net.txt │ │ ├── LICENSE-connons-math.txt │ │ ├── LICENSE-curator-client.txt │ │ ├── LICENSE-curator-framework.txt │ │ ├── LICENSE-curator-recipes.txt │ │ ├── LICENSE-error-prone-annotations.txt │ │ ├── LICENSE-findbugs-jsr305.txt │ │ ├── LICENSE-gson.txt │ │ ├── LICENSE-guava.txt │ │ ├── LICENSE-hadoop-annotations.txt │ │ ├── LICENSE-hadoop-auth.txt │ │ ├── LICENSE-hadoop-client.txt │ │ ├── LICENSE-hadoop-common.txt │ │ ├── LICENSE-hadoop-hdfs-client.txt │ │ ├── 
LICENSE-hadoop-mapreduce-client-common.txt │ │ ├── LICENSE-hadoop-mapreduce-client-core.txt │ │ ├── LICENSE-hadoop-yarn-api.txt │ │ ├── LICENSE-hadoop-yarn-client.txt │ │ ├── LICENSE-hadoop-yarn-common.txt │ │ ├── LICENSE-htrace-core4.txt │ │ ├── LICENSE-httpclient.txt │ │ ├── LICENSE-hugegraph-client.txt │ │ ├── LICENSE-j2objc-annotations.txt │ │ ├── LICENSE-jackson-annotations.txt │ │ ├── LICENSE-jackson-core-asl.txt │ │ ├── LICENSE-jackson-core.txt │ │ ├── LICENSE-jackson-databind.txt │ │ ├── LICENSE-jackson-mapper-asl.txt │ │ ├── LICENSE-javax-annootation-api.txt │ │ ├── LICENSE-javax.servlet-api.txt │ │ ├── LICENSE-jaxb-api.txt │ │ ├── LICENSE-jcip-annotations.txt │ │ ├── LICENSE-jersey-client.txt │ │ ├── LICENSE-jersey-core.txt │ │ ├── LICENSE-jersey-servlet.txt │ │ ├── LICENSE-jetty-security.txt │ │ ├── LICENSE-jetty-servlet.txt │ │ ├── LICENSE-jetty-util.txt │ │ ├── LICENSE-jetty-webapp.txt │ │ ├── LICENSE-jetty-xml.txt │ │ ├── LICENSE-jose-jwt.txt │ │ ├── LICENSE-json-smart.txt │ │ ├── LICENSE-jsr311-api.txt │ │ ├── LICENSE-kerb-admin.txt │ │ ├── LICENSE-kerb-client.txt │ │ ├── LICENSE-kerb-common.txt │ │ ├── LICENSE-kerb-core.txt │ │ ├── LICENSE-kerb-crypto.txt │ │ ├── LICENSE-kerb-identity.txt │ │ ├── LICENSE-kerb-server.txt │ │ ├── LICENSE-kerb-simplekdc.txt │ │ ├── LICENSE-kerb-util.txt │ │ ├── LICENSE-kerby-asn1.txt │ │ ├── LICENSE-kerby-config.txt │ │ ├── LICENSE-kerby-pkix.txt │ │ ├── LICENSE-kerby-util.txt │ │ ├── LICENSE-kerby-xdr.txt │ │ ├── LICENSE-log4j-1.2-api.txt │ │ ├── LICENSE-log4j-api.txt │ │ ├── LICENSE-log4j-core.txt │ │ ├── LICENSE-log4j-slf4j-impl.txt │ │ ├── LICENSE-mapreduce-client-jobclient.txt │ │ ├── LICENSE-orc.txt │ │ ├── LICENSE-parquet-format.txt │ │ ├── LICENSE-parquet-mr.txt │ │ ├── LICENSE-protobuf-java.txt │ │ ├── LICENSE-protobuf.txt │ │ ├── LICENSE-protoc-jar.txt │ │ ├── LICENSE-re2j.txt │ │ ├── LICENSE-scala.txt │ │ ├── LICENSE-sjf4j.txt │ │ ├── LICENSE-snappy-java.txt │ │ ├── LICENSE-spark.txt │ │ ├── 
LICENSE-stax2-api.txt │ │ ├── LICENSE-token-provider.txt │ │ ├── LICENSE-woodstox-core.txt │ │ ├── LICENSE-xz.txt │ │ └── LICENSE-yetus.txt │ └── src/ │ ├── main/ │ │ ├── assembly/ │ │ │ ├── assembly-bin-ci.xml │ │ │ ├── assembly-bin.xml │ │ │ └── assembly-src.xml │ │ └── docker/ │ │ └── Dockerfile │ └── test/ │ └── java/ │ └── org/ │ └── apache/ │ └── seatunnel/ │ └── api/ │ └── connector/ │ ├── ConnectorSpecificationCheckTest.java │ └── TransformSpecificationCheckTest.java ├── seatunnel-e2e/ │ ├── pom.xml │ ├── seatunnel-connector-v2-e2e/ │ │ ├── connector-activemq-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── activemq/ │ │ │ │ └── ActivemqIT.java │ │ │ └── resources/ │ │ │ ├── e2e.json │ │ │ ├── fake_source_to_sink.conf │ │ │ └── localfile_source_to_sink.conf │ │ ├── connector-aerospike-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── aerospike/ │ │ │ │ ├── AbstractAerospikeIT.java │ │ │ │ ├── Aerospike6IT.java │ │ │ │ └── AerospikeContainerInfo.java │ │ │ └── resources/ │ │ │ └── fake_to_aerospike_sink.conf │ │ ├── connector-amazondynamodb-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── amazondynamodb/ │ │ │ │ └── AmazondynamodbIT.java │ │ │ └── resources/ │ │ │ └── amazondynamodbIT_source_to_sink.conf │ │ ├── connector-amazonsqs-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── amazonsqs/ │ │ │ │ └── AmazonsqsIT.java │ │ │ └── resources/ │ │ │ └── amazonsqsIT_source_to_sink.conf │ │ ├── connector-assert-e2e/ │ │ │ ├── pom.xml │ │ │ └── 
src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── assertion/ │ │ │ │ └── FakeSourceToAssertIT.java │ │ │ └── resources/ │ │ │ └── assertion/ │ │ │ ├── fake_full_types_to_assert.conf │ │ │ ├── fake_row_to_assert.conf │ │ │ ├── fakesource_to_assert.conf │ │ │ └── fakesource_to_multi_table_assert.conf │ │ ├── connector-cassandra-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── cassandra/ │ │ │ │ └── CassandraIT.java │ │ │ └── resources/ │ │ │ ├── application.conf │ │ │ ├── cassandra_to_cassandra.conf │ │ │ └── init/ │ │ │ └── cassandra_init.conf │ │ ├── connector-cdc-mongodb-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── mongodb/ │ │ │ │ ├── MongoDBContainer.java │ │ │ │ ├── MongodbCDCIT.java │ │ │ │ └── MongodbCDCMultiSourceIT.java │ │ │ └── resources/ │ │ │ ├── ddl/ │ │ │ │ ├── inventory.js │ │ │ │ ├── inventoryClean.js │ │ │ │ ├── inventoryDDL.js │ │ │ │ └── mongodb_cdc.sql │ │ │ ├── docker/ │ │ │ │ └── mongodb/ │ │ │ │ ├── random.key │ │ │ │ └── setup.js │ │ │ ├── log4j2-test.properties │ │ │ ├── mongodb_multi_source_a.conf │ │ │ ├── mongodb_multi_source_b.conf │ │ │ ├── mongodb_multi_table_cdc_to_mysql.conf │ │ │ ├── mongodbcdc_metadata_trans.conf │ │ │ ├── mongodbcdc_to_mysql.conf │ │ │ └── mongodbcdc_to_mysql_orders.conf │ │ ├── connector-cdc-mysql-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── cdc/ │ │ │ │ └── mysql/ │ │ │ │ ├── AbstractMysqlCDCITBase.java │ │ │ │ ├── Mysql8_4CDCIT.java │ │ │ │ ├── MysqlCDCIT.java │ │ │ │ ├── MysqlCDCSpecificStartingOffsetIT.java │ │ │ │ ├── MysqlCDCWithBinlogDeleteIT.java │ │ │ │ ├── 
MysqlCDCWithFlinkSchemaChangeIT.java │ │ │ │ └── MysqlCDCWithSchemaChangeIT.java │ │ │ └── resources/ │ │ │ ├── ddl/ │ │ │ │ ├── add_columns.sql │ │ │ │ ├── change_columns.sql │ │ │ │ ├── drop_columns.sql │ │ │ │ ├── inventory.sql │ │ │ │ ├── modify_columns.sql │ │ │ │ ├── mysql_cdc.sql │ │ │ │ ├── rename_columns.sql │ │ │ │ ├── shop.sql │ │ │ │ ├── wildcards.sql │ │ │ │ └── wildcards_dml.sql │ │ │ ├── docker/ │ │ │ │ ├── server-gtids/ │ │ │ │ │ ├── my.cnf │ │ │ │ │ └── my8-4.cnf │ │ │ │ └── setup.sql │ │ │ ├── log4j2-test.properties │ │ │ ├── mysqlcdc_earliest_offset.conf │ │ │ ├── mysqlcdc_specific_offset.conf │ │ │ ├── mysqlcdc_timestamp_offset.conf │ │ │ ├── mysqlcdc_to_metadata_trans.conf │ │ │ ├── mysqlcdc_to_mysql.conf │ │ │ ├── mysqlcdc_to_mysql_with_binlog_delete.conf │ │ │ ├── mysqlcdc_to_mysql_with_custom_primary_key.conf │ │ │ ├── mysqlcdc_to_mysql_with_disable_exactly_once.conf │ │ │ ├── mysqlcdc_to_mysql_with_flink_schema_change.conf │ │ │ ├── mysqlcdc_to_mysql_with_heartbeat.conf │ │ │ ├── mysqlcdc_to_mysql_with_multi_table_mode_one_table.conf │ │ │ ├── mysqlcdc_to_mysql_with_multi_table_mode_two_table.conf │ │ │ ├── mysqlcdc_to_mysql_with_no_primary_key.conf │ │ │ ├── mysqlcdc_to_mysql_with_schema_change.conf │ │ │ ├── mysqlcdc_to_mysql_with_schema_change_exactly_once.conf │ │ │ └── mysqlcdc_wildcards_to_mysql.conf │ │ ├── connector-cdc-opengauss-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── cdc/ │ │ │ │ └── postgres/ │ │ │ │ └── OpengaussCDCIT.java │ │ │ └── resources/ │ │ │ ├── ddl/ │ │ │ │ └── inventory.sql │ │ │ ├── opengausscdc_to_meatadata_trans.conf │ │ │ ├── opengausscdc_to_opengauss.conf │ │ │ ├── opengausscdc_to_opengauss_test_add_Filed.conf │ │ │ ├── opengausscdc_to_opengauss_with_custom_primary_key.conf │ │ │ ├── opengausscdc_to_opengauss_with_multi_table_mode_one_table.conf │ │ │ ├── 
opengausscdc_to_opengauss_with_multi_table_mode_two_table.conf │ │ │ └── opengausscdc_to_opengauss_with_no_primary_key.conf │ │ ├── connector-cdc-oracle-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── cdc/ │ │ │ │ └── oracle/ │ │ │ │ ├── AbstractOracleCDCIT.java │ │ │ │ ├── OracleCDCIT.java │ │ │ │ ├── OracleCDCWithSchemaChangeIT.java │ │ │ │ └── OracleContainer.java │ │ │ └── resources/ │ │ │ ├── ddl/ │ │ │ │ ├── add_columns.sql │ │ │ │ ├── column_type_test.sql │ │ │ │ ├── drop_columns.sql │ │ │ │ ├── full_types.sql │ │ │ │ ├── modify_columns.sql │ │ │ │ └── rename_columns.sql │ │ │ ├── docker/ │ │ │ │ ├── server-gtids/ │ │ │ │ │ └── my.cnf │ │ │ │ └── setup.sql │ │ │ ├── log4j2-test.properties │ │ │ ├── oraclecdc_to_metadata_trans.conf │ │ │ ├── oraclecdc_to_mysql_with_schema_change.conf │ │ │ ├── oraclecdc_to_oracle.conf │ │ │ ├── oraclecdc_to_oracle_skip_analysis.conf │ │ │ ├── oraclecdc_to_oracle_timestamp.conf │ │ │ ├── oraclecdc_to_oracle_use_select_count.conf │ │ │ ├── oraclecdc_to_oracle_with_custom_primary_key.conf │ │ │ ├── oraclecdc_to_oracle_with_heartbeat.conf │ │ │ ├── oraclecdc_to_oracle_with_multi_table_mode_one_table.conf │ │ │ ├── oraclecdc_to_oracle_with_multi_table_mode_two_table.conf │ │ │ ├── oraclecdc_to_oracle_with_no_primary_key.conf │ │ │ ├── oraclecdc_to_oracle_with_partition.conf │ │ │ ├── oraclecdc_to_oracle_with_schema_change.conf │ │ │ └── oraclecdc_to_oracle_with_schema_change_exactly_once.conf │ │ ├── connector-cdc-postgres-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── cdc/ │ │ │ │ └── postgres/ │ │ │ │ └── PostgresCDCIT.java │ │ │ └── resources/ │ │ │ ├── ddl/ │ │ │ │ └── inventory.sql │ │ │ ├── pgcdc_to_pg_with_multi_table_mode_one_table.conf │ │ │ 
├── pgcdc_to_pg_with_multi_table_mode_two_table.conf │ │ │ ├── postgrescdc_to_metadata_trans.conf │ │ │ ├── postgrescdc_to_postgres.conf │ │ │ ├── postgrescdc_to_postgres_test_add_Filed.conf │ │ │ ├── postgrescdc_to_postgres_with_custom_primary_key.conf │ │ │ ├── postgrescdc_to_postgres_with_debezium_to_kafka.conf │ │ │ ├── postgrescdc_to_postgres_with_heartbeat.conf │ │ │ ├── postgrescdc_to_postgres_with_interval_data_type.conf │ │ │ ├── postgrescdc_to_postgres_with_network_address_types.conf │ │ │ └── postgrescdc_to_postgres_with_no_primary_key.conf │ │ ├── connector-cdc-sqlserver-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── cdc/ │ │ │ │ └── sqlserver/ │ │ │ │ └── SqlServerCDCIT.java │ │ │ └── resources/ │ │ │ ├── container-license-acceptance.txt │ │ │ ├── ddl/ │ │ │ │ ├── column_type_test.sql │ │ │ │ └── test_db_name.sql │ │ │ ├── sqlservercdc_earliest_to_sqlserver.conf │ │ │ ├── sqlservercdc_special_db_name.conf │ │ │ ├── sqlservercdc_to_console.conf │ │ │ ├── sqlservercdc_to_console_with_heartbeat.conf │ │ │ ├── sqlservercdc_to_metadata_trans.conf │ │ │ ├── sqlservercdc_to_sqlserver_timestamp.conf │ │ │ ├── sqlservercdc_to_sqlserver_with_custom_primary_key.conf │ │ │ └── sqlservercdc_to_sqlserver_with_no_primary_key.conf │ │ ├── connector-cdc-tidb-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── tidb/ │ │ │ │ ├── TiDBCDCIT.java │ │ │ │ └── TiDBTestBase.java │ │ │ └── resources/ │ │ │ ├── config/ │ │ │ │ ├── pd.toml │ │ │ │ ├── tidb.toml │ │ │ │ └── tikv.toml │ │ │ ├── ddl/ │ │ │ │ └── tidb_cdc.sql │ │ │ └── tidb/ │ │ │ ├── tidbcdc_to_tidb.conf │ │ │ ├── tidbcdc_to_tidb_with_disable_exactly_once.conf │ │ │ └── tidbcdc_to_tidb_with_no_primary_key.conf │ │ ├── connector-clickhouse-e2e/ │ │ │ ├── 
pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── clickhouse/ │ │ │ │ ├── ClickhouseIT.java │ │ │ │ └── ClickhouseSinkCDCChangelogIT.java │ │ │ └── resources/ │ │ │ ├── clickhouse_auto_create_with_special_comments.conf │ │ │ ├── clickhouse_sink_cdc_changelog_case1.conf │ │ │ ├── clickhouse_sink_cdc_changelog_case2.conf │ │ │ ├── clickhouse_sink_cdc_changelog_log_engine.conf │ │ │ ├── clickhouse_to_clickhouse.conf │ │ │ ├── clickhouse_to_console.conf │ │ │ ├── clickhouse_with_create_schema_when_comment.conf │ │ │ ├── clickhouse_with_create_schema_when_not_exist.conf │ │ │ ├── clickhouse_with_create_schema_when_not_exist_and_drop_data.conf │ │ │ ├── clickhouse_with_error_when_data_exists.conf │ │ │ ├── clickhouse_with_error_when_schema_not_exist.conf │ │ │ ├── clickhouse_with_join_complex_sql.conf │ │ │ ├── clickhouse_with_multi_table_source.conf │ │ │ ├── clickhouse_with_parallelism_add_filter_query.conf │ │ │ ├── clickhouse_with_parallelism_add_partition_list.conf │ │ │ ├── clickhouse_with_parallelism_read.conf │ │ │ ├── clickhouse_with_recreate_schema_and_append_data.conf │ │ │ ├── clickhouse_with_recreate_schema_and_custom.conf │ │ │ ├── clickhouse_with_sql_and_filter_query.conf │ │ │ ├── fake_to_clickhouse.conf │ │ │ ├── fake_to_clickhouse_with_multi_table.conf │ │ │ └── init/ │ │ │ └── clickhouse_init.conf │ │ ├── connector-databend-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── databend/ │ │ │ │ ├── DatabendCDCSinkIT.java │ │ │ │ ├── DatabendIT.java │ │ │ │ └── DatabendTestUtils.java │ │ │ └── resources/ │ │ │ └── databend/ │ │ │ ├── databend_init.conf │ │ │ ├── databend_schema_evolution.conf │ │ │ ├── databend_sink.conf │ │ │ ├── databend_source.conf │ │ │ ├── databend_to_console.conf │ │ │ ├── 
databend_to_databend.conf │ │ │ ├── fake_to_databend.conf │ │ │ └── fake_to_databend_cdc.conf │ │ ├── connector-datahub-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── datahub/ │ │ │ │ └── DatahubIT.java │ │ │ └── resources/ │ │ │ ├── fakesource_to_datahub.conf │ │ │ └── fakesource_to_multi_datahub.conf │ │ ├── connector-doris-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── doris/ │ │ │ │ ├── AbstractDorisIT.java │ │ │ │ ├── DorisCDCSinkIT.java │ │ │ │ ├── DorisCatalogIT.java │ │ │ │ ├── DorisErrorIT.java │ │ │ │ ├── DorisIT.java │ │ │ │ ├── DorisMultiReadIT.java │ │ │ │ ├── DorisSchemaChangeIT.java │ │ │ │ └── GenerateTestData.java │ │ │ └── resources/ │ │ │ ├── ddl/ │ │ │ │ ├── add_columns.sql │ │ │ │ ├── change_columns.sql │ │ │ │ ├── drop_columns.sql │ │ │ │ ├── modify_columns.sql │ │ │ │ ├── mysql_cdc.sql │ │ │ │ └── shop.sql │ │ │ ├── docker/ │ │ │ │ ├── server-gtids/ │ │ │ │ │ └── my.cnf │ │ │ │ └── setup.sql │ │ │ ├── doris_multi_source_to_assert.conf │ │ │ ├── doris_multi_source_to_sink.conf │ │ │ ├── doris_multi_source_to_sink_2pc_false.conf │ │ │ ├── doris_source_and_sink.conf │ │ │ ├── doris_source_and_sink_2pc_false.conf │ │ │ ├── doris_source_and_sink_with_custom_sql.conf │ │ │ ├── doris_source_no_schema.conf │ │ │ ├── doris_source_to_doris_sink_type_convertor.conf │ │ │ ├── fake_source_and_doris_sink_timeout_error.conf │ │ │ ├── mysqlcdc_to_doris_with_schema_change.conf │ │ │ └── write-cdc-changelog-to-doris.conf │ │ ├── connector-druid-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── druid/ │ │ │ │ └── DruidIT.java │ │ │ └── resources/ │ │ │ ├── 
docker-compose.yml │ │ │ ├── environment │ │ │ ├── fakesource_to_druid.conf │ │ │ └── fakesource_to_druid_with_multi.conf │ │ ├── connector-easysearch-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── easysearch/ │ │ │ │ └── EasysearchIT.java │ │ │ └── resources/ │ │ │ └── easysearch/ │ │ │ ├── easysearch_source_and_sink.conf │ │ │ └── easysearch_source_and_sink_with_save_mode.conf │ │ ├── connector-elasticsearch-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── elasticsearch/ │ │ │ │ ├── ElasticsearchAuthIT.java │ │ │ │ ├── ElasticsearchIT.java │ │ │ │ └── ElasticsearchSchemaChangeIT.java │ │ │ └── resources/ │ │ │ ├── ddl/ │ │ │ │ ├── add_columns.sql │ │ │ │ └── shop.sql │ │ │ ├── docker/ │ │ │ │ ├── server-gtids/ │ │ │ │ │ └── my.cnf │ │ │ │ └── setup.sql │ │ │ └── elasticsearch/ │ │ │ ├── elasticsearch_multi_source_and_sink_by_filter.conf │ │ │ ├── elasticsearch_source_and_sink.conf │ │ │ ├── elasticsearch_source_and_sink_full_type.conf │ │ │ ├── elasticsearch_source_and_sink_with_nest.conf │ │ │ ├── elasticsearch_source_with_pit.conf │ │ │ ├── elasticsearch_source_with_runtime_fields.conf │ │ │ ├── elasticsearch_source_with_sql.conf │ │ │ ├── elasticsearch_source_without_schema_and_sink.conf │ │ │ ├── fake-to-elasticsearch-vector.conf │ │ │ ├── fakesource_to_elasticsearch_multi_sink.conf │ │ │ ├── fakesource_to_elasticsearch_with_upper_case_index.conf │ │ │ ├── mysqlcdc_to_elasticsearch_with_schema_change.conf │ │ │ ├── st_index_full_type_data.json │ │ │ ├── st_index_full_type_mapping.json │ │ │ ├── st_index_nest_data.json │ │ │ ├── st_index_nest_mapping.json │ │ │ ├── st_index_source_without_schema_and_sink.json │ │ │ └── st_index_with_sql.json │ │ ├── connector-email-e2e/ │ │ │ ├── pom.xml │ │ │ └── 
src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── email/ │ │ │ │ └── EmailWithMultiIT.java │ │ │ └── resources/ │ │ │ ├── fake_to_email.conf │ │ │ ├── fake_to_email_test.conf │ │ │ └── fake_to_multiemailsink.conf │ │ ├── connector-fake-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── fake/ │ │ │ │ ├── FakeIT.java │ │ │ │ ├── FakeSqlConfIT.java │ │ │ │ ├── FakeWithMultiTableTT.java │ │ │ │ ├── FakeWithSchemaTT.java │ │ │ │ ├── FakeWithTableNamesTT.java │ │ │ │ └── FlinkMetricsIT.java │ │ │ └── resources/ │ │ │ ├── fake_generic_row_type_to_assert.conf │ │ │ ├── fake_to_assert.conf │ │ │ ├── fake_to_assert.sql │ │ │ ├── fake_to_assert_verify_flink_metrics.conf │ │ │ ├── fake_to_assert_with_catalogtable.conf │ │ │ ├── fake_to_assert_with_compatible_source_and_result_table_name.conf │ │ │ ├── fake_to_assert_with_multitable_exception.conf │ │ │ ├── fake_to_assert_with_range.conf │ │ │ ├── fake_to_assert_with_tablenames.conf │ │ │ ├── fake_to_assert_with_template.conf │ │ │ └── fake_to_console_with_multitable_mode.conf │ │ ├── connector-file-cos-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── file/ │ │ │ │ └── cos/ │ │ │ │ └── CosFileIT.java │ │ │ └── resources/ │ │ │ ├── excel/ │ │ │ │ ├── cos_excel_to_assert.conf │ │ │ │ └── fake_to_cos_excel.conf │ │ │ ├── json/ │ │ │ │ ├── cos_file_json_to_assert.conf │ │ │ │ └── fake_to_cos_file_json.conf │ │ │ ├── orc/ │ │ │ │ ├── cos_file_orc_to_assert.conf │ │ │ │ └── fake_to_cos_file_orc.conf │ │ │ ├── parquet/ │ │ │ │ ├── cos_file_parquet_to_assert.conf │ │ │ │ └── fake_to_cos_file_parquet.conf │ │ │ └── text/ │ │ │ ├── cos_file_text_to_assert.conf │ │ │ └── 
fake_to_cos_file_text.conf │ │ ├── connector-file-ftp-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── file/ │ │ │ │ └── ftp/ │ │ │ │ └── FtpFileIT.java │ │ │ └── resources/ │ │ │ ├── excel/ │ │ │ │ ├── e2e.xlsx │ │ │ │ ├── fake_source_to_ftp_excel.conf │ │ │ │ ├── fake_source_to_ftp_root_path_excel.conf │ │ │ │ ├── ftp_excel_projection_to_assert.conf │ │ │ │ ├── ftp_excel_to_assert.conf │ │ │ │ └── ftp_filter_excel_to_assert.conf │ │ │ ├── json/ │ │ │ │ ├── e2e.json │ │ │ │ ├── fake_to_ftp_file_json.conf │ │ │ │ ├── ftp_file_json_to_assert.conf │ │ │ │ ├── ftp_file_json_to_assert_with_multipletable.conf │ │ │ │ ├── ftp_to_access_for_json_name_filter.conf │ │ │ │ └── ftp_to_access_for_json_path_filter.conf │ │ │ ├── orc/ │ │ │ │ └── fake_to_ftp_file_orc.conf │ │ │ ├── parquet/ │ │ │ │ └── fake_to_ftp_file_parquet.conf │ │ │ └── text/ │ │ │ ├── e2e.txt │ │ │ ├── fake_to_ftp_file_text.conf │ │ │ ├── fake_to_ftp_file_text_for_passive.conf │ │ │ ├── fake_to_ftp_file_text_no_verify.conf │ │ │ ├── ftp_binary_update_distcp.conf │ │ │ ├── ftp_file_text_projection_to_assert.conf │ │ │ ├── ftp_file_text_skip_headers.conf │ │ │ ├── ftp_file_text_to_assert.conf │ │ │ ├── ftp_file_text_to_assert_for_passive.conf │ │ │ ├── ftp_file_zip_text_to_assert.conf │ │ │ ├── ftp_special_characters_path_to_assert.conf │ │ │ ├── ftp_to_ftp_for_binary.conf │ │ │ ├── multiple_table_fake_to_ftp_file_text.conf │ │ │ └── multiple_table_fake_to_ftp_file_text_2.conf │ │ ├── connector-file-hadoop-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── file/ │ │ │ │ └── hdfs/ │ │ │ │ ├── HdfsFileIT.java │ │ │ │ └── HdfsFileViewFsIT.java │ │ │ └── resources/ │ │ │ ├── fake_to_hdfs_normal.conf │ │ │ ├── fake_to_hdfs_viewfs.conf │ │ │ ├── 
hdfs_binary_update_distcp.conf │ │ │ ├── hdfs_binary_update_strict_checksum.conf │ │ │ ├── hdfs_empty_text_to_assert.conf │ │ │ ├── hdfs_normal_to_assert.conf │ │ │ ├── hdfs_parquet_split_to_assert.conf │ │ │ ├── hdfs_text_split_to_assert.conf │ │ │ ├── hdfs_viewfs_to_assert.conf │ │ │ └── viewfs/ │ │ │ ├── cluster1/ │ │ │ │ ├── core-site.xml │ │ │ │ └── hdfs-site.xml │ │ │ ├── cluster2/ │ │ │ │ ├── core-site.xml │ │ │ │ └── hdfs-site.xml │ │ │ └── core-site.xml │ │ ├── connector-file-local-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── file/ │ │ │ │ └── local/ │ │ │ │ ├── LocalFileIT.java │ │ │ │ ├── LocalFileWithMetaLakeIT.java │ │ │ │ └── LocalFileWithMultipleTableIT.java │ │ │ └── resources/ │ │ │ ├── binary/ │ │ │ │ ├── local_file_binary_to_assert.conf │ │ │ │ ├── local_file_binary_to_local_file_binary.conf │ │ │ │ ├── local_file_binary_to_local_file_binary_with_multipletable.conf │ │ │ │ ├── local_file_binary_update_distcp.conf │ │ │ │ └── local_file_binary_update_strict_checksum.conf │ │ │ ├── csv/ │ │ │ │ ├── break_line.csv │ │ │ │ ├── breakline_csv_to_assert.conf │ │ │ │ ├── csv_with_header1.csv │ │ │ │ ├── csv_with_header2.csv │ │ │ │ ├── csv_with_header_to_assert.conf │ │ │ │ ├── data/ │ │ │ │ │ ├── table1.csv │ │ │ │ │ └── table2.csv │ │ │ │ ├── fake_to_local_csv.conf │ │ │ │ ├── local_csv_enable_split_to_assert.conf │ │ │ │ ├── local_csv_to_assert.conf │ │ │ │ └── local_file_csv_to_local_file_csv_with_metalake.conf │ │ │ ├── excel/ │ │ │ │ ├── e2e.xls │ │ │ │ ├── e2e.xlsx │ │ │ │ ├── fake_to_local_excel.conf │ │ │ │ ├── local_excel_multi_zip_to_assert.conf │ │ │ │ ├── local_excel_projection_to_assert.conf │ │ │ │ ├── local_excel_to_assert.conf │ │ │ │ ├── local_excel_to_assert_with_multipletable.conf │ │ │ │ ├── local_excel_xls_gz_to_assert.conf │ │ │ │ ├── local_excel_xlsx_gz_to_assert.conf │ │ │ │ ├── 
local_excel_zip_to_assert.conf │ │ │ │ ├── local_filter_excel_to_assert.conf │ │ │ │ ├── local_filter_regex_excel_to_assert.conf │ │ │ │ ├── special_excel.xlsx │ │ │ │ └── special_excel_to_assert.conf │ │ │ ├── json/ │ │ │ │ ├── e2e.json │ │ │ │ ├── e2e_gbk.json │ │ │ │ ├── fake_to_local_file_json.conf │ │ │ │ ├── fake_to_local_file_json_save_mode.conf │ │ │ │ ├── fake_to_local_file_json_with_encoding.conf │ │ │ │ ├── local_file_json_enable_split_to_assert.conf │ │ │ │ ├── local_file_json_gz_to_assert.conf │ │ │ │ ├── local_file_json_lzo_to_console.conf │ │ │ │ ├── local_file_json_multi_zip_to_assert.conf │ │ │ │ ├── local_file_json_to_assert.conf │ │ │ │ ├── local_file_json_to_assert_with_multipletable.conf │ │ │ │ ├── local_file_json_to_console_with_encoding.conf │ │ │ │ ├── local_file_json_zip_to_assert.conf │ │ │ │ └── local_file_to_console.conf │ │ │ ├── orc/ │ │ │ │ ├── e2e.orc │ │ │ │ ├── fake_to_local_file_orc.conf │ │ │ │ ├── local_file_orc_projection_to_assert.conf │ │ │ │ ├── local_file_orc_to_assert.conf │ │ │ │ ├── local_file_orc_to_assert_with_multipletable.conf │ │ │ │ ├── local_file_orc_to_assert_with_time_and_cast.conf │ │ │ │ └── orc_for_cast.orc │ │ │ ├── parquet/ │ │ │ │ ├── e2e.parquet │ │ │ │ ├── fake_to_local_file_parquet.conf │ │ │ │ ├── local_file_parquet_enable_split_to_assert.conf │ │ │ │ ├── local_file_parquet_projection_to_assert.conf │ │ │ │ ├── local_file_parquet_to_assert.conf │ │ │ │ ├── local_file_parquet_to_assert_with_multipletable.conf │ │ │ │ └── local_file_to_console.conf │ │ │ ├── text/ │ │ │ │ ├── e2e.txt │ │ │ │ ├── e2e_delimiter.txt │ │ │ │ ├── e2e_gbk.txt │ │ │ │ ├── e2e_null_format.txt │ │ │ │ ├── e2e_time_format.txt │ │ │ │ ├── fake_to_local_file_text.conf │ │ │ │ ├── fake_to_local_file_with_encoding.conf │ │ │ │ ├── fake_to_local_file_with_multiple_table.conf │ │ │ │ ├── local_file_delimiter_assert.conf │ │ │ │ ├── local_file_gz_text_to_assert.conf │ │ │ │ ├── local_file_multi_tar_gz_text_to_assert.conf │ │ │ │ ├── 
local_file_multi_tar_text_to_assert.conf │ │ │ │ ├── local_file_multi_zip_text_to_assert.conf │ │ │ │ ├── local_file_null_format_assert.conf │ │ │ │ ├── local_file_tar_gz_text_to_assert.conf │ │ │ │ ├── local_file_tar_text_to_assert.conf │ │ │ │ ├── local_file_text_enable_split_to_assert.conf │ │ │ │ ├── local_file_text_lzo_to_assert.conf │ │ │ │ ├── local_file_text_projection_to_assert.conf │ │ │ │ ├── local_file_text_skip_headers.conf │ │ │ │ ├── local_file_text_to_assert.conf │ │ │ │ ├── local_file_text_to_assert_with_multipletable.conf │ │ │ │ ├── local_file_text_to_console_with_encoding.conf │ │ │ │ ├── local_file_time_format_assert.conf │ │ │ │ ├── local_file_to_local_file_with_metalake.conf │ │ │ │ └── local_file_zip_text_to_assert.conf │ │ │ └── xml/ │ │ │ ├── e2e.xml │ │ │ ├── local_file_gz_xml_to_assert.conf │ │ │ ├── local_file_xml_to_assert.conf │ │ │ └── local_file_zip_xml_to_assert.conf │ │ ├── connector-file-obs-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── file/ │ │ │ │ └── obs/ │ │ │ │ └── ObsFileIT.java │ │ │ └── resources/ │ │ │ ├── csv/ │ │ │ │ ├── fake_to_obs_csv.conf │ │ │ │ ├── obs_csv_projection_to_assert.conf │ │ │ │ └── obs_csv_to_assert.conf │ │ │ ├── excel/ │ │ │ │ ├── fake_to_obs_excel.conf │ │ │ │ ├── obs_excel_projection_to_assert.conf │ │ │ │ └── obs_excel_to_assert.conf │ │ │ ├── json/ │ │ │ │ ├── fake_to_obs_file_json.conf │ │ │ │ └── obs_file_json_to_assert.conf │ │ │ ├── orc/ │ │ │ │ ├── fake_to_obs_file_orc.conf │ │ │ │ ├── obs_file_orc_projection_to_assert.conf │ │ │ │ └── obs_file_orc_to_assert.conf │ │ │ ├── parquet/ │ │ │ │ ├── fake_to_obs_file_parquet.conf │ │ │ │ ├── obs_file_parquet_projection_to_assert.conf │ │ │ │ └── obs_file_parquet_to_assert.conf │ │ │ └── text/ │ │ │ ├── fake_to_obs_file_text.conf │ │ │ ├── obs_file_text_projection_to_assert.conf │ │ │ ├── 
obs_file_text_skip_headers.conf │ │ │ └── obs_file_text_to_assert.conf │ │ ├── connector-file-oss-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── file/ │ │ │ │ └── oss/ │ │ │ │ ├── OssFileIT.java │ │ │ │ ├── OssFileWithMultipleTableIT.java │ │ │ │ └── OssUtils.java │ │ │ └── resources/ │ │ │ ├── excel/ │ │ │ │ ├── e2e.xlsx │ │ │ │ ├── fake_to_oss_excel.conf │ │ │ │ ├── oss_excel_projection_to_assert.conf │ │ │ │ ├── oss_excel_to_assert.conf │ │ │ │ ├── oss_excel_to_assert_with_multipletable.conf │ │ │ │ └── oss_filter_excel_to_assert.conf │ │ │ ├── json/ │ │ │ │ ├── e2e.json │ │ │ │ ├── e2e.json.lzo │ │ │ │ ├── fake_to_oss_file_json.conf │ │ │ │ ├── oss_file_json_lzo_to_console.conf │ │ │ │ ├── oss_file_json_to_assert.conf │ │ │ │ ├── oss_file_json_to_assert_with_multipletable.conf │ │ │ │ ├── oss_file_to_console.conf │ │ │ │ ├── oss_to_access_for_json_name_filter.conf │ │ │ │ └── oss_to_access_for_json_path_filter.conf │ │ │ ├── orc/ │ │ │ │ ├── e2e.orc │ │ │ │ ├── fake_to_oss_file_orc.conf │ │ │ │ ├── oss_file_orc_projection_to_assert.conf │ │ │ │ ├── oss_file_orc_to_assert.conf │ │ │ │ └── oss_file_orc_to_assert_with_multipletable.conf │ │ │ ├── parquet/ │ │ │ │ ├── e2e.parquet │ │ │ │ ├── fake_to_oss_file_parquet.conf │ │ │ │ ├── oss_file_parquet_projection_to_assert.conf │ │ │ │ ├── oss_file_parquet_to_assert.conf │ │ │ │ ├── oss_file_parquet_to_assert_with_multipletable.conf │ │ │ │ └── oss_file_to_console.conf │ │ │ └── text/ │ │ │ ├── e2e.txt │ │ │ ├── e2e.txt.lzo │ │ │ ├── e2e_delimiter.txt │ │ │ ├── e2e_time_format.txt │ │ │ ├── fake_to_oss_file_text.conf │ │ │ ├── fake_to_oss_file_with_multiple_table.conf │ │ │ ├── oss_file_delimiter_assert.conf │ │ │ ├── oss_file_text_lzo_to_assert.conf │ │ │ ├── oss_file_text_projection_to_assert.conf │ │ │ ├── oss_file_text_skip_headers.conf │ │ │ ├── oss_file_text_to_assert.conf │ │ │ 
├── oss_file_text_to_assert_with_multipletable.conf │ │ │ ├── oss_file_time_format_assert.conf │ │ │ └── oss_file_zip_text_to_assert.conf │ │ ├── connector-file-s3-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── file/ │ │ │ │ └── s3/ │ │ │ │ ├── S3FileIT.java │ │ │ │ ├── S3FileWithFilterIT.java │ │ │ │ ├── S3FileWithMultipleTableIT.java │ │ │ │ └── S3Utils.java │ │ │ └── resources/ │ │ │ ├── excel/ │ │ │ │ ├── e2e.xlsx │ │ │ │ ├── fake_to_s3_excel.conf │ │ │ │ ├── s3_excel_projection_to_assert.conf │ │ │ │ ├── s3_excel_to_assert.conf │ │ │ │ ├── s3_excel_to_assert_with_multipletable.conf │ │ │ │ └── s3_filter_excel_to_assert.conf │ │ │ ├── json/ │ │ │ │ ├── e2e.json │ │ │ │ ├── e2e.json.lzo │ │ │ │ ├── fake_to_s3_file_json.conf │ │ │ │ ├── s3_file_json_lzo_to_console.conf │ │ │ │ ├── s3_file_json_to_assert.conf │ │ │ │ ├── s3_file_json_to_assert_with_multipletable.conf │ │ │ │ ├── s3_file_to_console.conf │ │ │ │ ├── s3_to_access_for_json_name_filter.conf │ │ │ │ └── s3_to_access_for_json_path_filter.conf │ │ │ ├── orc/ │ │ │ │ ├── e2e.orc │ │ │ │ ├── fake_to_s3_file_orc.conf │ │ │ │ ├── s3_file_orc_projection_to_assert.conf │ │ │ │ ├── s3_file_orc_to_assert.conf │ │ │ │ └── s3_file_orc_to_assert_with_multipletable.conf │ │ │ ├── parquet/ │ │ │ │ ├── e2e.parquet │ │ │ │ ├── fake_to_s3_file_parquet.conf │ │ │ │ ├── s3_file_parquet_projection_to_assert.conf │ │ │ │ ├── s3_file_parquet_to_assert.conf │ │ │ │ ├── s3_file_parquet_to_assert_with_multipletable.conf │ │ │ │ └── s3_file_to_console.conf │ │ │ └── text/ │ │ │ ├── e2e.txt │ │ │ ├── e2e.txt.lzo │ │ │ ├── e2e_delimiter.txt │ │ │ ├── e2e_split_with_header.txt │ │ │ ├── e2e_time_format.txt │ │ │ ├── fake_to_s3_file_text.conf │ │ │ ├── fake_to_s3_file_with_multiple_table.conf │ │ │ ├── s3_file_delimiter_assert.conf │ │ │ ├── s3_file_text_enable_split_to_assert.conf │ │ │ ├── 
s3_file_text_lzo_to_assert.conf │ │ │ ├── s3_file_text_projection_to_assert.conf │ │ │ ├── s3_file_text_skip_headers.conf │ │ │ ├── s3_file_text_to_assert.conf │ │ │ ├── s3_file_text_to_assert_with_multipletable.conf │ │ │ ├── s3_file_time_format_assert.conf │ │ │ └── s3_file_zip_text_to_assert.conf │ │ ├── connector-file-sftp-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── file/ │ │ │ │ └── fstp/ │ │ │ │ └── SftpFileIT.java │ │ │ └── resources/ │ │ │ ├── excel/ │ │ │ │ ├── e2e.xlsx │ │ │ │ ├── fakesource_to_sftp_excel.conf │ │ │ │ ├── sftp_excel_projection_to_assert.conf │ │ │ │ ├── sftp_excel_to_assert.conf │ │ │ │ └── sftp_filter_excel_to_assert.conf │ │ │ ├── json/ │ │ │ │ ├── e2e.json │ │ │ │ ├── fake_to_sftp_file_json.conf │ │ │ │ ├── sftp_file_json_to_assert.conf │ │ │ │ ├── sftp_file_json_to_assert_with_multipletable.conf │ │ │ │ ├── sftp_to_access_for_json_name_filter.conf │ │ │ │ └── sftp_to_access_for_json_path_filter.conf │ │ │ ├── text/ │ │ │ │ ├── e2e.txt │ │ │ │ ├── fake_to_sftp_file_text.conf │ │ │ │ ├── multiple_fake_to_sftp_file_text_append.conf │ │ │ │ ├── multiple_fake_to_sftp_file_text_recreate_schema.conf │ │ │ │ ├── sftp_binary_update_distcp.conf │ │ │ │ ├── sftp_file_text_projection_to_assert.conf │ │ │ │ ├── sftp_file_text_skip_headers.conf │ │ │ │ ├── sftp_file_text_to_assert.conf │ │ │ │ ├── sftp_file_text_wildcard_character_to_assert.conf │ │ │ │ └── sftp_file_zip_text_to_assert.conf │ │ │ └── xml/ │ │ │ ├── e2e.xml │ │ │ ├── fake_to_sftp_file_xml.conf │ │ │ └── sftp_file_xml_to_assert.conf │ │ ├── connector-fluss-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── fluss/ │ │ │ │ └── FlussSinkIT.java │ │ │ └── resources/ │ │ │ ├── fake_to_fluss.conf │ │ │ └── 
fake_to_multipletable_fluss.conf │ │ ├── connector-google-firestore-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org.apache.seatunnel.e2e.connector.google.firestore/ │ │ │ │ └── GoogleFirestoreIT.java │ │ │ └── resources/ │ │ │ └── firestore/ │ │ │ └── fake_to_google_firestore.conf │ │ ├── connector-graphql-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── graphql/ │ │ │ │ └── GraphQLIT.java │ │ │ └── resources/ │ │ │ ├── ddl/ │ │ │ │ └── pg.sql │ │ │ ├── fake_to_graphql.conf │ │ │ └── graphql_to_assert.conf │ │ ├── connector-hbase-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── hbase/ │ │ │ │ ├── HbaseCluster.java │ │ │ │ └── HbaseIT.java │ │ │ └── resources/ │ │ │ ├── fake-to-assign-cf-hbase.conf │ │ │ ├── fake-to-hbase-array.conf │ │ │ ├── fake-to-hbase-binary-rowkey.conf │ │ │ ├── fake-to-hbase-with-date-time-decimal.conf │ │ │ ├── fake-to-hbase-with-multipletable.conf │ │ │ ├── fake-to-hbase.conf │ │ │ ├── fake_to_hbase_with_append_data.conf │ │ │ ├── fake_to_hbase_with_create_when_not_exists.conf │ │ │ ├── fake_to_hbase_with_drop_data.conf │ │ │ ├── fake_to_hbase_with_error_when_data_exists.conf │ │ │ ├── fake_to_hbase_with_error_when_not_exists.conf │ │ │ ├── fake_to_hbase_with_recreate_schema.conf │ │ │ ├── hbase-source-to-assert-with-batch-query.conf │ │ │ ├── hbase-source-with-default-inclusive.conf │ │ │ ├── hbase-source-with-end-rowkey.conf │ │ │ ├── hbase-source-with-namespace.conf │ │ │ ├── hbase-source-with-rowkey-range.conf │ │ │ ├── hbase-source-with-start-end-inclusive.conf │ │ │ ├── hbase-source-with-start-rowkey.conf │ │ │ ├── hbase-source-with-time-range.conf │ │ │ ├── hbase-to-assert-with-date-time-decimal.conf │ │ │ ├── 
hbase-to-assert-with-multipletable.conf │ │ │ └── hbase-to-assert.conf │ │ ├── connector-hive-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── hive/ │ │ │ │ ├── HiveContainer.java │ │ │ │ ├── HiveIT.java │ │ │ │ ├── HiveKerberosIT.java │ │ │ │ └── HiveOverwriteIT.java │ │ │ └── resources/ │ │ │ ├── auto_table_creation/ │ │ │ │ ├── fake_to_hive_all_types.conf │ │ │ │ ├── fake_to_hive_create_when_not_exist.conf │ │ │ │ ├── fake_to_hive_custom_template.conf │ │ │ │ ├── fake_to_hive_default_template.conf │ │ │ │ ├── fake_to_hive_recreate_schema.conf │ │ │ │ ├── hive_auto_create_default_to_assert.conf │ │ │ │ ├── hive_auto_create_to_assert.conf │ │ │ │ ├── hive_auto_orc_format_to_assert.conf │ │ │ │ └── hive_auto_recreate_to_assert.conf │ │ │ ├── fake_to_hive.conf │ │ │ ├── fake_to_hive_metastore_uri_failover.conf │ │ │ ├── fake_to_hive_on_cos.conf │ │ │ ├── fake_to_hive_on_oss.conf │ │ │ ├── fake_to_hive_on_s3.conf │ │ │ ├── fake_to_hive_with_kerberos.conf │ │ │ ├── hive_empty_orc_to_assert.conf │ │ │ ├── hive_empty_parquet_to_hive.conf │ │ │ ├── hive_empty_text_to_assert.conf │ │ │ ├── hive_on_cos_to_assert.conf │ │ │ ├── hive_on_oss_to_assert.conf │ │ │ ├── hive_on_s3_to_assert.conf │ │ │ ├── hive_to_assert.conf │ │ │ ├── hive_to_assert_metastore_uri_failover.conf │ │ │ ├── hive_to_assert_with_kerberos.conf │ │ │ ├── kerberos/ │ │ │ │ ├── core-site.xml │ │ │ │ ├── hive-site.xml │ │ │ │ ├── krb5.conf │ │ │ │ └── krb5_local.conf │ │ │ ├── overwrite/ │ │ │ │ ├── fake_to_hive_overwrite_1.conf │ │ │ │ ├── fake_to_hive_overwrite_2.conf │ │ │ │ ├── fake_to_hive_overwrite_3.conf │ │ │ │ ├── hive_to_assert_overwrite_1.conf │ │ │ │ ├── hive_to_assert_overwrite_2.conf │ │ │ │ └── hive_to_assert_overwrite_3.conf │ │ │ └── regex/ │ │ │ ├── fake_to_hive_regex_1.conf │ │ │ ├── fake_to_hive_regex_2.conf │ │ │ ├── fake_to_hive_regex_ignore.conf │ │ │ 
├── fake_to_hive_regex_no_match.conf │ │ │ ├── fake_to_hive_regex_other.conf │ │ │ ├── hive_regex_db_to_assert.conf │ │ │ ├── hive_regex_db_to_assert_root.conf │ │ │ ├── hive_regex_table_pattern_to_assert.conf │ │ │ └── hive_regex_table_prefix_to_assert.conf │ │ ├── connector-http-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── http/ │ │ │ │ └── HttpIT.java │ │ │ └── resources/ │ │ │ ├── airtable_json_to_assert.conf │ │ │ ├── fake_to_airtable.conf │ │ │ ├── fake_to_multitable.conf │ │ │ ├── github_json_to_assert.conf │ │ │ ├── gitlab_json_to_assert.conf │ │ │ ├── http_contentjson_to_assert.conf │ │ │ ├── http_formrequestbody_to_assert.conf │ │ │ ├── http_formrequestbody_to_assert2.conf │ │ │ ├── http_json_to_assert.conf │ │ │ ├── http_jsonpath_to_assert.conf │ │ │ ├── http_jsonrequestbody_to_assert.conf │ │ │ ├── http_jsonrequestbody_to_feishu.conf │ │ │ ├── http_multilinejson_to_assert.conf │ │ │ ├── http_page_cursor_num_assert.conf │ │ │ ├── http_page_increase_no_page_num.conf │ │ │ ├── http_page_increase_page_num.conf │ │ │ ├── http_page_increase_start_num.conf │ │ │ ├── http_post_param_json_to_assert.conf │ │ │ ├── http_streaming_json_to_postgresql.conf │ │ │ ├── httpnoschema_to_http.conf │ │ │ ├── jira_json_to_assert.conf │ │ │ ├── klaviyo_json_to_assert.conf │ │ │ ├── lemlist_json_to_assert.conf │ │ │ ├── mockserver-config.json │ │ │ ├── notion_json_to_assert.conf │ │ │ ├── onesignal_json_to_assert.conf │ │ │ └── persistiq_json_to_assert.conf │ │ ├── connector-hudi-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── hudi/ │ │ │ │ ├── HudiIT.java │ │ │ │ ├── HudiMultiTableIT.java │ │ │ │ ├── HudiSeatunnelS3MultiTableIT.java │ │ │ │ ├── HudiSinkCDCIT.java │ │ │ │ ├── HudiSparkS3MultiTableIT.java │ │ │ 
│ └── MinIoUtils.java │ │ │ └── resources/ │ │ │ ├── ddl/ │ │ │ │ └── mysql_cdc.sql │ │ │ ├── hudi/ │ │ │ │ ├── core-site.xml │ │ │ │ ├── fake_to_hudi.conf │ │ │ │ ├── fake_to_hudi_with_omit_config_item.conf │ │ │ │ ├── multi_fake_to_hudi.conf │ │ │ │ ├── mysql_cdc_to_hudi.conf │ │ │ │ └── s3_fake_to_hudi.conf │ │ │ └── mysql/ │ │ │ ├── server-gtids/ │ │ │ │ └── my.cnf │ │ │ └── setup.sql │ │ ├── connector-hugegraph-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── e2e/ │ │ │ └── connector/ │ │ │ └── hugegraph/ │ │ │ └── HugeGraphIT.java │ │ ├── connector-iceberg-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── iceberg/ │ │ │ │ ├── IcebergSinkCDCIT.java │ │ │ │ ├── IcebergSinkIT.java │ │ │ │ ├── IcebergSinkWithBranchIT.java │ │ │ │ └── IcebergSourceIT.java │ │ │ └── resources/ │ │ │ ├── ddl/ │ │ │ │ ├── inventory.sql │ │ │ │ └── mysql_cdc.sql │ │ │ ├── iceberg/ │ │ │ │ ├── fake_to_iceberg.conf │ │ │ │ ├── fake_to_iceberg_with_branch.conf │ │ │ │ ├── fake_to_iceberg_with_partition_keys_placeholder.conf │ │ │ │ ├── fake_to_orc_iceberg.conf │ │ │ │ ├── filter_iceberg_source.conf │ │ │ │ ├── filter_iceberg_source_tables.conf │ │ │ │ ├── iceberg_source.conf │ │ │ │ ├── mysql_cdc_to_iceberg.conf │ │ │ │ └── mysql_cdc_to_iceberg_for_schema_change.conf │ │ │ └── mysql/ │ │ │ ├── server-gtids/ │ │ │ │ └── my.cnf │ │ │ └── setup.sql │ │ ├── connector-iceberg-hadoop3-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── iceberg/ │ │ │ │ └── hadoop3/ │ │ │ │ └── IcebergSourceIT.java │ │ │ └── resources/ │ │ │ └── iceberg/ │ │ │ └── iceberg_source.conf │ │ ├── connector-iceberg-s3-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ 
├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── iceberg/ │ │ │ │ └── s3/ │ │ │ │ └── IcebergSourceIT.java │ │ │ └── resources/ │ │ │ └── iceberg/ │ │ │ └── iceberg_source.conf │ │ ├── connector-influxdb-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── influxdb/ │ │ │ │ └── InfluxdbIT.java │ │ │ └── resources/ │ │ │ ├── fake_to_infuxdb_with_multipletable.conf │ │ │ ├── influxdb-to-influxdb-with-tz.conf │ │ │ └── influxdb-to-influxdb.conf │ │ ├── connector-iotdb-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── iotdb/ │ │ │ │ └── IoTDBIT.java │ │ │ └── resources/ │ │ │ └── iotdb/ │ │ │ └── iotdb_source_to_sink.conf │ │ ├── connector-iotdb-v2-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── iotdb/ │ │ │ │ ├── IoTDBIT.java │ │ │ │ └── IoTDBRelationalIT.java │ │ │ └── resources/ │ │ │ └── iotdb/ │ │ │ ├── iotdb_source_to_sink.conf │ │ │ └── iotdb_source_to_sink_table.conf │ │ ├── connector-jdbc-e2e/ │ │ │ ├── connector-jdbc-e2e-common/ │ │ │ │ ├── pom.xml │ │ │ │ └── src/ │ │ │ │ └── test/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── jdbc/ │ │ │ │ ├── AbstractJdbcIT.java │ │ │ │ ├── InsecureURLClassLoader.java │ │ │ │ ├── JdbcCase.java │ │ │ │ └── JdbcITErrorCode.java │ │ │ ├── connector-jdbc-e2e-ddl/ │ │ │ │ ├── pom.xml │ │ │ │ └── src/ │ │ │ │ └── test/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── jdbc/ │ │ │ │ │ ├── 
AbstractSchemaChangeBaseIT.java │ │ │ │ │ ├── DmSchemaChangeIT.java │ │ │ │ │ ├── PostgresSchemaChangeIT.java │ │ │ │ │ ├── SchemaChangeCase.java │ │ │ │ │ └── SqlServerSchemaChangeIT.java │ │ │ │ └── resources/ │ │ │ │ ├── ddl/ │ │ │ │ │ ├── add_columns.sql │ │ │ │ │ ├── change_columns.sql │ │ │ │ │ ├── drop_columns.sql │ │ │ │ │ ├── inventory.sql │ │ │ │ │ ├── modify_columns.sql │ │ │ │ │ └── shop.sql │ │ │ │ ├── docker/ │ │ │ │ │ ├── server-gtids/ │ │ │ │ │ │ └── my.cnf │ │ │ │ │ └── setup.sql │ │ │ │ ├── mysqlcdc_to_dm_with_schema_change.conf │ │ │ │ ├── mysqlcdc_to_dm_with_schema_change_exactly_once.conf │ │ │ │ ├── mysqlcdc_to_postgres_with_schema_change.conf │ │ │ │ ├── mysqlcdc_to_postgres_with_schema_change_exactly_once.conf │ │ │ │ ├── mysqlcdc_to_sqlserver_with_schema_change.conf │ │ │ │ └── mysqlcdc_to_sqlserver_with_schema_change_exactly_once.conf │ │ │ ├── connector-jdbc-e2e-part-1/ │ │ │ │ ├── pom.xml │ │ │ │ └── src/ │ │ │ │ └── test/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── jdbc/ │ │ │ │ │ ├── JdbcAutoGenerateSQLIT.java │ │ │ │ │ ├── JdbcDb2IT.java │ │ │ │ │ ├── JdbcDb2UpsertIT.java │ │ │ │ │ ├── JdbcMariaDBIT.java │ │ │ │ │ ├── JdbcMysqlIT.java │ │ │ │ │ ├── JdbcMysqlMultipleTablesIT.java │ │ │ │ │ ├── JdbcOracleIT.java │ │ │ │ │ ├── JdbcOracleMultipleTablesIT.java │ │ │ │ │ ├── JdbcPostgresIdentifierIT.java │ │ │ │ │ ├── JdbcSinkNameParameterSQLIT.java │ │ │ │ │ └── internal/ │ │ │ │ │ └── xa/ │ │ │ │ │ └── XaGroupOpsImplIT.java │ │ │ │ └── resources/ │ │ │ │ ├── jdbc_db2_source_and_sink.conf │ │ │ │ ├── jdbc_db2_source_and_sink_upsert.conf │ │ │ │ ├── jdbc_mariadb_source_and_sink.conf │ │ │ │ ├── jdbc_mariadb_source_using_table_path.conf │ │ │ │ ├── jdbc_mysql_source_and_sink.conf │ │ │ │ ├── jdbc_mysql_source_and_sink.sql │ │ │ │ ├── jdbc_mysql_source_and_sink_parallel.conf │ │ │ │ ├── jdbc_mysql_source_and_sink_parallel.sql │ │ │ │ ├── 
jdbc_mysql_source_and_sink_parallel_upper_lower.conf │ │ │ │ ├── jdbc_mysql_source_and_sink_with_multiple_tables.conf │ │ │ │ ├── jdbc_mysql_source_and_sink_with_multiple_tables.sql │ │ │ │ ├── jdbc_mysql_source_and_sink_with_pattern_tables.conf │ │ │ │ ├── jdbc_mysql_source_and_sink_xa.conf │ │ │ │ ├── jdbc_mysql_source_using_table_path.conf │ │ │ │ ├── jdbc_oracle_fake_source_to_sink_with_lob.conf │ │ │ │ ├── jdbc_oracle_source_to_sink.conf │ │ │ │ ├── jdbc_oracle_source_to_sink_use_select1.conf │ │ │ │ ├── jdbc_oracle_source_to_sink_use_select2.conf │ │ │ │ ├── jdbc_oracle_source_to_sink_use_select3.conf │ │ │ │ ├── jdbc_oracle_source_to_sink_with_blob_as_string.conf │ │ │ │ ├── jdbc_oracle_source_to_sink_without_decimal_type_narrowing.conf │ │ │ │ ├── jdbc_oracle_source_with_multiple_tables_to_sink.conf │ │ │ │ ├── jdbc_oracle_source_with_pattern_tables_to_sink.conf │ │ │ │ ├── jdbc_postgres_ide_source_and_sink.conf │ │ │ │ ├── jdbc_sink_auto_generate_sql.conf │ │ │ │ ├── jdbc_sink_auto_generate_upsql_sql.conf │ │ │ │ ├── jdbc_sink_name_parameter_sql.conf │ │ │ │ └── sql/ │ │ │ │ └── oracle_init.sql │ │ │ ├── connector-jdbc-e2e-part-2/ │ │ │ │ ├── pom.xml │ │ │ │ └── src/ │ │ │ │ └── test/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── jdbc/ │ │ │ │ │ ├── JdbcOceanBaseITBase.java │ │ │ │ │ ├── JdbcOceanBaseMilvusIT.java │ │ │ │ │ ├── JdbcOceanBaseMysqlIT.java │ │ │ │ │ ├── JdbcOceanBaseOracleIT.java │ │ │ │ │ ├── JdbcPhoenixIT.java │ │ │ │ │ ├── JdbcSelectDBCloudIT.java │ │ │ │ │ ├── JdbcStarRocksdbIT.java │ │ │ │ │ └── JdbcTeradataIT.java │ │ │ │ └── resources/ │ │ │ │ ├── jdbc_fake_to_oceanbase_sink.conf │ │ │ │ ├── jdbc_milvus_source_and_oceanbase_sink.conf │ │ │ │ ├── jdbc_oceanbase_mysql_source_and_sink.conf │ │ │ │ ├── jdbc_oceanbase_oracle_source_and_sink.conf │ │ │ │ ├── jdbc_oceanbase_source_and_milvus_sink.conf │ │ │ │ ├── 
jdbc_phoenix_source_and_sink.conf │ │ │ │ ├── jdbc_starrocks_dialect.conf │ │ │ │ ├── jdbc_starrocks_source_to_sink.conf │ │ │ │ ├── jdbc_teradata_source_and_sink.conf │ │ │ │ ├── junit-platform.properties │ │ │ │ └── selectdb-jdbc-to-selectdb.conf │ │ │ ├── connector-jdbc-e2e-part-3/ │ │ │ │ ├── pom.xml │ │ │ │ └── src/ │ │ │ │ └── test/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── jdbc/ │ │ │ │ │ ├── JdbcHiveIT.java │ │ │ │ │ ├── JdbcKingbaseIT.java │ │ │ │ │ ├── JdbcPostgresIT.java │ │ │ │ │ ├── JdbcSinkCDCChangelogIT.java │ │ │ │ │ ├── JdbcSnowflakeIT.java │ │ │ │ │ ├── JdbcSqlServerIT.java │ │ │ │ │ └── JdbcVerticaIT.java │ │ │ │ └── resources/ │ │ │ │ ├── jdbc_hive_source_and_assert.conf │ │ │ │ ├── jdbc_kingbase_source_and_sink.conf │ │ │ │ ├── jdbc_postgres_source_and_sink.conf │ │ │ │ ├── jdbc_postgres_source_and_sink_copy_stmt.conf │ │ │ │ ├── jdbc_postgres_source_and_sink_parallel.conf │ │ │ │ ├── jdbc_postgres_source_and_sink_parallel_upper_lower.conf │ │ │ │ ├── jdbc_postgres_source_and_sink_xa.conf │ │ │ │ ├── jdbc_sink_cdc_changelog.conf │ │ │ │ ├── jdbc_snowflake_source_and_sink.conf │ │ │ │ ├── jdbc_sqlserver_source_to_sink.conf │ │ │ │ └── jdbc_vertica_source_and_sink.conf │ │ │ ├── connector-jdbc-e2e-part-4/ │ │ │ │ ├── pom.xml │ │ │ │ └── src/ │ │ │ │ └── test/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── connectors/ │ │ │ │ └── seatunnel/ │ │ │ │ └── jdbc/ │ │ │ │ ├── JdbcMySqlCreateTableIT.java │ │ │ │ └── JdbcSqlServerCreateTableIT.java │ │ │ ├── connector-jdbc-e2e-part-5/ │ │ │ │ ├── pom.xml │ │ │ │ └── src/ │ │ │ │ └── test/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── jdbc/ │ │ │ │ │ ├── JdbcCloudberryIT.java │ │ │ │ │ ├── JdbcDmIT.java │ │ │ │ │ ├── JdbcDmSaveModeIT.java │ │ │ │ │ ├── 
JdbcDmUpsetIT.java │ │ │ │ │ ├── JdbcDorisIT.java │ │ │ │ │ ├── JdbcDorisdbIT.java │ │ │ │ │ ├── JdbcGBase8aIT.java │ │ │ │ │ └── JdbcGreenplumIT.java │ │ │ │ └── resources/ │ │ │ │ ├── doris-jdbc-to-doris.conf │ │ │ │ ├── jdbc_cloudberry_source_and_sink.conf │ │ │ │ ├── jdbc_dm_source_and_dm_upset_sink.conf │ │ │ │ ├── jdbc_dm_source_and_sink.conf │ │ │ │ ├── jdbc_dm_source_and_sink_savemode.conf │ │ │ │ ├── jdbc_doris_source_and_sink.conf │ │ │ │ ├── jdbc_gbase8a_source_to_assert.conf │ │ │ │ └── jdbc_greenplum_source_and_sink.conf │ │ │ ├── connector-jdbc-e2e-part-6/ │ │ │ │ ├── pom.xml │ │ │ │ └── src/ │ │ │ │ └── test/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── jdbc/ │ │ │ │ │ ├── JdbcHanaIT.java │ │ │ │ │ └── JdbcOracleLowercaseTableIT.java │ │ │ │ └── resources/ │ │ │ │ ├── jdbc_sap_hana_source_and_sink.conf │ │ │ │ ├── jdbc_sap_hana_test_view_and_synonym.conf │ │ │ │ └── sql/ │ │ │ │ └── oracle_init.sql │ │ │ ├── connector-jdbc-e2e-part-7/ │ │ │ │ ├── pom.xml │ │ │ │ └── src/ │ │ │ │ └── test/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── connectors/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── jdbc/ │ │ │ │ │ ├── JdbcErrorIT.java │ │ │ │ │ ├── JdbcHighGoIT.java │ │ │ │ │ ├── JdbcIrisIT.java │ │ │ │ │ ├── JdbcMySqlSaveModeCatalogIT.java │ │ │ │ │ ├── JdbcMysqlSaveModeHandlerIT.java │ │ │ │ │ ├── JdbcMysqlSplitIT.java │ │ │ │ │ ├── JdbcOpenGaussIT.java │ │ │ │ │ ├── JdbcPrestoIT.java │ │ │ │ │ ├── JdbcTrinoIT.java │ │ │ │ │ ├── JdbcXuguIT.java │ │ │ │ │ └── MetalakeIT.java │ │ │ │ └── resources/ │ │ │ │ ├── jdbc_highgo_source_and_sink_with_full_type.conf │ │ │ │ ├── jdbc_iris_source_to_sink_with_full_type.conf │ │ │ │ ├── jdbc_iris_upsert.conf │ │ │ │ ├── jdbc_mysql_source_and_sink.conf │ │ │ │ ├── jdbc_mysql_source_to_assert_sink_with_metalake.conf │ │ │ │ ├── jdbc_opengauss_source_and_sink.conf │ │ │ │ ├── 
jdbc_presto_source_and_assert.conf │ │ │ │ ├── jdbc_trino_source_and_assert.conf │ │ │ │ ├── jdbc_xugu_source_and_sink.conf │ │ │ │ ├── jdbc_xugu_source_and_upsert_sink.conf │ │ │ │ └── password/ │ │ │ │ └── password.txt │ │ │ └── pom.xml │ │ ├── connector-kafka-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── kafka/ │ │ │ │ ├── KafkaFormatIT.java │ │ │ │ ├── KafkaIT.java │ │ │ │ └── KafkaKerberosIT.java │ │ │ └── resources/ │ │ │ ├── avro/ │ │ │ │ ├── fake_source_to_kafka_avro_format.conf │ │ │ │ └── kafka_avro_to_assert.conf │ │ │ ├── canal/ │ │ │ │ └── canal_data.txt │ │ │ ├── canalFormatIT/ │ │ │ │ ├── kafka_source_canal_cdc_to_pgsql.conf │ │ │ │ └── kafka_source_canal_to_kafka.conf │ │ │ ├── compatible/ │ │ │ │ └── compatible_data.txt │ │ │ ├── compatibleFormatIT/ │ │ │ │ └── kafkasource_jdbc_record_to_pgsql.conf │ │ │ ├── debezium/ │ │ │ │ └── debezium_data.txt │ │ │ ├── debeziumFormatIT/ │ │ │ │ ├── kafkasource_debezium_cdc_to_pgsql.conf │ │ │ │ └── kafkasource_debezium_to_kafka.conf │ │ │ ├── docker/ │ │ │ │ ├── server-gtids/ │ │ │ │ │ └── my.cnf │ │ │ │ └── setup.sql │ │ │ ├── extractTopic_fake_to_kafka.conf │ │ │ ├── jsonFormatIT/ │ │ │ │ └── kafka_source_json_to_console.conf │ │ │ ├── kafka/ │ │ │ │ ├── kafka_dynamic_partition_discovery.conf │ │ │ │ ├── kafka_source_to_assert_with_max_poll_records_1.conf │ │ │ │ ├── kafka_to_kafka_exactly_once_batch.conf │ │ │ │ ├── kafka_to_kafka_exactly_once_streaming.conf │ │ │ │ ├── kafkasource_earliest_to_console.conf │ │ │ │ ├── kafkasource_endTimestamp_to_console.conf │ │ │ │ ├── kafkasource_format_error_handle_way_fail_to_console.conf │ │ │ │ ├── kafkasource_format_error_handle_way_skip_to_console.conf │ │ │ │ ├── kafkasource_group_offset_to_console.conf │ │ │ │ ├── kafkasource_group_offset_to_console_with_commit_offset.conf │ │ │ │ ├── kafkasource_latest_to_console.conf │ │ │ │ ├── 
kafkasource_restore_with_earliest_mode.conf │ │ │ │ ├── kafkasource_restore_with_latest_mode.conf │ │ │ │ ├── kafkasource_restore_with_specific_offsets_mode.conf │ │ │ │ ├── kafkasource_restore_with_timestamp_mode.conf │ │ │ │ ├── kafkasource_specific_offsets_to_console.conf │ │ │ │ ├── kafkasource_timestamp_to_console.conf │ │ │ │ └── kafkasource_timestamp_to_console_skip_partition.conf │ │ │ ├── kafka_default_sink_fake_to_kafka.conf │ │ │ ├── kafka_native_to_kafka.conf │ │ │ ├── kafka_sink_fake_to_kafka.conf │ │ │ ├── kafka_sink_with_headers.conf │ │ │ ├── kerberos/ │ │ │ │ ├── kafka.properties │ │ │ │ ├── kafka_server_jaas.conf │ │ │ │ ├── kafka_sink_fake_to_kafka_kerberos.conf │ │ │ │ ├── kafka_sink_with_not_kerberos.conf │ │ │ │ ├── kafka_source_to_assert_with_kerberos.conf │ │ │ │ ├── krb5.conf │ │ │ │ ├── krb5_local.conf │ │ │ │ └── start.sh │ │ │ ├── maxwell/ │ │ │ │ └── maxwell_data.txt │ │ │ ├── maxwellFormatIT/ │ │ │ │ ├── kafkasource_maxwell_cdc_to_pgsql.conf │ │ │ │ └── kafkasource_maxwell_to_kafka.conf │ │ │ ├── multiFormatIT/ │ │ │ │ └── kafka_multi_source_to_pg.conf │ │ │ ├── ogg/ │ │ │ │ └── ogg_data.txt │ │ │ ├── oggFormatIT/ │ │ │ │ ├── kafka_source_ogg_to_kafka.conf │ │ │ │ └── kafka_source_ogg_to_pgsql.conf │ │ │ ├── protobuf/ │ │ │ │ ├── fake_to_kafka_protobuf.conf │ │ │ │ ├── kafka_protobuf_schema_registry_header_transform_to_assert.conf │ │ │ │ ├── kafka_protobuf_to_assert.conf │ │ │ │ └── kafka_protobuf_transform_to_assert.conf │ │ │ └── textFormatIT/ │ │ │ ├── fake_source_to_text_sink_kafka.conf │ │ │ ├── kafka_source_text_to_console.conf │ │ │ ├── kafka_source_text_to_console_assert_catalog_table.conf │ │ │ ├── kafka_source_text_with_event_time_to_assert.conf │ │ │ ├── kafka_source_text_with_no_schema.conf │ │ │ └── kafka_source_topic_multiple_point_text_to_console.conf │ │ ├── connector-kudu-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ 
│ │ │ └── connector/ │ │ │ │ └── kudu/ │ │ │ │ └── KuduIT.java │ │ │ └── resources/ │ │ │ ├── fake_to_kudu_with_multipletable.conf │ │ │ ├── kudu_to_assert.conf │ │ │ ├── kudu_to_assert_equal.conf │ │ │ ├── kudu_to_assert_range.conf │ │ │ ├── kudu_to_assert_with_all_tables.conf │ │ │ ├── kudu_to_assert_with_multipletable.conf │ │ │ ├── kudu_to_assert_with_pattern_tables.conf │ │ │ ├── kudu_to_assert_with_table_list_pattern.conf │ │ │ ├── kudu_to_console.conf │ │ │ └── write-cdc-changelog-to-kudu.conf │ │ ├── connector-lance-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── lance/ │ │ │ │ └── LanceIT.java │ │ │ └── resources/ │ │ │ └── lance/ │ │ │ └── fake_to_lance.conf │ │ ├── connector-maxcompute-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── maxcompute/ │ │ │ │ └── MaxComputeIT.java │ │ │ └── resources/ │ │ │ ├── fake_maxcompute_delete.conf │ │ │ ├── fake_maxcompute_upsert.conf │ │ │ ├── fake_to_maxcompute_no_pk.conf │ │ │ ├── maxcompute_to_maxcompute.conf │ │ │ └── maxcompute_to_maxcompute_multi_table.conf │ │ ├── connector-milvus-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── v2/ │ │ │ │ └── milvus/ │ │ │ │ └── MilvusIT.java │ │ │ └── resources/ │ │ │ ├── fake-to-milvus.conf │ │ │ ├── milvus-to-milvus-index-preservation.conf │ │ │ ├── milvus-to-milvus-with-partitionkey.conf │ │ │ ├── milvus-to-milvus-with-partitions.conf │ │ │ ├── milvus-to-milvus.conf │ │ │ ├── multi-fake-to-milvus.conf │ │ │ └── streaming-fake-to-milvus.conf │ │ ├── connector-mongodb-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ 
└── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── v2/ │ │ │ │ └── mongodb/ │ │ │ │ ├── AbstractMongodbIT.java │ │ │ │ ├── MongodbCDCIT.java │ │ │ │ └── MongodbIT.java │ │ │ └── resources/ │ │ │ ├── cdcIT/ │ │ │ │ ├── fake_cdc_sink_mongodb.conf │ │ │ │ └── fake_cdc_upsert_sink_mongodb.conf │ │ │ ├── compatibleParametersIT/ │ │ │ │ ├── fake_source_to_update_mongodb.conf │ │ │ │ └── mongodb_matchQuery_source_to_assert.conf │ │ │ ├── fake_source_to_mongodb.conf │ │ │ ├── fake_source_to_mongodb_multiple_table.conf │ │ │ ├── flatIT/ │ │ │ │ ├── fake_source_to_flat_mongodb.conf │ │ │ │ └── mongodb_flat_source_to_assert.conf │ │ │ ├── matchIT/ │ │ │ │ ├── mongodb_matchProjection_source_to_assert.conf │ │ │ │ └── mongodb_matchQuery_source_to_assert.conf │ │ │ ├── mongodb_double_value.conf │ │ │ ├── mongodb_null_value.conf │ │ │ ├── mongodb_source_to_assert.conf │ │ │ ├── splitIT/ │ │ │ │ ├── mongodb_split_key_source_to_assert.conf │ │ │ │ └── mongodb_split_size_source_to_assert.conf │ │ │ ├── transactionIT/ │ │ │ │ ├── fake_source_to_transaction_upsert_mongodb.conf │ │ │ │ ├── mongodb_source_transaction_sink_to_assert.conf │ │ │ │ └── mongodb_source_transaction_upsert_to_assert.conf │ │ │ └── updateIT/ │ │ │ ├── fake_source_to_updateMode_insert_mongodb.conf │ │ │ ├── fake_source_to_update_mongodb.conf │ │ │ └── update_mongodb_to_assert.conf │ │ ├── connector-neo4j-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── neo4j/ │ │ │ │ └── Neo4jIT.java │ │ │ └── resources/ │ │ │ └── neo4j/ │ │ │ ├── fake_to_neo4j_batch_write.conf │ │ │ └── neo4j_to_neo4j.conf │ │ ├── connector-paimon-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── paimon/ │ │ │ │ ├── AbstractPaimonIT.java │ │ │ │ ├── 
PaimonDynamicOptionsIT.java │ │ │ │ ├── PaimonIT.java │ │ │ │ ├── PaimonRecord.java │ │ │ │ ├── PaimonRecordWithFullType.java │ │ │ │ ├── PaimonSinkCDCIT.java │ │ │ │ ├── PaimonSinkDynamicBucketIT.java │ │ │ │ ├── PaimonSinkHdfsIT.java │ │ │ │ ├── PaimonSinkWithSchemaEvolutionIT.java │ │ │ │ ├── PaimonStreamReadIT.java │ │ │ │ ├── PaimonWithS3IT.java │ │ │ │ └── SimpleBucketIndex.java │ │ │ └── resources/ │ │ │ ├── changelog_fake_cdc_sink_paimon_case1_ddl.conf │ │ │ ├── changelog_fake_cdc_sink_paimon_case1_insert_data.conf │ │ │ ├── changelog_fake_cdc_sink_paimon_case1_update_data.conf │ │ │ ├── changelog_fake_cdc_sink_paimon_case2.conf │ │ │ ├── changelog_paimon_to_paimon.conf │ │ │ ├── ddl/ │ │ │ │ ├── add_columns.sql │ │ │ │ ├── bucket.sql │ │ │ │ ├── change_columns.sql │ │ │ │ ├── drop_columns.sql │ │ │ │ ├── inventory.sql │ │ │ │ ├── modify_columns.sql │ │ │ │ ├── mysql_cdc.sql │ │ │ │ └── shop.sql │ │ │ ├── docker/ │ │ │ │ ├── server-gtids/ │ │ │ │ │ └── my.cnf │ │ │ │ └── setup.sql │ │ │ ├── fake_2_paimon_with_s3_to_assert.conf │ │ │ ├── fake_cdc_sink_paimon_case1.conf │ │ │ ├── fake_cdc_sink_paimon_case10.conf │ │ │ ├── fake_cdc_sink_paimon_case1_with_error_schema.conf │ │ │ ├── fake_cdc_sink_paimon_case2.conf │ │ │ ├── fake_cdc_sink_paimon_case3.conf │ │ │ ├── fake_cdc_sink_paimon_case4.conf │ │ │ ├── fake_cdc_sink_paimon_case5.conf │ │ │ ├── fake_cdc_sink_paimon_case6.conf │ │ │ ├── fake_cdc_sink_paimon_case7.conf │ │ │ ├── fake_cdc_sink_paimon_case8.conf │ │ │ ├── fake_cdc_sink_paimon_case9.conf │ │ │ ├── fake_cdc_sink_paimon_with_hdfs_ha.conf │ │ │ ├── fake_cdc_sink_paimon_with_hdfs_with_hive_catalog.conf │ │ │ ├── fake_cdc_to_dynamic_bucket_paimon_case.conf │ │ │ ├── fake_sink_paimon_truncate_with_hdfs_case1.conf │ │ │ ├── fake_sink_paimon_truncate_with_hdfs_case2.conf │ │ │ ├── fake_sink_paimon_truncate_with_hive_case1.conf │ │ │ ├── fake_sink_paimon_truncate_with_hive_case2.conf │ │ │ ├── fake_sink_paimon_truncate_with_local_case1.conf │ │ │ ├── 
fake_sink_paimon_truncate_with_local_case2.conf │ │ │ ├── fake_to_dynamic_bucket_paimon_case1.conf │ │ │ ├── fake_to_dynamic_bucket_paimon_case2.conf │ │ │ ├── fake_to_dynamic_bucket_paimon_case3.conf │ │ │ ├── fake_to_dynamic_bucket_paimon_case4.conf │ │ │ ├── fake_to_dynamic_bucket_paimon_case5.conf │ │ │ ├── fake_to_dynamic_bucket_paimon_case6.conf │ │ │ ├── fake_to_dynamic_bucket_paimon_case7.conf │ │ │ ├── fake_to_dynamic_bucket_paimon_case8.conf │ │ │ ├── fake_to_paimon.conf │ │ │ ├── fake_to_paimon_2.conf │ │ │ ├── fake_to_paimon_branch.conf │ │ │ ├── fake_to_paimon_privilege.conf │ │ │ ├── fake_to_paimon_privilege1.conf │ │ │ ├── fake_to_paimon_with_change_log_tmp.conf │ │ │ ├── fake_to_paimon_with_full_type.conf │ │ │ ├── fake_to_paimon_with_full_type_cdc_data.conf │ │ │ ├── fake_to_paimon_with_s3.conf │ │ │ ├── fake_to_paimon_with_s3_with_checkpoint.conf │ │ │ ├── fake_to_paimon_with_s3_with_privilege.conf │ │ │ ├── mysql_cdc_to_paimon_with_schema_change.conf │ │ │ ├── mysql_jdbc_to_dynamic_bucket_paimon_case1.conf │ │ │ ├── mysql_jdbc_to_dynamic_bucket_paimon_case2.conf │ │ │ ├── mysql_jdbc_to_dynamic_bucket_paimon_case3.conf │ │ │ ├── paimon-to-assert-with-multipletable.conf │ │ │ ├── paimon_projection_to_assert.conf │ │ │ ├── paimon_to_assert.conf │ │ │ ├── paimon_to_assert_with_dynamic_options_of_branch.conf │ │ │ ├── paimon_to_assert_with_dynamic_options_of_incr_tag.conf │ │ │ ├── paimon_to_assert_with_dynamic_options_of_tag1.conf │ │ │ ├── paimon_to_assert_with_dynamic_options_of_tag2.conf │ │ │ ├── paimon_to_assert_with_filter1.conf │ │ │ ├── paimon_to_assert_with_filter10.conf │ │ │ ├── paimon_to_assert_with_filter2.conf │ │ │ ├── paimon_to_assert_with_filter3.conf │ │ │ ├── paimon_to_assert_with_filter4.conf │ │ │ ├── paimon_to_assert_with_filter5.conf │ │ │ ├── paimon_to_assert_with_filter6.conf │ │ │ ├── paimon_to_assert_with_filter7.conf │ │ │ ├── paimon_to_assert_with_filter8.conf │ │ │ ├── paimon_to_assert_with_filter9.conf │ │ │ ├── 
paimon_to_assert_with_hivecatalog.conf │ │ │ ├── paimon_to_assert_with_timestampN.conf │ │ │ ├── paimon_to_paimon.conf │ │ │ ├── paimon_to_paimon_privilege.conf │ │ │ ├── paimon_to_paimon_privilege1.conf │ │ │ ├── paimon_to_paimon_with_s3_with_privilege.conf │ │ │ ├── paimon_with_s3_to_assert.conf │ │ │ ├── read_from_paimon_with_hdfs_ha_to_assert.conf │ │ │ └── schema-0.json │ │ ├── connector-prometheus-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── prometheus/ │ │ │ │ ├── PrometheusIT.java │ │ │ │ └── VictoriaMetricsIT.java │ │ │ └── resources/ │ │ │ ├── VictoriaMetrics_instant_json_to_assert.conf │ │ │ ├── prometheus_instant_json_to_assert.conf │ │ │ ├── prometheus_range_json_to_assert.conf │ │ │ ├── prometheus_remote_write.conf │ │ │ └── victoriaMetrics_remote_write.conf │ │ ├── connector-pulsar-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── pulsar/ │ │ │ │ ├── CanalToPulsarIT.java │ │ │ │ ├── PulsarBatchIT.java │ │ │ │ └── PulsarSinkIT.java │ │ │ └── resources/ │ │ │ ├── batch_pulsar_to_console.conf │ │ │ ├── cdc_canal_pulsar_to_pg.conf │ │ │ ├── ddl/ │ │ │ │ └── canal.sql │ │ │ ├── fake_source.conf │ │ │ ├── fake_to_pulsar.conf │ │ │ ├── mysql/ │ │ │ │ ├── server-gtids/ │ │ │ │ │ └── my.cnf │ │ │ │ └── setup.sql │ │ │ └── pulsar/ │ │ │ ├── canal-mysql-source-config.yaml │ │ │ └── start_canal_connector.sh │ │ ├── connector-qdrant-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── v2/ │ │ │ │ └── qdrant/ │ │ │ │ └── QdrantIT.java │ │ │ └── resources/ │ │ │ └── qdrant-to-qdrant.conf │ │ ├── connector-rabbitmq-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ 
├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── rabbitmq/ │ │ │ │ └── RabbitmqIT.java │ │ │ └── resources/ │ │ │ ├── rabbitmq-to-rabbitmq-using-default-config.conf │ │ │ └── rabbitmq-to-rabbitmq.conf │ │ ├── connector-redis-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── redis/ │ │ │ │ ├── Redis5IT.java │ │ │ │ ├── Redis7IT.java │ │ │ │ ├── RedisClusterIT.java │ │ │ │ ├── RedisMasterAndSlaveIT.java │ │ │ │ └── RedisTestCaseTemplateIT.java │ │ │ └── resources/ │ │ │ ├── cluster-redis-to-redis-scan.conf │ │ │ ├── cluster-redis-to-redis-type-hash.conf │ │ │ ├── cluster-redis-to-redis-type-key.conf │ │ │ ├── cluster-redis-to-redis-type-list.conf │ │ │ ├── cluster-redis-to-redis-type-set.conf │ │ │ ├── cluster-redis-to-redis-type-zset.conf │ │ │ ├── fake-to-multipletableredissink.conf │ │ │ ├── fake-to-redis-test-in-real-time.conf │ │ │ ├── fake-to-redis-test-normal-key-is-null.conf │ │ │ ├── fake-to-redis-test-readonly-hash.conf │ │ │ ├── fake-to-redis-test-readonly-key.conf │ │ │ ├── fake-to-redis-test-readonly-list.conf │ │ │ ├── fake-to-redis-test-readonly-set.conf │ │ │ ├── fake-to-redis-test-readonly-zset.conf │ │ │ ├── redis-to-redis-by-db-num.conf │ │ │ ├── redis-to-redis-custom-hash-key-and-value.conf │ │ │ ├── redis-to-redis-custom-key.conf │ │ │ ├── redis-to-redis-custom-value-for-key.conf │ │ │ ├── redis-to-redis-custom-value-for-list.conf │ │ │ ├── redis-to-redis-custom-value-for-set.conf │ │ │ ├── redis-to-redis-custom-value-for-zset.conf │ │ │ ├── redis-to-redis-expire.conf │ │ │ ├── redis-to-redis.conf │ │ │ ├── scan-hash-to-redis-list-hash-check.conf │ │ │ ├── scan-hash-to-redis-with-default-key.conf │ │ │ ├── scan-hash-to-redis-with-key.conf │ │ │ ├── scan-list-test-read-to-redis-list-test-check.conf │ │ │ ├── scan-list-to-redis-list-with-key.conf 
│ │ │ ├── scan-redis-to-redis-with-key.conf │ │ │ ├── scan-set-to-redis-list-set-check.conf │ │ │ ├── scan-set-to-redis-list-set-with-key.conf │ │ │ ├── scan-string-to-redis-with-key.conf │ │ │ ├── scan-string-to-redis.conf │ │ │ ├── scan-zset-to-redis-list-zset-check.conf │ │ │ └── scan-zset-to-redis-list-zset-with-key.conf │ │ ├── connector-rocketmq-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── rocketmq/ │ │ │ │ ├── RocketMqConsumerMessage.java │ │ │ │ ├── RocketMqContainer.java │ │ │ │ └── RocketMqIT.java │ │ │ └── resources/ │ │ │ ├── log4j2-test.properties │ │ │ ├── rocketmq/ │ │ │ │ ├── rocketmq_source_earliest_to_console.conf │ │ │ │ ├── rocketmq_source_group_offset_to_console.conf │ │ │ │ ├── rocketmq_source_latest_to_console.conf │ │ │ │ ├── rocketmq_source_specific_offsets_to_console.conf │ │ │ │ └── rocketmq_source_timestamp_to_console.conf │ │ │ ├── rocketmq-sink_fake_to_rocketmq.conf │ │ │ ├── rocketmq-sink_fake_to_rocketmq_message_tag.conf │ │ │ ├── rocketmq-source_json_to_console.conf │ │ │ ├── rocketmq-source_tex_with_offset_check.conf │ │ │ ├── rocketmq-source_text_error_tag_to_console.conf │ │ │ ├── rocketmq-source_text_tag_to_console.conf │ │ │ ├── rocketmq-source_text_to_console.conf │ │ │ └── rocketmq-text-sink_fake_to_rocketmq.conf │ │ ├── connector-sensorsdata-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── sensorsdata/ │ │ │ │ └── sdk/ │ │ │ │ └── SensorsDataIT.java │ │ │ └── resources/ │ │ │ ├── fake_to_sensorsdata_details.conf │ │ │ ├── fake_to_sensorsdata_events.conf │ │ │ └── fake_to_sensorsdata_users.conf │ │ ├── connector-sls-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ 
│ │ │ │ └── connector/ │ │ │ │ └── sls/ │ │ │ │ └── SlsIT.java │ │ │ └── resources/ │ │ │ ├── sls_sink_to_console.conf │ │ │ ├── sls_source_with_schema_to_console.conf │ │ │ └── sls_source_without_schema_to_console.conf │ │ ├── connector-starrocks-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── starrocks/ │ │ │ │ ├── StarRocksCDCSinkIT.java │ │ │ │ ├── StarRocksIT.java │ │ │ │ └── StarRocksSchemaChangeIT.java │ │ │ └── resources/ │ │ │ ├── ddl/ │ │ │ │ ├── add_columns.sql │ │ │ │ ├── change_columns.sql │ │ │ │ ├── drop_columns.sql │ │ │ │ ├── drop_columns_validate_schema.sql │ │ │ │ ├── modify_columns.sql │ │ │ │ └── shop.sql │ │ │ ├── docker/ │ │ │ │ ├── server-gtids/ │ │ │ │ │ └── my.cnf │ │ │ │ └── setup.sql │ │ │ ├── fake-to-starrocks.conf │ │ │ ├── mysqlcdc_to_starrocks_with_schema_change.conf │ │ │ ├── starrocks-thrift-to-starrocks-streamload.conf │ │ │ ├── starrocks-to-assert-with-multipletable.conf │ │ │ ├── starrocks-to-assert.conf │ │ │ └── write-cdc-changelog-to-starrocks.conf │ │ ├── connector-tdengine-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── tdengine/ │ │ │ │ └── TDengineIT.java │ │ │ └── resources/ │ │ │ └── tdengine/ │ │ │ ├── tdengine_fake_to_sink_multitable.conf │ │ │ ├── tdengine_source_to_sink.conf │ │ │ └── tdengine_source_to_sink_filter_by_fieldNames.conf │ │ ├── connector-typesense-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── e2e/ │ │ │ │ └── connector/ │ │ │ │ └── typesense/ │ │ │ │ └── TypesenseIT.java │ │ │ └── resources/ │ │ │ ├── fake_to_typesense_with_append_data.conf │ │ │ ├── fake_to_typesense_with_create_when_not_exists.conf │ │ │ ├── 
fake_to_typesense_with_drop_data.conf │ │ │ ├── fake_to_typesense_with_error_when_data_exists.conf │ │ │ ├── fake_to_typesense_with_error_when_not_exists.conf │ │ │ ├── fake_to_typesense_with_primary_keys.conf │ │ │ ├── fake_to_typesense_with_recreate_schema.conf │ │ │ ├── typesense_source_and_sink.conf │ │ │ ├── typesense_to_typesense.conf │ │ │ └── typesense_to_typesense_with_query.conf │ │ ├── connector-web3j-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org.apache.seatunnel.e2e.connector.google.firestore/ │ │ │ │ └── Web3jIT.java │ │ │ └── resources/ │ │ │ └── firestore/ │ │ │ └── web3j_to_assert.conf │ │ └── pom.xml │ ├── seatunnel-core-e2e/ │ │ ├── pom.xml │ │ └── seatunnel-starter-e2e/ │ │ ├── pom.xml │ │ └── src/ │ │ └── test/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── core/ │ │ │ └── starter/ │ │ │ └── seatunnel/ │ │ │ ├── SeaTunnelConnectorBatchCancelTest.java │ │ │ └── SeaTunnelConnectorTest.java │ │ └── resources/ │ │ ├── batch_cancel_task_1.conf │ │ └── batch_cancel_task_2.conf │ ├── seatunnel-e2e-common/ │ │ ├── pom.xml │ │ └── src/ │ │ └── test/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── e2e/ │ │ │ ├── common/ │ │ │ │ ├── AbstractFlinkContainer.java │ │ │ │ ├── AbstractSparkContainer.java │ │ │ │ ├── TestResource.java │ │ │ │ ├── TestSuiteBase.java │ │ │ │ ├── container/ │ │ │ │ │ ├── AbstractTestContainer.java │ │ │ │ │ ├── ContainerExtendedFactory.java │ │ │ │ │ ├── EngineType.java │ │ │ │ │ ├── TestContainer.java │ │ │ │ │ ├── TestContainerId.java │ │ │ │ │ ├── TestContainersFactory.java │ │ │ │ │ ├── TestHelper.java │ │ │ │ │ ├── flink/ │ │ │ │ │ │ ├── AbstractTestFlinkContainer.java │ │ │ │ │ │ ├── Flink13Container.java │ │ │ │ │ │ ├── Flink14Container.java │ │ │ │ │ │ ├── Flink15Container.java │ │ │ │ │ │ ├── Flink16Container.java │ │ │ │ │ │ ├── Flink17Container.java │ │ │ │ │ │ ├── Flink18Container.java │ │ │ │ │ │ └── 
Flink20Container.java │ │ │ │ │ ├── seatunnel/ │ │ │ │ │ │ ├── ConnectorPackageServiceContainer.java │ │ │ │ │ │ └── SeaTunnelContainer.java │ │ │ │ │ └── spark/ │ │ │ │ │ ├── AbstractTestSparkContainer.java │ │ │ │ │ ├── Spark2Container.java │ │ │ │ │ └── Spark3Container.java │ │ │ │ ├── junit/ │ │ │ │ │ ├── AnnotationUtil.java │ │ │ │ │ ├── ContainerTestingExtension.java │ │ │ │ │ ├── DisabledOnContainer.java │ │ │ │ │ ├── TestCaseInvocationContextProvider.java │ │ │ │ │ ├── TestContainerExtension.java │ │ │ │ │ ├── TestContainers.java │ │ │ │ │ ├── TestLoggerExtension.java │ │ │ │ │ └── TimingExtension.java │ │ │ │ └── util/ │ │ │ │ ├── ConfigAdapterUtils.java │ │ │ │ ├── ConfigBuilder.java │ │ │ │ ├── ContainerUtil.java │ │ │ │ ├── JdbcUtil.java │ │ │ │ └── JobIdGenerator.java │ │ │ ├── sink/ │ │ │ │ └── inmemory/ │ │ │ │ ├── InMemoryAggregatedCommitInfo.java │ │ │ │ ├── InMemoryAggregatedCommitter.java │ │ │ │ ├── InMemoryCommitInfo.java │ │ │ │ ├── InMemoryConnection.java │ │ │ │ ├── InMemoryMultiTableResourceManager.java │ │ │ │ ├── InMemorySaveModeHandler.java │ │ │ │ ├── InMemorySink.java │ │ │ │ ├── InMemorySinkFactory.java │ │ │ │ ├── InMemorySinkWriter.java │ │ │ │ └── InMemoryState.java │ │ │ └── source/ │ │ │ └── inmemory/ │ │ │ ├── InMemorySource.java │ │ │ ├── InMemorySourceFactory.java │ │ │ ├── InMemorySourceReader.java │ │ │ ├── InMemorySourceSplit.java │ │ │ ├── InMemorySourceSplitEnumerator.java │ │ │ └── InMemoryState.java │ │ └── resources/ │ │ ├── junit-platform.properties │ │ └── log4j2.properties │ ├── seatunnel-engine-e2e/ │ │ ├── connector-console-seatunnel-e2e/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── engine/ │ │ │ │ └── e2e/ │ │ │ │ └── console/ │ │ │ │ ├── FakeSourceToConsoleIT.java │ │ │ │ └── FakeSourceToConsoleWithEventReportIT.java │ │ │ └── resources/ │ │ │ ├── fakesource_to_console.conf │ │ │ └── 
seatunnel_config_with_event_report.yaml │ │ ├── connector-seatunnel-e2e-base/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── test/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── engine/ │ │ │ │ └── e2e/ │ │ │ │ ├── BasicAuthenticationIT.java │ │ │ │ ├── CheckpointEnableIT.java │ │ │ │ ├── ClusterFaultToleranceIT.java │ │ │ │ ├── ClusterFaultToleranceTwoPipelineIT.java │ │ │ │ ├── ClusterIT.java │ │ │ │ ├── ClusterSeaTunnelEngineContainer.java │ │ │ │ ├── CommittedMetricsIT.java │ │ │ │ ├── ConnectorPackageServiceContainer.java │ │ │ │ ├── ConnectorPackageServiceIT.java │ │ │ │ ├── JobClientJobProxyIT.java │ │ │ │ ├── JobExecutionIT.java │ │ │ │ ├── JobRestoreIT.java │ │ │ │ ├── LocalModeIT.java │ │ │ │ ├── MultiTableMetricsIT.java │ │ │ │ ├── PendingJobsRestIT.java │ │ │ │ ├── RestApiIT.java │ │ │ │ ├── SeaTunnelEngineContainer.java │ │ │ │ ├── SeaTunnelSlotIT.java │ │ │ │ ├── SinkPlaceholderIT.java │ │ │ │ ├── SplitClusterFaultToleranceIT.java │ │ │ │ ├── TestUtils.java │ │ │ │ ├── TextHeaderIT.java │ │ │ │ ├── UnifyEnvParameterIT.java │ │ │ │ ├── UserVariableIT.java │ │ │ │ ├── allocatestrategy/ │ │ │ │ │ ├── SlotRatioAllocateStrategyIT.java │ │ │ │ │ └── SystemLoadAllocateStrategyIT.java │ │ │ │ ├── classloader/ │ │ │ │ │ ├── ClassLoaderDisableCacheModeIT.java │ │ │ │ │ ├── ClassLoaderEnableCacheModeIT.java │ │ │ │ │ └── ClassLoaderITBase.java │ │ │ │ ├── joblog/ │ │ │ │ │ └── JobLogIT.java │ │ │ │ ├── resourceIsolation/ │ │ │ │ │ ├── ResourceIsolationIT.java │ │ │ │ │ └── WorkerTagClusterTest.java │ │ │ │ └── telemetry/ │ │ │ │ └── MasterWorkerClusterSeaTunnelWithTelemetryIT.java │ │ │ └── resources/ │ │ │ ├── allocate-strategy/ │ │ │ │ ├── allocate_strategy_no_tag_with_system_load.conf │ │ │ │ ├── allocate_strategy_tag1_with_system_load.conf │ │ │ │ ├── allocate_strategy_tag2_with_system_load.conf │ │ │ │ └── allocate_strategy_with_slot_ratio.conf │ │ │ ├── basic-auth/ │ │ │ │ └── seatunnel.yaml │ │ │ ├── 
batch_fake_multi_table_to_console.conf │ │ │ ├── batch_fakesource_to_console_error.conf │ │ │ ├── batch_fakesource_to_file.conf │ │ │ ├── batch_fakesource_to_file_complex.conf │ │ │ ├── batch_fakesource_to_file_header.conf │ │ │ ├── batch_last_checkpoint_error.conf │ │ │ ├── batch_slot_not_enough.conf │ │ │ ├── checkpoint-batch-disable-test-resources/ │ │ │ │ ├── batch_fakesource_to_localfile_checkpoint_disable.conf │ │ │ │ ├── batch_fakesource_to_localfile_checkpoint_disable_withtimeout.conf │ │ │ │ └── sink_file_text_to_assert.conf │ │ │ ├── checkpoint-batch-enable-test-resources/ │ │ │ │ ├── batch_fakesource_to_localfile_checkpoint_enable.conf │ │ │ │ └── sink_file_text_to_assert.conf │ │ │ ├── checkpoint-streaming-enable-test-resources/ │ │ │ │ ├── sink_file_text_to_assert.conf │ │ │ │ ├── stream_fakesource_to_localfile.conf │ │ │ │ └── stream_fakesource_to_localfile_interval.conf │ │ │ ├── classloader/ │ │ │ │ ├── fake_to_inmemory.conf │ │ │ │ ├── seatunnel_cache_mode.yaml │ │ │ │ └── seatunnel_disable_cache_mode.yaml │ │ │ ├── cluster/ │ │ │ │ ├── hazelcast.yaml │ │ │ │ └── seatunnel.yaml │ │ │ ├── cluster_batch_fake_to_localfile_template.conf │ │ │ ├── cluster_batch_fake_to_localfile_two_pipeline_template.conf │ │ │ ├── connector-package-service-test-server1-resources/ │ │ │ │ ├── fakesource_to_console.conf │ │ │ │ ├── hazelcast-client.yaml │ │ │ │ ├── hazelcast.yaml │ │ │ │ ├── junit-platform.properties │ │ │ │ ├── jvm_client_options │ │ │ │ ├── jvm_options │ │ │ │ ├── log4j2-test.properties │ │ │ │ ├── log4j2.properties │ │ │ │ └── seatunnel.yaml │ │ │ ├── connector-package-service-test-server2-resources/ │ │ │ │ ├── fakesource_to_console.conf │ │ │ │ ├── hazelcast-client.yaml │ │ │ │ ├── hazelcast.yaml │ │ │ │ ├── junit-platform.properties │ │ │ │ ├── jvm_client_options │ │ │ │ ├── jvm_options │ │ │ │ ├── log4j2-test.properties │ │ │ │ ├── log4j2.properties │ │ │ │ └── seatunnel.yaml │ │ │ ├── connector-package-service-test-server3-resources/ │ │ │ │ ├── 
fakesource_to_console.conf │ │ │ │ ├── hazelcast-client.yaml │ │ │ │ ├── hazelcast.yaml │ │ │ │ ├── junit-platform.properties │ │ │ │ ├── jvm_client_options │ │ │ │ ├── jvm_options │ │ │ │ ├── log4j2-test.properties │ │ │ │ ├── log4j2.properties │ │ │ │ └── seatunnel.yaml │ │ │ ├── fake-and-inmemory/ │ │ │ │ └── plugin-mapping.properties │ │ │ ├── fake_to_console.variables.conf │ │ │ ├── fake_to_console_with_default_value.variables.conf │ │ │ ├── fake_to_inmemory_with_sink_placeholder.conf │ │ │ ├── fakesource_to_console.conf │ │ │ ├── hazelcast-client.yaml │ │ │ ├── hazelcast.yaml │ │ │ ├── job-log-file/ │ │ │ │ └── log4j2.properties │ │ │ ├── junit-platform.properties │ │ │ ├── jvm_client_options │ │ │ ├── jvm_options │ │ │ ├── log4j2-test.properties │ │ │ ├── log4j2.properties │ │ │ ├── master-worker-cluster/ │ │ │ │ ├── hazelcast-master.yaml │ │ │ │ ├── hazelcast-worker.yaml │ │ │ │ ├── jvm_master_options │ │ │ │ ├── jvm_worker_options │ │ │ │ └── seatunnel.yaml │ │ │ ├── pending_jobs_streaming.conf │ │ │ ├── resource-isolation/ │ │ │ │ ├── fakesource_to_console.conf │ │ │ │ └── fakesource_to_console_tag_not_match.conf │ │ │ ├── restore-job/ │ │ │ │ └── restore_job_apply_resources.conf │ │ │ ├── retry-times/ │ │ │ │ ├── stream_fake_to_inmemory_with_error.conf │ │ │ │ └── stream_fake_to_inmemory_with_error_retry_1.conf │ │ │ ├── savemode/ │ │ │ │ ├── fake_to_inmemory_savemode.conf │ │ │ │ └── fake_to_inmemory_savemode_client.conf │ │ │ ├── seatunnel.yaml │ │ │ ├── seatunnel_fixed_slot_num.yaml │ │ │ ├── seatunnel_job_restore_apply_resources.yaml │ │ │ ├── stream_fake_multi_table_to_console_with_checkpoint.conf │ │ │ ├── stream_fake_to_inmemory_with_runtime_list.conf │ │ │ ├── stream_fake_to_inmemory_with_throwable_error.conf │ │ │ ├── stream_fakesource_to_console.conf │ │ │ ├── stream_fakesource_to_file.conf │ │ │ ├── stream_fakesource_to_inmemory_pending_row_in_queue.conf │ │ │ ├── streaming_fakesource_to_file_complex.conf │ │ │ ├── 
unify-env-param-test-resource/ │ │ │ │ ├── outdated_env_param_fakesource_to_localfile.conf │ │ │ │ ├── unify_env_param_fakesource_to_localfile.conf │ │ │ │ └── unify_flink_table_env_param_fakesource_to_console.conf │ │ │ ├── upload-file/ │ │ │ │ ├── fake_to_console.conf │ │ │ │ └── fake_to_console.json │ │ │ └── valid_job_name.conf │ │ ├── pom.xml │ │ └── seatunnel-engine-k8s-e2e/ │ │ ├── pom.xml │ │ └── src/ │ │ └── test/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── engine/ │ │ │ └── e2e/ │ │ │ └── k8s/ │ │ │ └── KubernetesIT.java │ │ └── resources/ │ │ ├── custom_config/ │ │ │ ├── hazelcast-client.yaml │ │ │ ├── hazelcast-kubernetes-discovery.yaml │ │ │ ├── hazelcast-tcp-discovery.yaml │ │ │ └── plugin-mapping.properties │ │ ├── seatunnel-service.yaml │ │ ├── seatunnel-statefulset.yaml │ │ └── seatunnel_dockerfile │ └── seatunnel-transforms-v2-e2e/ │ ├── pom.xml │ ├── seatunnel-transforms-v2-e2e-common/ │ │ ├── pom.xml │ │ └── src/ │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── e2e/ │ │ └── transform/ │ │ └── TestSuiteBase.java │ ├── seatunnel-transforms-v2-e2e-part-1/ │ │ ├── pom.xml │ │ └── src/ │ │ └── test/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── e2e/ │ │ │ └── transform/ │ │ │ ├── TestCopyIT.java │ │ │ ├── TestDataValidatorIT.java │ │ │ ├── TestEmbeddingIT.java │ │ │ ├── TestFilterIT.java │ │ │ ├── TestFilterRowKindIT.java │ │ │ ├── TestLLMIT.java │ │ │ ├── TestRowKindExtractorTransformIT.java │ │ │ └── TestSplitIT.java │ │ └── resources/ │ │ ├── copy_transform.conf │ │ ├── copy_transform_multi_table.conf │ │ ├── data_validator_email_udf.conf │ │ ├── data_validator_fail.conf │ │ ├── data_validator_route_to_table.conf │ │ ├── data_validator_route_to_table_with_db_prefix.conf │ │ ├── data_validator_skip.conf │ │ ├── data_validator_valid.conf │ │ ├── embedding_transform.conf │ │ ├── embedding_transform_binary.conf │ │ ├── 
embedding_transform_binary_complete_file.conf │ │ ├── embedding_transform_custom.conf │ │ ├── embedding_transform_multi_table.conf │ │ ├── embedding_transform_multimodal.conf │ │ ├── filter_row_kind_exclude_delete.conf │ │ ├── filter_row_kind_exclude_insert.conf │ │ ├── filter_row_kind_exclude_insert_multi_table.conf │ │ ├── filter_row_kind_include_insert.conf │ │ ├── filter_row_to_next_transform.json │ │ ├── filter_transform.conf │ │ ├── filter_transform_multi_table.conf │ │ ├── llm_kimiai_transform.conf │ │ ├── llm_microsoft_transform.conf │ │ ├── llm_openai_transform.conf │ │ ├── llm_openai_transform_boolean.conf │ │ ├── llm_openai_transform_columns.conf │ │ ├── llm_openai_transform_custom_output_name.conf │ │ ├── llm_openai_transform_multi_table.conf │ │ ├── llm_transform_custom.conf │ │ ├── mock-embedding.json │ │ ├── mockserver-config.json │ │ ├── rowkind_extractor_transform_case1.conf │ │ ├── rowkind_extractor_transform_case1_multi_table.conf │ │ ├── rowkind_extractor_transform_case2.conf │ │ ├── split_transform.conf │ │ └── split_transform_multi_table.conf │ ├── seatunnel-transforms-v2-e2e-part-2/ │ │ ├── pom.xml │ │ └── src/ │ │ └── test/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── e2e/ │ │ │ └── transform/ │ │ │ ├── TestDynamicCompileIT.java │ │ │ ├── TestFieldEncryptIT.java │ │ │ ├── TestFieldMapperIT.java │ │ │ ├── TestJsonPathTransformIT.java │ │ │ ├── TestMetadataIT.java │ │ │ ├── TestRegexExtractIT.java │ │ │ ├── TestRenameIT.java │ │ │ ├── TestReplaceIT.java │ │ │ ├── TestSQLIT.java │ │ │ ├── TestSparkDateTimeTransformIT.java │ │ │ ├── TestTableFilterIT.java │ │ │ └── TestTableMergeIT.java │ │ └── resources/ │ │ ├── dynamic_compile/ │ │ │ ├── conf/ │ │ │ │ ├── mixed_dynamic_all_compile_transform.conf │ │ │ │ ├── mixed_dynamic_groovy_java_compile_transform.conf │ │ │ │ ├── mixed_dynamic_groovy_scala_compile_transform.conf │ │ │ │ ├── mixed_dynamic_java_scala_compile_transform.conf │ │ │ │ ├── mockserver-config.json │ 
│ │ │ ├── multiple_dynamic_groovy_compile_transform.conf │ │ │ │ ├── multiple_dynamic_java_compile_transform.conf │ │ │ │ ├── multiple_dynamic_scala_compile_transform.conf │ │ │ │ ├── single_dynamic_groovy_compile_transform.conf │ │ │ │ ├── single_dynamic_http_compile_transform.conf │ │ │ │ ├── single_dynamic_java_compile_transform.conf │ │ │ │ ├── single_dynamic_java_compile_transform_compatible.conf │ │ │ │ ├── single_dynamic_java_compile_transform_multi_table.conf │ │ │ │ ├── single_dynamic_scala_compile_transform.conf │ │ │ │ ├── single_groovy_path_compile.conf │ │ │ │ ├── single_java_path_compile.conf │ │ │ │ └── single_scala_path_compile.conf │ │ │ └── source_file/ │ │ │ ├── GroovyFile │ │ │ ├── JavaFile │ │ │ └── ScalaFile │ │ ├── field_decrypt_transform.conf │ │ ├── field_decrypt_transform_multi_table.conf │ │ ├── field_encrypt_transform.conf │ │ ├── field_encrypt_transform_multi_table.conf │ │ ├── field_mapper_transform.conf │ │ ├── field_mapper_transform_multi_table.conf │ │ ├── field_mapper_transform_without_result_table.conf │ │ ├── field_rename_regex_default.conf │ │ ├── json_path_transform/ │ │ │ ├── array_test.conf │ │ │ ├── json_path_array_map.conf │ │ │ ├── json_path_basic_type_test.conf │ │ │ ├── json_path_basic_type_test_multi_table.conf │ │ │ ├── json_path_batch_fields_test.conf │ │ │ ├── json_path_with_error_handle_way.conf │ │ │ └── nested_row_test.conf │ │ ├── metadata_multi_table.conf │ │ ├── regexextract/ │ │ │ ├── regex_extract_transform.conf │ │ │ └── regex_extract_transform_multi_table.conf │ │ ├── replace_transform.conf │ │ ├── replace_transform_multi_table.conf │ │ ├── spark_date_time_transform.conf │ │ ├── sql_transform/ │ │ │ ├── binary_expression.conf │ │ │ ├── case_when.conf │ │ │ ├── criteria_filter.conf │ │ │ ├── explode_transform.conf │ │ │ ├── explode_transform_with_outer.conf │ │ │ ├── explode_transform_without_outer.conf │ │ │ ├── func_array.conf │ │ │ ├── func_array_max_min.conf │ │ │ ├── func_datetime.conf │ │ │ ├── 
func_from_unixtime.conf │ │ │ ├── func_multi_if.conf │ │ │ ├── func_null_return.conf │ │ │ ├── func_numeric.conf │ │ │ ├── func_split.conf │ │ │ ├── func_string.conf │ │ │ ├── func_system.conf │ │ │ ├── func_vector.conf │ │ │ ├── inner_query.conf │ │ │ ├── nested_type.conf │ │ │ └── sql_all_columns.conf │ │ ├── sql_transform.conf │ │ ├── sql_transform_multi_table.conf │ │ ├── table_field_rename_multi_table.conf │ │ ├── table_filter_multi_table.conf │ │ ├── table_filter_multi_table_with_exclude_mode.conf │ │ └── table_merge_multi_table.conf │ ├── seatunnel-transforms-v2-e2e-udf/ │ │ ├── pom.xml │ │ └── src/ │ │ └── test/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── e2e/ │ │ │ └── transform/ │ │ │ └── udf/ │ │ │ └── ExampleUdfIT.java │ │ └── resources/ │ │ ├── custom_udf.conf │ │ └── custom_udf_context_lifecycle.conf │ └── seatunnel-transforms-v2-udf/ │ ├── pom.xml │ └── src/ │ └── main/ │ └── java/ │ └── org/ │ └── apache/ │ └── seatunnel/ │ └── e2e/ │ └── transform/ │ └── udf/ │ ├── EncryptUDF.java │ └── ExampleUdf.java ├── seatunnel-engine/ │ ├── README.md │ ├── pom.xml │ ├── seatunnel-engine-client/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── engine/ │ │ │ └── client/ │ │ │ ├── SeaTunnelClient.java │ │ │ ├── SeaTunnelClientInstance.java │ │ │ ├── SeaTunnelHazelcastClient.java │ │ │ ├── job/ │ │ │ │ ├── ClientJobExecutionEnvironment.java │ │ │ │ ├── ClientJobProxy.java │ │ │ │ ├── ConnectorPackageClient.java │ │ │ │ ├── JobClient.java │ │ │ │ ├── JobMetricsRunner.java │ │ │ │ └── JobStatusRunner.java │ │ │ └── util/ │ │ │ └── ContentFormatUtil.java │ │ └── test/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── engine/ │ │ │ └── client/ │ │ │ ├── ConnectorPackageClientTest.java │ │ │ ├── ContentFormatUtilTest.java │ │ │ ├── JobClientTest.java │ │ │ ├── LogicalDagGeneratorTest.java │ │ │ ├── 
MultipleTableJobConfigParserTest.java │ │ │ ├── SeaTunnelClientTest.java │ │ │ └── SeaTunnelEngineClusterRoleTest.java │ │ └── resources/ │ │ ├── batch_fake_multi_table_to_console.conf │ │ ├── batch_fake_to_console.conf │ │ ├── batch_fake_to_console_multi_table.conf │ │ ├── batch_fake_to_console_with_duplicated_transform.conf │ │ ├── batch_fake_to_console_with_error_env_option.conf │ │ ├── batch_fakesource_to_file.conf │ │ ├── batch_fakesource_to_file_complex.conf │ │ ├── batch_fakesource_to_two_file.conf │ │ ├── client_test.conf │ │ ├── client_test_with_jars.conf │ │ ├── custmoize-client.yaml │ │ ├── hazelcast-client.yaml │ │ ├── hazelcast.yaml │ │ ├── log4j2-test.properties │ │ ├── seatunnel.yaml │ │ └── streaming_fake_to_console.conf │ ├── seatunnel-engine-common/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── engine/ │ │ │ │ └── common/ │ │ │ │ ├── Constant.java │ │ │ │ ├── config/ │ │ │ │ │ ├── ConfigProvider.java │ │ │ │ │ ├── EngineConfig.java │ │ │ │ │ ├── JobConfig.java │ │ │ │ │ ├── SeaTunnelClientConfig.java │ │ │ │ │ ├── SeaTunnelConfig.java │ │ │ │ │ ├── SeaTunnelConfigSections.java │ │ │ │ │ ├── SeaTunnelProperties.java │ │ │ │ │ ├── YamlSeaTunnelConfigBuilder.java │ │ │ │ │ ├── YamlSeaTunnelConfigLocator.java │ │ │ │ │ ├── YamlSeaTunnelDomConfigProcessor.java │ │ │ │ │ └── server/ │ │ │ │ │ ├── AllocateStrategy.java │ │ │ │ │ ├── CheckpointConfig.java │ │ │ │ │ ├── CheckpointStorageConfig.java │ │ │ │ │ ├── ConnectorJarHAStorageConfig.java │ │ │ │ │ ├── ConnectorJarStorageConfig.java │ │ │ │ │ ├── ConnectorJarStorageMode.java │ │ │ │ │ ├── CoordinatorServiceConfig.java │ │ │ │ │ ├── HttpConfig.java │ │ │ │ │ ├── QueueType.java │ │ │ │ │ ├── ScheduleStrategy.java │ │ │ │ │ ├── ServerConfigOptions.java │ │ │ │ │ ├── SlotServiceConfig.java │ │ │ │ │ ├── TelemetryConfig.java │ │ │ │ │ ├── TelemetryLogsConfig.java │ │ │ │ │ ├── TelemetryMetricConfig.java │ │ │ │ │ └── 
ThreadShareMode.java │ │ │ │ ├── env/ │ │ │ │ │ ├── EnvironmentUtil.java │ │ │ │ │ └── Version.java │ │ │ │ ├── exception/ │ │ │ │ │ ├── ClassLoaderErrorCode.java │ │ │ │ │ ├── ClassLoaderException.java │ │ │ │ │ ├── JobCanceledException.java │ │ │ │ │ ├── JobDefineCheckException.java │ │ │ │ │ ├── JobException.java │ │ │ │ │ ├── JobFailedException.java │ │ │ │ │ ├── JobNoEnoughResourceException.java │ │ │ │ │ ├── JobNotFoundException.java │ │ │ │ │ ├── SavePointFailedException.java │ │ │ │ │ ├── SchedulerNotAllowException.java │ │ │ │ │ ├── SeaTunnelEngineException.java │ │ │ │ │ ├── SeaTunnelEngineRetryableException.java │ │ │ │ │ └── TaskGroupDeployException.java │ │ │ │ ├── job/ │ │ │ │ │ ├── JobResult.java │ │ │ │ │ ├── JobStateEvent.java │ │ │ │ │ ├── JobStatus.java │ │ │ │ │ └── JobStatusData.java │ │ │ │ ├── loader/ │ │ │ │ │ ├── SeaTunnelBaseClassLoader.java │ │ │ │ │ ├── SeaTunnelChildFirstClassLoader.java │ │ │ │ │ └── SeaTunnelParentFirstClassLoader.java │ │ │ │ ├── runtime/ │ │ │ │ │ ├── DeployType.java │ │ │ │ │ └── ExecutionMode.java │ │ │ │ ├── serializeable/ │ │ │ │ │ ├── ConfigDataSerializerHook.java │ │ │ │ │ └── SeaTunnelFactoryIdConstant.java │ │ │ │ └── utils/ │ │ │ │ ├── ExceptionUtil.java │ │ │ │ ├── FactoryUtil.java │ │ │ │ ├── IdGenerator.java │ │ │ │ ├── LogUtil.java │ │ │ │ ├── MDUtil.java │ │ │ │ ├── PassiveCompletableFuture.java │ │ │ │ └── concurrent/ │ │ │ │ └── CompletableFuture.java │ │ │ ├── resources/ │ │ │ │ ├── META-INF/ │ │ │ │ │ └── services/ │ │ │ │ │ └── com.hazelcast.DataSerializerHook │ │ │ │ ├── hazelcast-client.yaml │ │ │ │ ├── hazelcast.yaml │ │ │ │ ├── jvm_options │ │ │ │ └── seatunnel.yaml │ │ │ └── resources-filtered/ │ │ │ └── zeta.version.properties │ │ └── test/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── engine/ │ │ │ └── common/ │ │ │ ├── config/ │ │ │ │ ├── EnvironmentUtilTest.java │ │ │ │ └── YamlSeaTunnelConfigParserTest.java │ │ │ └── utils/ │ │ │ ├── ExceptionUtilTest.java 
│ │ │ └── concurrent/ │ │ │ └── CompletableFutureTest.java │ │ └── resources/ │ │ ├── customize-client.yaml │ │ ├── customize-seatunnel.yaml │ │ ├── hazelcast-client.yaml │ │ ├── hazelcast.yaml │ │ ├── seatunnel-https.yaml │ │ └── seatunnel.yaml │ ├── seatunnel-engine-core/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── engine/ │ │ │ │ └── core/ │ │ │ │ ├── checkpoint/ │ │ │ │ │ ├── Checkpoint.java │ │ │ │ │ ├── CheckpointCounts.java │ │ │ │ │ ├── CheckpointHistoryEntry.java │ │ │ │ │ ├── CheckpointIDCounter.java │ │ │ │ │ ├── CheckpointInfo.java │ │ │ │ │ ├── CheckpointOverview.java │ │ │ │ │ ├── CheckpointStatus.java │ │ │ │ │ ├── CheckpointType.java │ │ │ │ │ ├── InProgressCheckpoint.java │ │ │ │ │ ├── InternalCheckpointListener.java │ │ │ │ │ └── PipelineCheckpointOverview.java │ │ │ │ ├── classloader/ │ │ │ │ │ ├── ClassLoaderService.java │ │ │ │ │ └── DefaultClassLoaderService.java │ │ │ │ ├── dag/ │ │ │ │ │ ├── actions/ │ │ │ │ │ │ ├── AbstractAction.java │ │ │ │ │ │ ├── Action.java │ │ │ │ │ │ ├── ActionUtils.java │ │ │ │ │ │ ├── Config.java │ │ │ │ │ │ ├── SinkAction.java │ │ │ │ │ │ ├── SinkConfig.java │ │ │ │ │ │ ├── SourceAction.java │ │ │ │ │ │ ├── TransformAction.java │ │ │ │ │ │ ├── TransformChainAction.java │ │ │ │ │ │ └── UnknownActionException.java │ │ │ │ │ ├── internal/ │ │ │ │ │ │ └── IntermediateQueue.java │ │ │ │ │ └── logical/ │ │ │ │ │ ├── LogicalDag.java │ │ │ │ │ ├── LogicalDagGenerator.java │ │ │ │ │ ├── LogicalEdge.java │ │ │ │ │ └── LogicalVertex.java │ │ │ │ ├── job/ │ │ │ │ │ ├── AbstractJobEnvironment.java │ │ │ │ │ ├── CommonPluginJar.java │ │ │ │ │ ├── ConnectorJar.java │ │ │ │ │ ├── ConnectorJarIdentifier.java │ │ │ │ │ ├── ConnectorJarType.java │ │ │ │ │ ├── ConnectorPluginJar.java │ │ │ │ │ ├── Edge.java │ │ │ │ │ ├── ExecutionAddress.java │ │ │ │ │ ├── Job.java │ │ │ │ │ ├── JobDAGInfo.java │ │ │ │ │ ├── JobImmutableInformation.java │ │ │ │ │ 
├── JobInfo.java │ │ │ │ │ ├── JobPipelineCheckpointData.java │ │ │ │ │ ├── PipelineExecutionState.java │ │ │ │ │ ├── PipelineStatus.java │ │ │ │ │ ├── RefCount.java │ │ │ │ │ ├── StatusUpdate.java │ │ │ │ │ └── VertexInfo.java │ │ │ │ ├── parse/ │ │ │ │ │ ├── ConfigParserUtil.java │ │ │ │ │ ├── JobConfigParser.java │ │ │ │ │ └── MultipleTableJobConfigParser.java │ │ │ │ ├── protocol/ │ │ │ │ │ └── codec/ │ │ │ │ │ ├── SeaTunnelCancelJobCodec.java │ │ │ │ │ ├── SeaTunnelGetCheckpointHistoryCodec.java │ │ │ │ │ ├── SeaTunnelGetCheckpointOverviewCodec.java │ │ │ │ │ ├── SeaTunnelGetClusterHealthMetricsCodec.java │ │ │ │ │ ├── SeaTunnelGetJobCheckpointCodec.java │ │ │ │ │ ├── SeaTunnelGetJobDetailStatusCodec.java │ │ │ │ │ ├── SeaTunnelGetJobInfoCodec.java │ │ │ │ │ ├── SeaTunnelGetJobMetricsCodec.java │ │ │ │ │ ├── SeaTunnelGetJobStatusCodec.java │ │ │ │ │ ├── SeaTunnelGetRunningJobMetricsCodec.java │ │ │ │ │ ├── SeaTunnelListJobStatusCodec.java │ │ │ │ │ ├── SeaTunnelPrintMessageCodec.java │ │ │ │ │ ├── SeaTunnelSavePointJobCodec.java │ │ │ │ │ ├── SeaTunnelSubmitJobCodec.java │ │ │ │ │ ├── SeaTunnelUploadConnectorJarCodec.java │ │ │ │ │ └── SeaTunnelWaitForJobCompleteCodec.java │ │ │ │ └── serializable/ │ │ │ │ └── JobDataSerializerHook.java │ │ │ └── resources/ │ │ │ ├── META-INF/ │ │ │ │ └── services/ │ │ │ │ └── com.hazelcast.DataSerializerHook │ │ │ ├── client-protocol-definition/ │ │ │ │ └── SeaTunnelEngine.yaml │ │ │ └── generate_client_protocol.sh │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── engine/ │ │ └── core/ │ │ └── classloader/ │ │ ├── AbstractClassLoaderServiceTest.java │ │ ├── ClassLoaderServiceCacheModeTest.java │ │ └── ClassLoaderServiceTest.java │ ├── seatunnel-engine-serializer/ │ │ ├── pom.xml │ │ ├── serializer-api/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── engine/ │ │ │ └── serializer/ │ │ │ └── api/ │ │ │ └── 
Serializer.java │ │ └── serializer-protobuf/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── engine/ │ │ │ └── serializer/ │ │ │ └── protobuf/ │ │ │ └── ProtoStuffSerializer.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── engine/ │ │ └── serializer/ │ │ └── protobuf/ │ │ └── ProtoStuffSerializerTest.java │ ├── seatunnel-engine-server/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── engine/ │ │ │ │ └── server/ │ │ │ │ ├── CheckpointService.java │ │ │ │ ├── CoordinatorService.java │ │ │ │ ├── EventService.java │ │ │ │ ├── JettyService.java │ │ │ │ ├── NodeExtension.java │ │ │ │ ├── NodeExtensionCommon.java │ │ │ │ ├── SeaTunnelHealthMonitor.java │ │ │ │ ├── SeaTunnelNodeContext.java │ │ │ │ ├── SeaTunnelServer.java │ │ │ │ ├── SeaTunnelServerStarter.java │ │ │ │ ├── TaskExecutionService.java │ │ │ │ ├── checkpoint/ │ │ │ │ │ ├── ActionState.java │ │ │ │ │ ├── ActionStateKey.java │ │ │ │ │ ├── ActionSubtaskState.java │ │ │ │ │ ├── CheckpointBarrier.java │ │ │ │ │ ├── CheckpointCloseReason.java │ │ │ │ │ ├── CheckpointCoordinator.java │ │ │ │ │ ├── CheckpointCoordinatorState.java │ │ │ │ │ ├── CheckpointCoordinatorStatus.java │ │ │ │ │ ├── CheckpointException.java │ │ │ │ │ ├── CheckpointManager.java │ │ │ │ │ ├── CheckpointPlan.java │ │ │ │ │ ├── CompletedCheckpoint.java │ │ │ │ │ ├── IMapCheckpointIDCounter.java │ │ │ │ │ ├── PendingCheckpoint.java │ │ │ │ │ ├── StandaloneCheckpointIDCounter.java │ │ │ │ │ ├── Stateful.java │ │ │ │ │ ├── SubtaskStatistics.java │ │ │ │ │ ├── SubtaskStatus.java │ │ │ │ │ ├── TaskStatistics.java │ │ │ │ │ ├── monitor/ │ │ │ │ │ │ └── CheckpointMonitorService.java │ │ │ │ │ └── operation/ │ │ │ │ │ ├── CheckpointBarrierTriggerOperation.java │ │ │ │ │ ├── CheckpointEndOperation.java │ │ │ │ │ ├── CheckpointErrorReportOperation.java │ │ │ │ │ 
├── CheckpointFinishedOperation.java │ │ │ │ │ ├── NotifyTaskRestoreOperation.java │ │ │ │ │ ├── NotifyTaskStartOperation.java │ │ │ │ │ ├── TaskAcknowledgeOperation.java │ │ │ │ │ ├── TaskReportStatusOperation.java │ │ │ │ │ ├── TriggerSchemaChangeAfterCheckpointOperation.java │ │ │ │ │ └── TriggerSchemaChangeBeforeCheckpointOperation.java │ │ │ │ ├── dag/ │ │ │ │ │ ├── DAGUtils.java │ │ │ │ │ ├── execution/ │ │ │ │ │ │ ├── ExecutionEdge.java │ │ │ │ │ │ ├── ExecutionPlan.java │ │ │ │ │ │ ├── ExecutionPlanGenerator.java │ │ │ │ │ │ ├── ExecutionVertex.java │ │ │ │ │ │ ├── Pipeline.java │ │ │ │ │ │ └── PipelineGenerator.java │ │ │ │ │ └── physical/ │ │ │ │ │ ├── PhysicalPlan.java │ │ │ │ │ ├── PhysicalPlanGenerator.java │ │ │ │ │ ├── PhysicalVertex.java │ │ │ │ │ ├── PipelineLocation.java │ │ │ │ │ ├── PlanUtils.java │ │ │ │ │ ├── ResourceUtils.java │ │ │ │ │ ├── SubPlan.java │ │ │ │ │ ├── UnknownPhysicalPlanException.java │ │ │ │ │ ├── config/ │ │ │ │ │ │ ├── FlowConfig.java │ │ │ │ │ │ ├── IntermediateQueueConfig.java │ │ │ │ │ │ ├── SinkConfig.java │ │ │ │ │ │ └── SourceConfig.java │ │ │ │ │ └── flow/ │ │ │ │ │ ├── Flow.java │ │ │ │ │ ├── IntermediateExecutionFlow.java │ │ │ │ │ ├── PhysicalExecutionFlow.java │ │ │ │ │ └── UnknownFlowException.java │ │ │ │ ├── diagnostic/ │ │ │ │ │ ├── PendingClusterSnapshot.java │ │ │ │ │ ├── PendingDiagnosticsCollector.java │ │ │ │ │ ├── PendingJobDiagnostic.java │ │ │ │ │ ├── PendingJobsResponse.java │ │ │ │ │ ├── PendingPipelineDiagnostic.java │ │ │ │ │ ├── PendingQueueSummary.java │ │ │ │ │ ├── PendingTaskGroupDiagnostic.java │ │ │ │ │ └── WorkerResourceDiagnostic.java │ │ │ │ ├── event/ │ │ │ │ │ ├── JobEventHttpReportHandler.java │ │ │ │ │ ├── JobEventListener.java │ │ │ │ │ ├── JobEventProcessor.java │ │ │ │ │ └── JobEventReportOperation.java │ │ │ │ ├── exception/ │ │ │ │ │ └── TaskGroupContextNotFoundException.java │ │ │ │ ├── execution/ │ │ │ │ │ ├── ExecutionState.java │ │ │ │ │ ├── PendingJobInfo.java │ │ │ │ │ ├── 
PendingSourceState.java │ │ │ │ │ ├── ProgressState.java │ │ │ │ │ ├── Task.java │ │ │ │ │ ├── TaskCallTimer.java │ │ │ │ │ ├── TaskDeployState.java │ │ │ │ │ ├── TaskExecutionContext.java │ │ │ │ │ ├── TaskExecutionState.java │ │ │ │ │ ├── TaskGroup.java │ │ │ │ │ ├── TaskGroupContext.java │ │ │ │ │ ├── TaskGroupDefaultImpl.java │ │ │ │ │ ├── TaskGroupLocation.java │ │ │ │ │ ├── TaskGroupType.java │ │ │ │ │ ├── TaskGroupUtils.java │ │ │ │ │ ├── TaskLocation.java │ │ │ │ │ └── TaskTracker.java │ │ │ │ ├── joiner/ │ │ │ │ │ ├── LiteNodeDropOutDiscoveryJoiner.java │ │ │ │ │ ├── LiteNodeDropOutMulticastJoiner.java │ │ │ │ │ └── LiteNodeDropOutTcpIpJoiner.java │ │ │ │ ├── log/ │ │ │ │ │ ├── FormatType.java │ │ │ │ │ ├── Log4j2HttpGetCommandProcessor.java │ │ │ │ │ └── Log4j2HttpPostCommandProcessor.java │ │ │ │ ├── master/ │ │ │ │ │ ├── JobHistoryService.java │ │ │ │ │ ├── JobMaster.java │ │ │ │ │ └── cleanup/ │ │ │ │ │ └── PipelineCleanupRecord.java │ │ │ │ ├── metrics/ │ │ │ │ │ ├── ConnectorMetricsCalcContext.java │ │ │ │ │ ├── JobMetricsCollector.java │ │ │ │ │ ├── JobMetricsUtil.java │ │ │ │ │ ├── SeaTunnelMetricsContext.java │ │ │ │ │ └── ZetaMetricsCollector.java │ │ │ │ ├── operation/ │ │ │ │ │ ├── AbstractJobAsyncOperation.java │ │ │ │ │ ├── AsyncOperation.java │ │ │ │ │ ├── CancelJobOperation.java │ │ │ │ │ ├── GetCheckpointHistoryOperation.java │ │ │ │ │ ├── GetCheckpointOverviewOperation.java │ │ │ │ │ ├── GetClusterHealthMetricsOperation.java │ │ │ │ │ ├── GetJobCheckpointOperation.java │ │ │ │ │ ├── GetJobDetailStatusOperation.java │ │ │ │ │ ├── GetJobInfoOperation.java │ │ │ │ │ ├── GetJobMetricsOperation.java │ │ │ │ │ ├── GetJobStatusOperation.java │ │ │ │ │ ├── GetRunningJobMetricsOperation.java │ │ │ │ │ ├── ListJobStatusOperation.java │ │ │ │ │ ├── PrintMessageOperation.java │ │ │ │ │ ├── SavePointJobOperation.java │ │ │ │ │ ├── SubmitJobOperation.java │ │ │ │ │ ├── UploadConnectorJarOperation.java │ │ │ │ │ └── WaitForJobCompleteOperation.java │ │ 
│ │ ├── persistence/ │ │ │ │ │ ├── FileMapStore.java │ │ │ │ │ └── FileMapStoreFactory.java │ │ │ │ ├── protocol/ │ │ │ │ │ └── task/ │ │ │ │ │ ├── AbstractSeaTunnelMessageTask.java │ │ │ │ │ ├── CancelJobTask.java │ │ │ │ │ ├── GetCheckpointHistoryTask.java │ │ │ │ │ ├── GetCheckpointOverviewTask.java │ │ │ │ │ ├── GetClusterHealthMetricsTask.java │ │ │ │ │ ├── GetJobCheckpointTask.java │ │ │ │ │ ├── GetJobDetailStatusTask.java │ │ │ │ │ ├── GetJobInfoTask.java │ │ │ │ │ ├── GetJobMetricsTask.java │ │ │ │ │ ├── GetJobStatusTask.java │ │ │ │ │ ├── GetRunningJobMetricsTask.java │ │ │ │ │ ├── ListJobStatusTask.java │ │ │ │ │ ├── PrintMessageTask.java │ │ │ │ │ ├── SavePointJobTask.java │ │ │ │ │ ├── SeaTunnelMessageTaskFactoryProvider.java │ │ │ │ │ ├── SubmitJobTask.java │ │ │ │ │ ├── UploadConnectorJarTask.java │ │ │ │ │ └── WaitForJobCompleteTask.java │ │ │ │ ├── resourcemanager/ │ │ │ │ │ ├── AbstractResourceManager.java │ │ │ │ │ ├── NoEnoughResourceException.java │ │ │ │ │ ├── ResourceManager.java │ │ │ │ │ ├── ResourceManagerFactory.java │ │ │ │ │ ├── ResourceRequestHandler.java │ │ │ │ │ ├── StandaloneResourceManager.java │ │ │ │ │ ├── UnsupportedDeployTypeException.java │ │ │ │ │ ├── allocation/ │ │ │ │ │ │ └── strategy/ │ │ │ │ │ │ ├── RandomStrategy.java │ │ │ │ │ │ ├── SlotAllocationStrategy.java │ │ │ │ │ │ ├── SlotRatioStrategy.java │ │ │ │ │ │ └── SystemLoadStrategy.java │ │ │ │ │ ├── opeartion/ │ │ │ │ │ │ ├── GetOverviewOperation.java │ │ │ │ │ │ ├── GetPendingJobsOperation.java │ │ │ │ │ │ ├── ReleaseSlotOperation.java │ │ │ │ │ │ ├── RequestSlotOperation.java │ │ │ │ │ │ ├── ResetResourceOperation.java │ │ │ │ │ │ ├── SyncWorkerProfileOperation.java │ │ │ │ │ │ └── WorkerHeartbeatOperation.java │ │ │ │ │ ├── resource/ │ │ │ │ │ │ ├── CPU.java │ │ │ │ │ │ ├── Memory.java │ │ │ │ │ │ ├── OverviewInfo.java │ │ │ │ │ │ ├── Resource.java │ │ │ │ │ │ ├── ResourceProfile.java │ │ │ │ │ │ ├── SlotAssignedProfile.java │ │ │ │ │ │ ├── SlotProfile.java │ │ │ 
│ │ │ └── SystemLoadInfo.java │ │ │ │ │ ├── thirdparty/ │ │ │ │ │ │ ├── CreateWorkerResult.java │ │ │ │ │ │ ├── ThirdPartyResourceManager.java │ │ │ │ │ │ ├── kubernetes/ │ │ │ │ │ │ │ └── KubernetesResourceManager.java │ │ │ │ │ │ └── yarn/ │ │ │ │ │ │ └── YarnResourceManager.java │ │ │ │ │ └── worker/ │ │ │ │ │ └── WorkerProfile.java │ │ │ │ ├── rest/ │ │ │ │ │ ├── ConfigFormat.java │ │ │ │ │ ├── ErrResponse.java │ │ │ │ │ ├── RestConstant.java │ │ │ │ │ ├── RestHttpGetCommandProcessor.java │ │ │ │ │ ├── RestHttpPostCommandProcessor.java │ │ │ │ │ ├── RestJobExecutionEnvironment.java │ │ │ │ │ ├── filter/ │ │ │ │ │ │ ├── BasicAuthFilter.java │ │ │ │ │ │ └── ExceptionHandlingFilter.java │ │ │ │ │ ├── service/ │ │ │ │ │ │ ├── BaseLogService.java │ │ │ │ │ │ ├── BaseService.java │ │ │ │ │ │ ├── CheckpointMonitorRestService.java │ │ │ │ │ │ ├── EncryptConfigService.java │ │ │ │ │ │ ├── JobInfoService.java │ │ │ │ │ │ ├── LogService.java │ │ │ │ │ │ ├── OverviewService.java │ │ │ │ │ │ ├── PendingJobsService.java │ │ │ │ │ │ ├── RunningThreadService.java │ │ │ │ │ │ ├── SystemMonitoringService.java │ │ │ │ │ │ ├── ThreadDumpService.java │ │ │ │ │ │ └── UpdateTagsService.java │ │ │ │ │ └── servlet/ │ │ │ │ │ ├── AllLogNameServlet.java │ │ │ │ │ ├── AllNodeLogServlet.java │ │ │ │ │ ├── BaseServlet.java │ │ │ │ │ ├── CheckpointHistoryServlet.java │ │ │ │ │ ├── CheckpointOverviewServlet.java │ │ │ │ │ ├── CurrentNodeLogServlet.java │ │ │ │ │ ├── EncryptConfigServlet.java │ │ │ │ │ ├── FinishedJobsServlet.java │ │ │ │ │ ├── JobInfoServlet.java │ │ │ │ │ ├── LogBaseServlet.java │ │ │ │ │ ├── MetricsServlet.java │ │ │ │ │ ├── OverviewServlet.java │ │ │ │ │ ├── PageBaseServlet.java │ │ │ │ │ ├── PendingJobsServlet.java │ │ │ │ │ ├── RunningJobsServlet.java │ │ │ │ │ ├── RunningThreadsServlet.java │ │ │ │ │ ├── StopJobServlet.java │ │ │ │ │ ├── StopJobsServlet.java │ │ │ │ │ ├── SubmitJobByUploadFileServlet.java │ │ │ │ │ ├── SubmitJobServlet.java │ │ │ │ │ ├── 
SubmitJobsServlet.java │ │ │ │ │ ├── SystemMonitoringServlet.java │ │ │ │ │ ├── ThreadDumpServlet.java │ │ │ │ │ └── UpdateTagsServlet.java │ │ │ │ ├── serializable/ │ │ │ │ │ ├── CheckpointDataSerializerHook.java │ │ │ │ │ ├── ClientToServerOperationDataSerializerHook.java │ │ │ │ │ ├── RecordSerializer.java │ │ │ │ │ ├── RecordSerializerHook.java │ │ │ │ │ ├── ResourceDataSerializerHook.java │ │ │ │ │ ├── TaskDataSerializerHook.java │ │ │ │ │ └── TypeId.java │ │ │ │ ├── service/ │ │ │ │ │ ├── jar/ │ │ │ │ │ │ ├── AbstractConnectorJarStorageStrategy.java │ │ │ │ │ │ ├── ConnectorJarStorageStrategy.java │ │ │ │ │ │ ├── ConnectorPackageService.java │ │ │ │ │ │ ├── IsolatedConnectorJarStorageStrategy.java │ │ │ │ │ │ ├── ServerConnectorPackageClient.java │ │ │ │ │ │ ├── SharedConnectorJarCleanupTask.java │ │ │ │ │ │ ├── SharedConnectorJarStorageStrategy.java │ │ │ │ │ │ └── StorageStrategyFactory.java │ │ │ │ │ └── slot/ │ │ │ │ │ ├── DefaultSlotService.java │ │ │ │ │ ├── SlotAndWorkerProfile.java │ │ │ │ │ ├── SlotContext.java │ │ │ │ │ ├── SlotService.java │ │ │ │ │ └── WrongTargetSlotException.java │ │ │ │ ├── task/ │ │ │ │ │ ├── AbstractTask.java │ │ │ │ │ ├── CoordinatorTask.java │ │ │ │ │ ├── Progress.java │ │ │ │ │ ├── SeaTunnelSourceCollector.java │ │ │ │ │ ├── SeaTunnelTask.java │ │ │ │ │ ├── SeaTunnelTransformCollector.java │ │ │ │ │ ├── SinkAggregatedCommitterTask.java │ │ │ │ │ ├── SourceSeaTunnelTask.java │ │ │ │ │ ├── SourceSplitEnumeratorTask.java │ │ │ │ │ ├── TaskGroupImmutableInformation.java │ │ │ │ │ ├── TaskRuntimeException.java │ │ │ │ │ ├── TransformSeaTunnelTask.java │ │ │ │ │ ├── context/ │ │ │ │ │ │ ├── SeaTunnelSplitEnumeratorContext.java │ │ │ │ │ │ ├── SinkWriterContext.java │ │ │ │ │ │ └── SourceReaderContext.java │ │ │ │ │ ├── flow/ │ │ │ │ │ │ ├── AbstractFlowLifeCycle.java │ │ │ │ │ │ ├── ActionFlowLifeCycle.java │ │ │ │ │ │ ├── FlowLifeCycle.java │ │ │ │ │ │ ├── IntermediateQueueFlowLifeCycle.java │ │ │ │ │ │ ├── 
OneInputFlowLifeCycle.java │ │ │ │ │ │ ├── OneOutputFlowLifeCycle.java │ │ │ │ │ │ ├── SinkFlowLifeCycle.java │ │ │ │ │ │ ├── SourceFlowLifeCycle.java │ │ │ │ │ │ └── TransformFlowLifeCycle.java │ │ │ │ │ ├── group/ │ │ │ │ │ │ ├── AbstractTaskGroupWithIntermediateQueue.java │ │ │ │ │ │ ├── TaskGroupWithIntermediateBlockingQueue.java │ │ │ │ │ │ ├── TaskGroupWithIntermediateDisruptor.java │ │ │ │ │ │ └── queue/ │ │ │ │ │ │ ├── AbstractIntermediateQueue.java │ │ │ │ │ │ ├── IntermediateBlockingQueue.java │ │ │ │ │ │ ├── IntermediateDisruptor.java │ │ │ │ │ │ └── disruptor/ │ │ │ │ │ │ ├── RecordEvent.java │ │ │ │ │ │ ├── RecordEventFactory.java │ │ │ │ │ │ ├── RecordEventHandler.java │ │ │ │ │ │ └── RecordEventProducer.java │ │ │ │ │ ├── operation/ │ │ │ │ │ │ ├── CancelTaskOperation.java │ │ │ │ │ │ ├── CheckTaskGroupIsExecutingOperation.java │ │ │ │ │ │ ├── CleanTaskGroupContextOperation.java │ │ │ │ │ │ ├── DeleteConnectorJarInExecutionNode.java │ │ │ │ │ │ ├── DeployTaskOperation.java │ │ │ │ │ │ ├── GetMetricsOperation.java │ │ │ │ │ │ ├── GetTaskGroupAddressOperation.java │ │ │ │ │ │ ├── GetTaskGroupMetricsOperation.java │ │ │ │ │ │ ├── NotifyTaskStatusOperation.java │ │ │ │ │ │ ├── ReportMetricsOperation.java │ │ │ │ │ │ ├── SendConnectorJarToMemberNodeOperation.java │ │ │ │ │ │ ├── TaskOperation.java │ │ │ │ │ │ ├── TracingOperation.java │ │ │ │ │ │ ├── checkpoint/ │ │ │ │ │ │ │ ├── BarrierFlowOperation.java │ │ │ │ │ │ │ └── CloseRequestOperation.java │ │ │ │ │ │ ├── sink/ │ │ │ │ │ │ │ ├── SinkPrepareCommitOperation.java │ │ │ │ │ │ │ └── SinkRegisterOperation.java │ │ │ │ │ │ └── source/ │ │ │ │ │ │ ├── AssignSplitOperation.java │ │ │ │ │ │ ├── CloseIdleReaderOperation.java │ │ │ │ │ │ ├── LastCheckpointNotifyOperation.java │ │ │ │ │ │ ├── RequestSplitOperation.java │ │ │ │ │ │ ├── RestoredSplitOperation.java │ │ │ │ │ │ ├── SourceEventOperation.java │ │ │ │ │ │ ├── SourceNoMoreElementOperation.java │ │ │ │ │ │ ├── SourceReaderEventOperation.java │ │ │ │ 
│ │ └── SourceRegisterOperation.java │ │ │ │ │ ├── record/ │ │ │ │ │ │ └── Barrier.java │ │ │ │ │ └── statemachine/ │ │ │ │ │ └── SeaTunnelTaskState.java │ │ │ │ ├── telemetry/ │ │ │ │ │ ├── log/ │ │ │ │ │ │ ├── TaskLogManagerService.java │ │ │ │ │ │ └── operation/ │ │ │ │ │ │ └── CleanLogOperation.java │ │ │ │ │ └── metrics/ │ │ │ │ │ ├── AbstractCollector.java │ │ │ │ │ ├── ExportsInstanceInitializer.java │ │ │ │ │ ├── entity/ │ │ │ │ │ │ ├── JobCounter.java │ │ │ │ │ │ └── ThreadPoolStatus.java │ │ │ │ │ └── exports/ │ │ │ │ │ ├── ClusterMetricExports.java │ │ │ │ │ ├── JobMetricExports.java │ │ │ │ │ ├── JobThreadPoolStatusExports.java │ │ │ │ │ └── NodeMetricExports.java │ │ │ │ └── utils/ │ │ │ │ ├── NodeEngineUtil.java │ │ │ │ ├── PeekBlockingQueue.java │ │ │ │ ├── RestUtil.java │ │ │ │ └── SystemLoadCalculate.java │ │ │ └── resources/ │ │ │ ├── META-INF/ │ │ │ │ └── services/ │ │ │ │ ├── com.hazelcast.DataSerializerHook │ │ │ │ ├── com.hazelcast.SerializerHook │ │ │ │ └── com.hazelcast.client.impl.protocol.MessageTaskFactoryProvider │ │ │ └── hazelcast-client.yaml │ │ └── test/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── engine/ │ │ │ └── server/ │ │ │ ├── AbstractSeaTunnelServerTest.java │ │ │ ├── ConnectorPackageServiceTest.java │ │ │ ├── CoordinatorServicePipelineCleanupTest.java │ │ │ ├── CoordinatorServiceTest.java │ │ │ ├── CoordinatorServiceWithCancelPendingJobTest.java │ │ │ ├── TaskExecutionServiceTest.java │ │ │ ├── TestUtils.java │ │ │ ├── checkpoint/ │ │ │ │ ├── CheckpointCoordinatorTest.java │ │ │ │ ├── CheckpointErrorRestoreEndTest.java │ │ │ │ ├── CheckpointManagerTest.java │ │ │ │ ├── CheckpointPlanTest.java │ │ │ │ ├── CheckpointSerializeTest.java │ │ │ │ ├── CheckpointStorageTest.java │ │ │ │ ├── CheckpointTimeOutTest.java │ │ │ │ ├── SavePointTest.java │ │ │ │ └── StorageTest.java │ │ │ ├── dag/ │ │ │ │ └── TaskTest.java │ │ │ ├── diagnostic/ │ │ │ │ └── PendingDiagnosticsCollectorTest.java │ │ │ ├── 
event/ │ │ │ │ ├── JobEventHttpReportHandlerTest.java │ │ │ │ └── JobStateEventTest.java │ │ │ ├── execution/ │ │ │ │ ├── BlockTask.java │ │ │ │ ├── ExceptionTestTask.java │ │ │ │ ├── FixedCallTestTimeTask.java │ │ │ │ ├── StopTimeTestTask.java │ │ │ │ └── TestTask.java │ │ │ ├── master/ │ │ │ │ ├── JobHistoryServiceTest.java │ │ │ │ ├── JobMasterTest.java │ │ │ │ ├── JobMetricsTest.java │ │ │ │ └── cleanup/ │ │ │ │ ├── PipelineCleanupRecordHazelcastSerializationTest.java │ │ │ │ └── PipelineCleanupRecordTest.java │ │ │ ├── metrics/ │ │ │ │ ├── ConnectorMetricsCalcContextTest.java │ │ │ │ └── MetricsApiTest.java │ │ │ ├── operation/ │ │ │ │ ├── ReturnRetryTimesOperation.java │ │ │ │ └── TestSerializerHook.java │ │ │ ├── resourcemanager/ │ │ │ │ ├── FakeResourceManager.java │ │ │ │ ├── FakeResourceManagerForRequestSlotRetryTest.java │ │ │ │ ├── FixSlotResourceTest.java │ │ │ │ ├── ResourceManagerTest.java │ │ │ │ └── WorkerTagTest.java │ │ │ ├── rest/ │ │ │ │ ├── BaseServletTest.java │ │ │ │ ├── RestApiHttpBasicTest.java │ │ │ │ ├── RestApiHttpsForTruststoreTest.java │ │ │ │ ├── RestApiHttpsTest.java │ │ │ │ ├── RestApiRequestCallback.java │ │ │ │ ├── RestApiSubmitJobStartWithSavePointTest.java │ │ │ │ ├── SSLUtils.java │ │ │ │ └── service/ │ │ │ │ ├── BaseServiceNullSafetyTest.java │ │ │ │ └── BaseServiceTableMetricsTest.java │ │ │ ├── task/ │ │ │ │ ├── SinkAggregatedCommitterTaskTest.java │ │ │ │ └── SourceSplitEnumeratorTaskTest.java │ │ │ └── utils/ │ │ │ ├── PeekBlockingQueueTest.java │ │ │ └── SystemLoadCalculateTest.java │ │ └── resources/ │ │ ├── batch_fake_to_console.conf │ │ ├── batch_fake_to_console_without_checkpoint_interval.conf │ │ ├── batch_fake_to_inmemory.conf │ │ ├── batch_fakesource_to_file.conf │ │ ├── batch_fakesource_to_file_complex.conf │ │ ├── batch_fakesource_to_file_with_checkpoint.conf │ │ ├── batch_fakesource_to_inmemory_with_commit_error.conf │ │ ├── batch_slot_not_enough.conf │ │ ├── cancel_pending_job.conf │ │ ├── fake_to_console.conf 
│ │ ├── fake_to_console_job_metrics.conf │ │ ├── hazelcast-client.yaml │ │ ├── hazelcast.yaml │ │ ├── https/ │ │ │ ├── client_keystore.jks │ │ │ ├── client_truststore.jks │ │ │ ├── server_keystore.jks │ │ │ └── server_truststore.jks │ │ ├── log4j2-test.properties │ │ ├── mockito-extensions/ │ │ │ └── org.mockito.plugins.MockMaker │ │ ├── seatunnel.yaml │ │ ├── seatunnel_fixed_slots.yaml │ │ ├── seatunnel_multiple_metrics_key.yaml │ │ ├── stream_fake_to_console.conf │ │ ├── stream_fake_to_console_biginterval.conf │ │ ├── stream_fake_to_console_checkpointTimeOut.conf │ │ ├── stream_fake_to_console_with_checkpoint.conf │ │ ├── stream_fake_to_inmemory_with_error.conf │ │ ├── stream_fake_to_inmemory_with_sleep.conf │ │ ├── stream_fakesource_to_file.conf │ │ └── stream_fakesource_to_file_savepoint.conf │ ├── seatunnel-engine-storage/ │ │ ├── checkpoint-storage-api/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── engine/ │ │ │ └── checkpoint/ │ │ │ └── storage/ │ │ │ ├── PipelineState.java │ │ │ ├── api/ │ │ │ │ ├── AbstractCheckpointStorage.java │ │ │ │ ├── CheckpointStorage.java │ │ │ │ └── CheckpointStorageFactory.java │ │ │ ├── common/ │ │ │ │ └── StorageThreadFactory.java │ │ │ ├── constants/ │ │ │ │ └── StorageConstants.java │ │ │ └── exception/ │ │ │ └── CheckpointStorageException.java │ │ ├── checkpoint-storage-plugins/ │ │ │ ├── checkpoint-storage-hdfs/ │ │ │ │ ├── pom.xml │ │ │ │ └── src/ │ │ │ │ ├── main/ │ │ │ │ │ └── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── engine/ │ │ │ │ │ └── checkpoint/ │ │ │ │ │ └── storage/ │ │ │ │ │ └── hdfs/ │ │ │ │ │ ├── HdfsStorage.java │ │ │ │ │ ├── HdfsStorageFactory.java │ │ │ │ │ └── common/ │ │ │ │ │ ├── AbstractConfiguration.java │ │ │ │ │ ├── CosConfiguration.java │ │ │ │ │ ├── FileConfiguration.java │ │ │ │ │ ├── HdfsConfiguration.java │ │ │ │ │ ├── HdfsFileStorageInstance.java │ │ │ │ │ ├── 
LocalConfiguration.java │ │ │ │ │ ├── OssConfiguration.java │ │ │ │ │ └── S3Configuration.java │ │ │ │ └── test/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── engine/ │ │ │ │ └── checkpoint/ │ │ │ │ └── storage/ │ │ │ │ └── hdfs/ │ │ │ │ ├── AbstractFileCheckPointTest.java │ │ │ │ ├── HDFSFileCheckpointTest.java │ │ │ │ ├── LocalFileCheckPointTest.java │ │ │ │ ├── OssFileCheckpointTest.java │ │ │ │ └── S3FileCheckpointTest.java │ │ │ ├── checkpoint-storage-local-file/ │ │ │ │ ├── pom.xml │ │ │ │ └── src/ │ │ │ │ ├── main/ │ │ │ │ │ └── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── engine/ │ │ │ │ │ └── checkpoint/ │ │ │ │ │ └── storage/ │ │ │ │ │ └── localfile/ │ │ │ │ │ ├── LocalFileStorage.java │ │ │ │ │ └── LocalFileStorageFactory.java │ │ │ │ └── test/ │ │ │ │ ├── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── engine/ │ │ │ │ │ └── checkpoint/ │ │ │ │ │ └── storage/ │ │ │ │ │ └── localfile/ │ │ │ │ │ └── LocalFileStorageTest.java │ │ │ │ └── resources/ │ │ │ │ └── log4j2-test.properties │ │ │ └── pom.xml │ │ ├── imap-storage-api/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── engine/ │ │ │ └── imap/ │ │ │ └── storage/ │ │ │ └── api/ │ │ │ ├── IMapStorage.java │ │ │ ├── IMapStorageFactory.java │ │ │ └── exception/ │ │ │ └── IMapStorageException.java │ │ ├── imap-storage-plugins/ │ │ │ ├── imap-storage-file/ │ │ │ │ ├── pom.xml │ │ │ │ └── src/ │ │ │ │ ├── main/ │ │ │ │ │ └── java/ │ │ │ │ │ └── org/ │ │ │ │ │ └── apache/ │ │ │ │ │ └── seatunnel/ │ │ │ │ │ └── engine/ │ │ │ │ │ └── imap/ │ │ │ │ │ └── storage/ │ │ │ │ │ └── file/ │ │ │ │ │ ├── IMapFileStorage.java │ │ │ │ │ ├── IMapFileStorageFactory.java │ │ │ │ │ ├── bean/ │ │ │ │ │ │ ├── IMapData.java │ │ │ │ │ │ └── IMapFileData.java │ │ │ │ │ ├── common/ │ │ │ │ │ │ ├── FileConstants.java │ │ │ │ │ │ ├── 
WALDataUtils.java │ │ │ │ │ │ ├── WALReader.java │ │ │ │ │ │ └── WALWriter.java │ │ │ │ │ ├── config/ │ │ │ │ │ │ ├── AbstractConfiguration.java │ │ │ │ │ │ ├── FileConfiguration.java │ │ │ │ │ │ ├── HdfsConfiguration.java │ │ │ │ │ │ ├── OssConfiguration.java │ │ │ │ │ │ └── S3Configuration.java │ │ │ │ │ ├── disruptor/ │ │ │ │ │ │ ├── FileWALEvent.java │ │ │ │ │ │ ├── WALDisruptor.java │ │ │ │ │ │ ├── WALEventType.java │ │ │ │ │ │ └── WALWorkHandler.java │ │ │ │ │ ├── future/ │ │ │ │ │ │ ├── RequestFuture.java │ │ │ │ │ │ └── RequestFutureCache.java │ │ │ │ │ ├── scheduler/ │ │ │ │ │ │ └── SchedulerTaskInfo.java │ │ │ │ │ └── wal/ │ │ │ │ │ ├── DiscoveryWalFileFactory.java │ │ │ │ │ ├── reader/ │ │ │ │ │ │ ├── DefaultReader.java │ │ │ │ │ │ └── IFileReader.java │ │ │ │ │ └── writer/ │ │ │ │ │ ├── CloudWriter.java │ │ │ │ │ ├── HdfsWriter.java │ │ │ │ │ ├── IFileWriter.java │ │ │ │ │ ├── OssWriter.java │ │ │ │ │ └── S3Writer.java │ │ │ │ └── test/ │ │ │ │ └── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── engine/ │ │ │ │ └── imap/ │ │ │ │ └── storage/ │ │ │ │ └── file/ │ │ │ │ ├── IMapFileOSSStorageTest.java │ │ │ │ ├── IMapFileStorageTest.java │ │ │ │ ├── common/ │ │ │ │ │ └── WALReaderAndWriterTest.java │ │ │ │ └── disruptor/ │ │ │ │ └── WALDisruptorTest.java │ │ │ └── pom.xml │ │ └── pom.xml │ └── seatunnel-engine-ui/ │ ├── .eslintrc.cjs │ ├── .gitignore │ ├── .prettierrc.json │ ├── README.md │ ├── cypress/ │ │ ├── e2e/ │ │ │ ├── example.cy.ts │ │ │ └── tsconfig.json │ │ ├── fixtures/ │ │ │ └── example.json │ │ └── support/ │ │ ├── commands.ts │ │ └── e2e.ts │ ├── cypress.config.ts │ ├── env.d.ts │ ├── index.html │ ├── package.json │ ├── pom.xml │ ├── postcss.config.js │ ├── src/ │ │ ├── App.tsx │ │ ├── assets/ │ │ │ ├── main.scss │ │ │ ├── style.scss │ │ │ └── tailwind.scss │ │ ├── components/ │ │ │ ├── configuration/ │ │ │ │ └── index.tsx │ │ │ ├── directed-acyclic-graph/ │ │ │ │ ├── index.scss │ │ │ │ └── index.tsx │ │ │ └── 
job-log/ │ │ │ └── index.tsx │ │ ├── layouts/ │ │ │ └── main/ │ │ │ ├── header/ │ │ │ │ ├── index.tsx │ │ │ │ ├── info/ │ │ │ │ │ └── index.tsx │ │ │ │ └── logo/ │ │ │ │ └── index.tsx │ │ │ ├── index.tsx │ │ │ └── sidebar/ │ │ │ ├── index.module.scss │ │ │ └── index.tsx │ │ ├── locales/ │ │ │ ├── en_US/ │ │ │ │ ├── common.ts │ │ │ │ ├── detail.ts │ │ │ │ ├── index.ts │ │ │ │ ├── jobs.ts │ │ │ │ ├── managers.ts │ │ │ │ └── menu.ts │ │ │ ├── index.ts │ │ │ └── zh_CN/ │ │ │ ├── common.ts │ │ │ ├── detail.ts │ │ │ ├── index.ts │ │ │ ├── jobs.ts │ │ │ ├── managers.ts │ │ │ └── menu.ts │ │ ├── main.ts │ │ ├── router/ │ │ │ ├── index.ts │ │ │ └── routes.ts │ │ ├── service/ │ │ │ ├── job/ │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── job-log/ │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── manager/ │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── overview/ │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── service.ts │ │ │ └── types.ts │ │ ├── store/ │ │ │ ├── counter.ts │ │ │ └── setting/ │ │ │ ├── index.ts │ │ │ └── types.ts │ │ ├── tests/ │ │ │ ├── jobs.spec.ts │ │ │ ├── managers.spec.ts │ │ │ ├── overview.spec.ts │ │ │ ├── remain-time.spec.ts │ │ │ └── setting.spec.ts │ │ ├── utils/ │ │ │ ├── getTypeFromStatus.ts │ │ │ ├── log.ts │ │ │ └── time.ts │ │ └── views/ │ │ ├── jobs/ │ │ │ ├── detail.scss │ │ │ ├── detail.tsx │ │ │ ├── finished-jobs.tsx │ │ │ ├── index.tsx │ │ │ └── running-jobs.tsx │ │ ├── managers/ │ │ │ └── index.tsx │ │ └── overview/ │ │ ├── baseInfo.tsx │ │ └── index.tsx │ ├── tailwind.config.js │ ├── tsconfig.app.json │ ├── tsconfig.json │ ├── tsconfig.node.json │ ├── tsconfig.vitest.json │ ├── vite.config.ts │ └── vitest.config.ts ├── seatunnel-formats/ │ ├── pom.xml │ ├── seatunnel-format-avro/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── format/ │ │ │ └── avro/ │ │ │ ├── AvroDeserializationSchema.java │ │ │ ├── AvroSerializationSchema.java │ │ │ ├── 
AvroToRowConverter.java │ │ │ ├── RowToAvroConverter.java │ │ │ ├── SeaTunnelRowTypeToAvroSchemaConverter.java │ │ │ └── exception/ │ │ │ ├── AvroFormatErrorCode.java │ │ │ └── SeaTunnelAvroFormatException.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── format/ │ │ └── avro/ │ │ ├── AvroConverterTest.java │ │ └── AvroSerializationSchemaTest.java │ ├── seatunnel-format-compatible-connect-json/ │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── format/ │ │ └── compatible/ │ │ └── kafka/ │ │ └── connect/ │ │ └── json/ │ │ ├── CompatibleKafkaConnectDeserializationSchema.java │ │ ├── KafkaConnectJsonFormatOptions.java │ │ └── NativeKafkaConnectDeserializationSchema.java │ ├── seatunnel-format-compatible-debezium-json/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── format/ │ │ │ └── compatible/ │ │ │ └── debezium/ │ │ │ └── json/ │ │ │ ├── CompatibleDebeziumJsonDeserializationSchema.java │ │ │ ├── CompatibleDebeziumJsonSerializationSchema.java │ │ │ └── DebeziumJsonConverter.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── format/ │ │ └── compatible/ │ │ └── debezium/ │ │ └── json/ │ │ ├── TestCompatibleDebeziumJsonDeserializationSchema.java │ │ ├── TestCompatibleDebeziumJsonSerializationSchema.java │ │ └── TestDebeziumJsonConverter.java │ ├── seatunnel-format-csv/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── format/ │ │ │ └── csv/ │ │ │ ├── CsvDeserializationSchema.java │ │ │ ├── CsvSerializationSchema.java │ │ │ ├── constant/ │ │ │ │ ├── CsvFormatConstant.java │ │ │ │ └── CsvStringQuoteMode.java │ │ │ ├── exception/ │ │ │ │ └── SeaTunnelCsvFormatException.java │ │ │ └── processor/ │ │ │ ├── CsvLineProcessor.java │ │ │ └── DefaultCsvLineProcessor.java │ │ └── test/ │ │ ├── java/ 
│ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── format/ │ │ │ └── csv/ │ │ │ ├── CsvTextFormatSchemaTest.java │ │ │ └── processor/ │ │ │ └── CsvLineProcessorTest.java │ │ └── resources/ │ │ └── testdata.csv │ ├── seatunnel-format-json/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── format/ │ │ │ └── json/ │ │ │ ├── JsonDeserializationSchema.java │ │ │ ├── JsonFormatOptions.java │ │ │ ├── JsonSerializationSchema.java │ │ │ ├── JsonToRowConverters.java │ │ │ ├── RowToJsonConverters.java │ │ │ ├── TimeFormat.java │ │ │ ├── canal/ │ │ │ │ ├── CanalJsonDeserializationSchema.java │ │ │ │ ├── CanalJsonFormatOptions.java │ │ │ │ └── CanalJsonSerializationSchema.java │ │ │ ├── debezium/ │ │ │ │ ├── DebeziumJsonDeserializationSchema.java │ │ │ │ ├── DebeziumJsonDeserializationSchemaDispatcher.java │ │ │ │ ├── DebeziumJsonFormatOptions.java │ │ │ │ ├── DebeziumJsonSerializationSchema.java │ │ │ │ └── DebeziumRowConverter.java │ │ │ ├── exception/ │ │ │ │ └── SeaTunnelJsonFormatException.java │ │ │ ├── maxwell/ │ │ │ │ ├── MaxWellJsonDeserializationSchema.java │ │ │ │ ├── MaxWellJsonFormatOptions.java │ │ │ │ └── MaxWellJsonSerializationSchema.java │ │ │ └── ogg/ │ │ │ ├── OggJsonDeserializationSchema.java │ │ │ ├── OggJsonFormatOptions.java │ │ │ └── OggJsonSerializationSchema.java │ │ └── test/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── format/ │ │ │ └── json/ │ │ │ ├── JsonRowDataSerDeSchemaTest.java │ │ │ ├── canal/ │ │ │ │ └── CanalJsonSerDeSchemaTest.java │ │ │ ├── debezium/ │ │ │ │ ├── DebeziumJsonDeserializationSchemaDispatcherTest.java │ │ │ │ └── DebeziumJsonSerDeSchemaTest.java │ │ │ ├── maxwell/ │ │ │ │ └── MaxWellJsonSerDeSchemaTest.java │ │ │ └── ogg/ │ │ │ └── OggJsonSerDeSchemaTest.java │ │ └── resources/ │ │ ├── canal-data-filter-table.txt │ │ ├── debezium-data.txt │ │ ├── debezium-mysql.txt │ │ ├── debezium-oracle.txt │ │ ├── 
debezium-postgresql.txt │ │ ├── debezium-sqlserver.txt │ │ ├── maxwell-data-filter-table.txt │ │ └── ogg-data-filter-table.txt │ ├── seatunnel-format-protobuf/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── format/ │ │ │ └── protobuf/ │ │ │ ├── CompileDescriptor.java │ │ │ ├── ProtobufDeserializationSchema.java │ │ │ ├── ProtobufSerializationSchema.java │ │ │ ├── ProtobufToRowConverter.java │ │ │ ├── RowToProtobufConverter.java │ │ │ ├── SchemaRegistryAwareProtobufDeserializationSchema.java │ │ │ └── exception/ │ │ │ ├── ProtobufFormatErrorCode.java │ │ │ └── SeaTunnelProtobufFormatException.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── format/ │ │ └── protobuf/ │ │ ├── ProtobufConverterTest.java │ │ └── SchemaRegistryAwareProtobufDeserializationSchemaTest.java │ └── seatunnel-format-text/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── format/ │ │ └── text/ │ │ ├── TextDeserializationSchema.java │ │ ├── TextSerializationSchema.java │ │ ├── constant/ │ │ │ └── TextFormatConstant.java │ │ ├── exception/ │ │ │ └── SeaTunnelTextFormatException.java │ │ └── splitor/ │ │ ├── CsvLineSplitor.java │ │ ├── DefaultTextLineSplitor.java │ │ └── TextLineSplitor.java │ └── test/ │ └── java/ │ └── org/ │ └── apache/ │ └── seatunnel/ │ └── format/ │ └── text/ │ ├── CsvTextFormatSchemaTest.java │ └── TextFormatSchemaTest.java ├── seatunnel-plugin-discovery/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── plugin/ │ │ └── discovery/ │ │ ├── AbstractPluginDiscovery.java │ │ ├── PluginDiscovery.java │ │ └── seatunnel/ │ │ ├── SeaTunnelFactoryDiscovery.java │ │ ├── SeaTunnelSinkPluginDiscovery.java │ │ ├── SeaTunnelSourcePluginDiscovery.java │ │ └── SeaTunnelTransformPluginDiscovery.java │ └── test/ │ ├── java/ │ │ └── org/ │ │ └── apache/ │ │ └── 
seatunnel/ │ │ └── plugin/ │ │ └── discovery/ │ │ ├── AbstractPluginDiscoveryTest.java │ │ └── seatunnel/ │ │ └── SeaTunnelSourcePluginDiscoveryTest.java │ └── resources/ │ ├── duplicate/ │ │ └── connectors/ │ │ └── plugin-mapping.properties │ └── home/ │ └── connectors/ │ └── plugin-mapping.properties ├── seatunnel-shade/ │ ├── pom.xml │ ├── seatunnel-arrow/ │ │ └── pom.xml │ ├── seatunnel-commons-lang3/ │ │ └── pom.xml │ ├── seatunnel-guava/ │ │ └── pom.xml │ ├── seatunnel-hadoop-aws/ │ │ └── pom.xml │ ├── seatunnel-hadoop3-3.1.4-uber/ │ │ └── pom.xml │ ├── seatunnel-hazelcast/ │ │ ├── pom.xml │ │ ├── seatunnel-hazelcast-base/ │ │ │ └── pom.xml │ │ └── seatunnel-hazelcast-shade/ │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ └── java/ │ │ └── com/ │ │ └── hazelcast/ │ │ ├── cluster/ │ │ │ └── impl/ │ │ │ └── MemberImpl.java │ │ └── internal/ │ │ └── cluster/ │ │ └── impl/ │ │ ├── ClusterServiceImpl.java │ │ ├── MemberMap.java │ │ └── MembershipManager.java │ ├── seatunnel-hikari/ │ │ └── pom.xml │ ├── seatunnel-jackson/ │ │ └── pom.xml │ ├── seatunnel-janino/ │ │ └── pom.xml │ ├── seatunnel-jetty9-9.4.56/ │ │ └── pom.xml │ ├── seatunnel-scala-compiler/ │ │ └── pom.xml │ └── seatunnel-thrift-service/ │ └── pom.xml ├── seatunnel-transforms-v2/ │ ├── README.md │ ├── README.zh.md │ ├── pom.xml │ └── src/ │ ├── main/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── transform/ │ │ ├── adaptsink/ │ │ │ ├── DefineSinkTypeMultiCatalogTransform.java │ │ │ ├── DefineSinkTypeTransform.java │ │ │ ├── DefineSinkTypeTransformConfig.java │ │ │ └── DefineSinkTypeTransformFactory.java │ │ ├── common/ │ │ │ ├── AbstractCatalogSupportFlatMapTransform.java │ │ │ ├── AbstractCatalogSupportMapTransform.java │ │ │ ├── AbstractMultiCatalogFlatMapTransform.java │ │ │ ├── AbstractMultiCatalogMapTransform.java │ │ │ ├── AbstractMultiCatalogTransform.java │ │ │ ├── AbstractSeaTunnelTransform.java │ │ │ ├── ErrorHandleWay.java │ │ │ ├── FilterRowTransform.java │ │ │ ├── 
IdentityFlatMapTransform.java │ │ │ ├── IdentityMapTransform.java │ │ │ ├── MultipleFieldOutputTransform.java │ │ │ ├── SeaTunnelRowAccessor.java │ │ │ ├── SeaTunnelRowContainerGenerator.java │ │ │ ├── SingleFieldOutputTransform.java │ │ │ └── TransformCommonOptions.java │ │ ├── copy/ │ │ │ ├── CopyFieldMultiCatalogTransform.java │ │ │ ├── CopyFieldTransform.java │ │ │ ├── CopyFieldTransformFactory.java │ │ │ └── CopyTransformConfig.java │ │ ├── dynamiccompile/ │ │ │ ├── CompileLanguage.java │ │ │ ├── CompilePattern.java │ │ │ ├── CompileTransformErrorCode.java │ │ │ ├── DynamicCompileMultiCatalogTransform.java │ │ │ ├── DynamicCompileTransform.java │ │ │ ├── DynamicCompileTransformConfig.java │ │ │ ├── DynamicCompileTransformFactory.java │ │ │ └── parse/ │ │ │ ├── AbstractParse.java │ │ │ ├── AbstractParser.java │ │ │ ├── GroovyClassParse.java │ │ │ ├── GroovyClassParser.java │ │ │ ├── JavaClassParse.java │ │ │ ├── JavaClassParser.java │ │ │ ├── ScalaClassParse.java │ │ │ └── ScalaClassParser.java │ │ ├── encrypt/ │ │ │ ├── FieldEncryptMultiCatalogTransform.java │ │ │ ├── FieldEncryptTransform.java │ │ │ ├── FieldEncryptTransformConfig.java │ │ │ ├── FieldEncryptTransformFactory.java │ │ │ └── encryptor/ │ │ │ ├── AbstractAesEncryptor.java │ │ │ ├── AesCbcEncryptor.java │ │ │ ├── AesGcmEncryptor.java │ │ │ └── Encryptor.java │ │ ├── exception/ │ │ │ ├── ErrorDataTransformException.java │ │ │ ├── JsonPathTransformErrorCode.java │ │ │ ├── TransformCommonError.java │ │ │ ├── TransformCommonErrorCode.java │ │ │ └── TransformException.java │ │ ├── fieldmapper/ │ │ │ ├── FieldMapperMultiCatalogTransform.java │ │ │ ├── FieldMapperTransform.java │ │ │ ├── FieldMapperTransformConfig.java │ │ │ └── FieldMapperTransformFactory.java │ │ ├── filter/ │ │ │ ├── FilterFieldMultiCatalogTransform.java │ │ │ ├── FilterFieldTransform.java │ │ │ ├── FilterFieldTransformConfig.java │ │ │ └── FilterFieldTransformFactory.java │ │ ├── filterrowkind/ │ │ │ ├── 
FieldRowKindMultiCatalogTransform.java │ │ │ ├── FilterRowKindTransform.java │ │ │ ├── FilterRowKindTransformFactory.java │ │ │ └── FilterRowKinkTransformConfig.java │ │ ├── jsonpath/ │ │ │ ├── ColumnConfig.java │ │ │ ├── JsonPathMultiCatalogTransform.java │ │ │ ├── JsonPathTransform.java │ │ │ ├── JsonPathTransformConfig.java │ │ │ └── JsonPathTransformFactory.java │ │ ├── metadata/ │ │ │ ├── MetadataMultiCatalogTransform.java │ │ │ ├── MetadataTransform.java │ │ │ ├── MetadataTransformConfig.java │ │ │ └── MetadataTransformFactory.java │ │ ├── nlpmodel/ │ │ │ ├── CustomConfigPlaceholder.java │ │ │ ├── ModelProvider.java │ │ │ ├── ModelTransformConfig.java │ │ │ ├── embedding/ │ │ │ │ ├── EmbeddingMultiCatalogTransform.java │ │ │ │ ├── EmbeddingTransform.java │ │ │ │ ├── EmbeddingTransformConfig.java │ │ │ │ ├── EmbeddingTransformFactory.java │ │ │ │ ├── FieldSpec.java │ │ │ │ ├── multimodal/ │ │ │ │ │ ├── ModalityType.java │ │ │ │ │ ├── MultimodalFieldValue.java │ │ │ │ │ ├── MultimodalModel.java │ │ │ │ │ └── PayloadFormat.java │ │ │ │ └── remote/ │ │ │ │ ├── AbstractModel.java │ │ │ │ ├── Model.java │ │ │ │ ├── amazon/ │ │ │ │ │ └── BedrockModel.java │ │ │ │ ├── custom/ │ │ │ │ │ └── CustomModel.java │ │ │ │ ├── doubao/ │ │ │ │ │ └── DoubaoModel.java │ │ │ │ ├── openai/ │ │ │ │ │ └── OpenAIModel.java │ │ │ │ ├── qianfan/ │ │ │ │ │ └── QianfanModel.java │ │ │ │ └── zhipu/ │ │ │ │ └── ZhipuModel.java │ │ │ └── llm/ │ │ │ ├── LLMMultiCatalogTransform.java │ │ │ ├── LLMTransform.java │ │ │ ├── LLMTransformConfig.java │ │ │ ├── LLMTransformFactory.java │ │ │ └── remote/ │ │ │ ├── AbstractModel.java │ │ │ ├── Model.java │ │ │ ├── custom/ │ │ │ │ └── CustomModel.java │ │ │ ├── kimiai/ │ │ │ │ └── KimiAIModel.java │ │ │ ├── microsoft/ │ │ │ │ └── MicrosoftModel.java │ │ │ └── openai/ │ │ │ └── OpenAIModel.java │ │ ├── regexextract/ │ │ │ ├── RegexExtractMultiCatalogTransform.java │ │ │ ├── RegexExtractTransform.java │ │ │ ├── RegexExtractTransformConfig.java │ │ │ ├── 
RegexExtractTransformErrorCode.java │ │ │ └── RegexExtractTransformFactory.java │ │ ├── rename/ │ │ │ ├── ConvertCase.java │ │ │ ├── FieldRenameConfig.java │ │ │ ├── FieldRenameMultiCatalogTransform.java │ │ │ ├── FieldRenameTransform.java │ │ │ ├── FieldRenameTransformFactory.java │ │ │ ├── TableRenameConfig.java │ │ │ ├── TableRenameMultiCatalogTransform.java │ │ │ ├── TableRenameTransform.java │ │ │ └── TableRenameTransformFactory.java │ │ ├── replace/ │ │ │ ├── ReplaceMultiCatalogTransform.java │ │ │ ├── ReplaceTransform.java │ │ │ ├── ReplaceTransformConfig.java │ │ │ └── ReplaceTransformFactory.java │ │ ├── rowkind/ │ │ │ ├── RowKindExtractorMultiCatalogTransform.java │ │ │ ├── RowKindExtractorTransform.java │ │ │ ├── RowKindExtractorTransformConfig.java │ │ │ ├── RowKindExtractorTransformFactory.java │ │ │ └── RowKindExtractorTransformType.java │ │ ├── split/ │ │ │ ├── SplitMultiCatalogTransform.java │ │ │ ├── SplitTransform.java │ │ │ ├── SplitTransformConfig.java │ │ │ └── SplitTransformFactory.java │ │ ├── sql/ │ │ │ ├── SQLEngine.java │ │ │ ├── SQLEngineFactory.java │ │ │ ├── SQLMultiCatalogFlatMapTransform.java │ │ │ ├── SQLTransform.java │ │ │ ├── SQLTransformFactory.java │ │ │ └── zeta/ │ │ │ ├── ZetaDateTimeFormat.java │ │ │ ├── ZetaSQLEngine.java │ │ │ ├── ZetaSQLFilter.java │ │ │ ├── ZetaSQLFunction.java │ │ │ ├── ZetaSQLType.java │ │ │ ├── ZetaUDF.java │ │ │ ├── ZetaUDFContext.java │ │ │ └── functions/ │ │ │ ├── ArrayFunction.java │ │ │ ├── CastFunction.java │ │ │ ├── CommonFunction.java │ │ │ ├── DateTimeFunction.java │ │ │ ├── MapFunction.java │ │ │ ├── NumericFunction.java │ │ │ ├── StringFunction.java │ │ │ ├── SystemFunction.java │ │ │ ├── VectorFunction.java │ │ │ └── udf/ │ │ │ ├── DESUtil.java │ │ │ ├── DesDecrypt.java │ │ │ └── DesEncrypt.java │ │ ├── table/ │ │ │ ├── TableFilterConfig.java │ │ │ ├── TableFilterMultiCatalogTransform.java │ │ │ ├── TableFilterTransform.java │ │ │ ├── TableFilterTransformFactory.java │ │ │ ├── 
TableMergeConfig.java │ │ │ ├── TableMergeMultiCatalogTransform.java │ │ │ ├── TableMergeTransform.java │ │ │ └── TableMergeTransformFactory.java │ │ └── validator/ │ │ ├── DataValidatorTransform.java │ │ ├── DataValidatorTransformConfig.java │ │ ├── DataValidatorTransformFactory.java │ │ ├── FieldValidator.java │ │ ├── ValidationContext.java │ │ ├── ValidationResult.java │ │ ├── ValidationResultHandler.java │ │ ├── rule/ │ │ │ ├── LengthValidationRule.java │ │ │ ├── NotNullValidationRule.java │ │ │ ├── RangeValidationRule.java │ │ │ ├── RegexValidationRule.java │ │ │ ├── UDFValidationRule.java │ │ │ └── ValidationRule.java │ │ └── udf/ │ │ ├── DataValidatorUDF.java │ │ └── EmailValidator.java │ └── test/ │ └── java/ │ └── org/ │ └── apache/ │ └── seatunnel/ │ └── transform/ │ ├── CopyFieldTransformFactoryTest.java │ ├── EmbeddingTransformFactoryTest.java │ ├── FieldMapperTransformFactoryTest.java │ ├── FilterFieldTransformFactoryTest.java │ ├── FilterRowKindTransformFactoryTest.java │ ├── JsonPathTransformTest.java │ ├── LLMTransformFactoryTest.java │ ├── RegexExtractTransformFactoryTest.java │ ├── ReplaceTransformFactoryTest.java │ ├── RowKindExtractorTransformFactoryTest.java │ ├── SplitTransformFactoryTest.java │ ├── adaptsink/ │ │ └── DefineSinkTypeTransformTest.java │ ├── embedding/ │ │ ├── DoubaoMultimodalModelTest.java │ │ ├── EmbeddingModelDimensionTest.java │ │ ├── EmbeddingRequestJsonTest.java │ │ ├── EmbeddingTransformTest.java │ │ ├── EmbeddingVectorTest.java │ │ ├── FieldSpecTest.java │ │ └── MultimodalConfigTest.java │ ├── encrypt/ │ │ ├── FieldEncryptTransformTest.java │ │ └── encryptor/ │ │ └── AesGcmEncryptorTest.java │ ├── exception/ │ │ └── TransformErrorTest.java │ ├── fieldmapper/ │ │ └── FieldMapperTransformTest.java │ ├── filter/ │ │ └── FilterFieldTransformTest.java │ ├── llm/ │ │ └── LLMRequestJsonTest.java │ ├── metadata/ │ │ └── MetadataTransformTest.java │ ├── regexextract/ │ │ └── RegexExtractTransformTest.java │ ├── rename/ │ │ ├── 
FieldRenameMultiCatalogTransformTest.java │ │ ├── FieldRenameTransformTest.java │ │ └── TableRenameTransformTest.java │ ├── rowkind/ │ │ └── RowKindExtractorTransformTest.java │ ├── sql/ │ │ ├── SQLDateTimeFunctionsTest.java │ │ ├── SQLEngineFactoryTest.java │ │ ├── SQLHashFunctionsTest.java │ │ ├── SQLLateralViewFunctionsTest.java │ │ ├── SQLMultiCatalogFlatMapTransformTest.java │ │ ├── SQLNestedTypeTest.java │ │ ├── SQLNumericFunctionsTest.java │ │ ├── SQLStringFunctionsTest.java │ │ ├── SQLSystemFunctionsTest.java │ │ ├── SQLTransformFactoryTest.java │ │ ├── SQLTransformTest.java │ │ ├── SQLVectorFunctionTest.java │ │ └── zeta/ │ │ ├── ConcatWsFunctionTest.java │ │ ├── DateTimeFunctionTest.java │ │ ├── ExtractFunctionTest.java │ │ ├── NumericFunctionTest.java │ │ ├── ZetaDateTimeFormatTest.java │ │ ├── ZetaSQLEngineTest.java │ │ ├── ZetaSQLFilterTest.java │ │ ├── ZetaSQLFunctionTest.java │ │ ├── ZetaSQLTypeTest.java │ │ └── functions/ │ │ ├── ArrayFunctionTest.java │ │ ├── CastFunctionTest.java │ │ ├── CastFunctionTypeTest.java │ │ ├── CommonFunctionTest.java │ │ ├── DateTimeFunctionsTest.java │ │ ├── MapFunctionTest.java │ │ ├── Murmur64Test.java │ │ ├── NumericFunctionTest.java │ │ ├── StringFunctionTest.java │ │ ├── SystemFunctionTest.java │ │ ├── VectorFunctionTest.java │ │ └── udf/ │ │ ├── DESUtilTest.java │ │ ├── DesDecryptTest.java │ │ └── DesEncryptTest.java │ └── validator/ │ └── DataValidatorTransformTest.java ├── seatunnel-translation/ │ ├── pom.xml │ ├── seatunnel-translation-base/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── translation/ │ │ │ ├── serialization/ │ │ │ │ ├── RowConverter.java │ │ │ │ └── SerializerConverter.java │ │ │ ├── sink/ │ │ │ │ ├── SinkAggregatedCommitterConverter.java │ │ │ │ ├── SinkCommitterConverter.java │ │ │ │ ├── SinkConverter.java │ │ │ │ └── SinkWriterConverter.java │ │ │ ├── source/ │ │ │ │ ├── BaseSourceFunction.java │ │ │ │ ├── 
CoordinatedEnumeratorContext.java │ │ │ │ ├── CoordinatedReaderContext.java │ │ │ │ ├── CoordinatedSource.java │ │ │ │ ├── ParallelEnumeratorContext.java │ │ │ │ ├── ParallelReaderContext.java │ │ │ │ └── ParallelSource.java │ │ │ └── util/ │ │ │ └── ThreadPoolExecutorFactory.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── translation/ │ │ └── source/ │ │ └── ParallelSourceTest.java │ ├── seatunnel-translation-flink/ │ │ ├── pom.xml │ │ ├── seatunnel-translation-flink-13/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── translation/ │ │ │ └── flink/ │ │ │ ├── metric/ │ │ │ │ ├── FlinkGroupCounter.java │ │ │ │ └── FlinkMetricContext.java │ │ │ └── sink/ │ │ │ └── FlinkSinkWriterContext.java │ │ ├── seatunnel-translation-flink-15/ │ │ │ └── pom.xml │ │ ├── seatunnel-translation-flink-20/ │ │ │ ├── pom.xml │ │ │ └── src/ │ │ │ └── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── translation/ │ │ │ └── flink/ │ │ │ ├── metric/ │ │ │ │ └── FlinkMetricContext.java │ │ │ ├── serialization/ │ │ │ │ └── EmptyFlinkWriterStateSerializer.java │ │ │ └── sink/ │ │ │ ├── FlinkCommitter.java │ │ │ ├── FlinkSimpleAggregatedCommitter.java │ │ │ ├── FlinkSink.java │ │ │ ├── FlinkSinkWriter.java │ │ │ └── FlinkSinkWriterContext.java │ │ └── seatunnel-translation-flink-common/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ └── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── translation/ │ │ │ └── flink/ │ │ │ ├── metric/ │ │ │ │ ├── FlinkCounter.java │ │ │ │ ├── FlinkJobMetricsSummary.java │ │ │ │ ├── FlinkMeter.java │ │ │ │ └── FlinkMetricContext.java │ │ │ ├── schema/ │ │ │ │ ├── BroadcastSchemaSinkOperator.java │ │ │ │ ├── SchemaOperator.java │ │ │ │ └── coordinator/ │ │ │ │ ├── LocalSchemaCoordinator.java │ │ │ │ └── SinkStateProvider.java │ │ │ ├── serialization/ │ │ │ │ ├── 
CommitWrapperSerializer.java │ │ │ │ ├── FlinkSimpleVersionedSerializer.java │ │ │ │ └── FlinkWriterStateSerializer.java │ │ │ ├── sink/ │ │ │ │ ├── CommitWrapper.java │ │ │ │ ├── FlinkCommitter.java │ │ │ │ ├── FlinkGlobalCommitter.java │ │ │ │ ├── FlinkSink.java │ │ │ │ ├── FlinkSinkWriter.java │ │ │ │ ├── FlinkSinkWriterContext.java │ │ │ │ └── FlinkWriterState.java │ │ │ └── source/ │ │ │ ├── FlinkRowCollector.java │ │ │ ├── FlinkSource.java │ │ │ ├── FlinkSourceEnumerator.java │ │ │ ├── FlinkSourceReader.java │ │ │ ├── FlinkSourceReaderContext.java │ │ │ ├── FlinkSourceSplitEnumeratorContext.java │ │ │ ├── NoMoreElementEvent.java │ │ │ ├── SourceEventWrapper.java │ │ │ ├── SplitWrapper.java │ │ │ └── SplitWrapperSerializer.java │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── translation/ │ │ └── flink/ │ │ └── source/ │ │ └── FlinkSourceEnumeratorTest.java │ └── seatunnel-translation-spark/ │ ├── pom.xml │ ├── seatunnel-translation-spark-2.4/ │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ ├── java/ │ │ │ └── org/ │ │ │ └── apache/ │ │ │ └── seatunnel/ │ │ │ └── translation/ │ │ │ └── spark/ │ │ │ ├── sink/ │ │ │ │ ├── SparkSink.java │ │ │ │ ├── SparkSinkInjector.java │ │ │ │ └── writer/ │ │ │ │ ├── SparkDataSourceWriter.java │ │ │ │ ├── SparkDataWriter.java │ │ │ │ ├── SparkDataWriterFactory.java │ │ │ │ ├── SparkStreamWriter.java │ │ │ │ └── SparkWriterCommitMessage.java │ │ │ └── source/ │ │ │ ├── SeaTunnelSourceSupport.java │ │ │ ├── partition/ │ │ │ │ ├── batch/ │ │ │ │ │ └── BatchPartition.java │ │ │ │ └── micro/ │ │ │ │ └── MicroBatchPartition.java │ │ │ ├── reader/ │ │ │ │ ├── SeaTunnelInputPartitionReader.java │ │ │ │ ├── batch/ │ │ │ │ │ ├── BatchSourceReader.java │ │ │ │ │ ├── CoordinatedBatchPartitionReader.java │ │ │ │ │ └── ParallelBatchPartitionReader.java │ │ │ │ └── micro/ │ │ │ │ ├── CoordinatedMicroBatchPartitionReader.java │ │ │ │ ├── MicroBatchSourceReader.java │ │ │ │ └── 
ParallelMicroBatchPartitionReader.java │ │ │ └── state/ │ │ │ ├── MicroBatchState.java │ │ │ └── ReaderState.java │ │ └── resources/ │ │ └── META-INF/ │ │ └── services/ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ ├── seatunnel-translation-spark-3.3/ │ │ ├── pom.xml │ │ └── src/ │ │ ├── main/ │ │ │ ├── java/ │ │ │ │ └── org/ │ │ │ │ └── apache/ │ │ │ │ └── seatunnel/ │ │ │ │ └── translation/ │ │ │ │ └── spark/ │ │ │ │ ├── sink/ │ │ │ │ │ ├── SeaTunnelBatchWrite.java │ │ │ │ │ ├── SeaTunnelSinkTable.java │ │ │ │ │ ├── SeaTunnelSparkSink.java │ │ │ │ │ ├── SparkSinkInjector.java │ │ │ │ │ └── write/ │ │ │ │ │ ├── SeaTunnelSparkDataWriter.java │ │ │ │ │ ├── SeaTunnelSparkDataWriterFactory.java │ │ │ │ │ ├── SeaTunnelSparkWriterCommitMessage.java │ │ │ │ │ ├── SeaTunnelWrite.java │ │ │ │ │ └── SeaTunnelWriteBuilder.java │ │ │ │ └── source/ │ │ │ │ ├── SeaTunnelSourceTable.java │ │ │ │ ├── SeaTunnelSparkSource.java │ │ │ │ ├── partition/ │ │ │ │ │ ├── batch/ │ │ │ │ │ │ ├── CoordinatedBatchPartitionReader.java │ │ │ │ │ │ ├── ParallelBatchPartitionReader.java │ │ │ │ │ │ ├── SeaTunnelBatch.java │ │ │ │ │ │ ├── SeaTunnelBatchInputPartition.java │ │ │ │ │ │ ├── SeaTunnelBatchPartitionReader.java │ │ │ │ │ │ └── SeaTunnelBatchPartitionReaderFactory.java │ │ │ │ │ └── micro/ │ │ │ │ │ ├── CoordinatedMicroBatchPartitionReader.java │ │ │ │ │ ├── ParallelMicroBatchPartitionReader.java │ │ │ │ │ ├── ReaderState.java │ │ │ │ │ ├── SeaTunnelMicroBatch.java │ │ │ │ │ ├── SeaTunnelMicroBatchInputPartition.java │ │ │ │ │ ├── SeaTunnelMicroBatchPartitionReader.java │ │ │ │ │ ├── SeaTunnelMicroBatchPartitionReaderFactory.java │ │ │ │ │ └── SeaTunnelOffset.java │ │ │ │ └── scan/ │ │ │ │ ├── SeaTunnelScan.java │ │ │ │ └── SeaTunnelScanBuilder.java │ │ │ └── resources/ │ │ │ └── META-INF/ │ │ │ └── services/ │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ │ └── test/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── translation/ │ │ └── spark/ 
│ │ └── sink/ │ │ ├── SeaTunnelSinkWithBuffer.java │ │ ├── SeaTunnelSinkWithBufferWriter.java │ │ └── SparkSinkTest.java │ └── seatunnel-translation-spark-common/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ └── java/ │ │ └── org/ │ │ └── apache/ │ │ └── seatunnel/ │ │ └── translation/ │ │ └── spark/ │ │ ├── execution/ │ │ │ ├── ColumnWithIndex.java │ │ │ ├── DatasetTableInfo.java │ │ │ ├── IndexQueue.java │ │ │ └── MultiTableManager.java │ │ ├── serialization/ │ │ │ ├── InternalMultiRowCollector.java │ │ │ ├── InternalRowCollector.java │ │ │ ├── InternalRowConverter.java │ │ │ └── SeaTunnelRowConverter.java │ │ └── utils/ │ │ ├── InstantConverterUtils.java │ │ ├── OffsetDateTimeUtils.java │ │ └── TypeConverterUtils.java │ └── test/ │ └── java/ │ └── org/ │ └── apache/ │ └── seatunnel/ │ └── translation/ │ └── spark/ │ └── execution/ │ └── MultiTableManagerTest.java └── tools/ ├── dependencies/ │ ├── checkLicense.sh │ ├── known-dependencies.txt │ └── license.py ├── documents/ │ ├── sync.sh │ └── update_connector_change_log.py ├── github/ │ └── free_disk_space.sh ├── spotless_check/ │ └── pre-commit.sh └── update_modules_check/ ├── check_file_updates.py └── update_modules_check.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .asf.yaml ================================================ # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # github: description: SeaTunnel is a multimodal, high-performance, distributed, massive data integration tool. homepage: https://seatunnel.apache.org/ labels: - data-integration - multimodal - llm - embeddings - change-data-capture - cdc - high-performance - offline - real-time - batch - streaming - data-ingestion - apache - elt collaborators: - dybyte - chl-wxp - LiJie20190102 - yzeng1618 - fcb-xiaobo - LeonYoah - silenceland - SEZ9 - boy-xiaozhang - ZmmBigdata enabled_merge_buttons: squash: true merge: false rebase: false protected_branches: dev: required_status_checks: strict: true required_pull_request_reviews: dismiss_stale_reviews: true required_approving_review_count: 2 notifications: commits: commits@seatunnel.apache.org issues: commits@seatunnel.apache.org pullrequests: commits@seatunnel.apache.org pullrequests_status: commits@seatunnel.apache.org pullrequests_comment: commits@seatunnel.apache.org ================================================ FILE: .dlc.json ================================================ { "ignorePatterns": [ { "pattern": "^http://localhost" }, { "pattern": "^https://mvnrepository.com" }, { "pattern": "^https://www.qutoutiao.net" }, { "pattern": "^https://img.shields.io" }, { "pattern": "^https://json.org/" }, { "pattern": "^/docs/category" }, { "pattern": "^https://opencollective.com" }, { "pattern": "^https://twitter.com/ASFSeaTunnel" }, { "pattern": "^https://github.com/apache/seatunnel/commit/" } ], "timeout": "10s", "retryOn429": true, "retryCount": 10, "fallbackRetryDelay": "1000s", "aliveStatusCodes": [ 0, 200, 401, 403 ] } 
================================================ FILE: .gitattributes ================================================ *.sh text eol=lf ================================================ FILE: .github/ISSUE_TEMPLATE/bug-report.yml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # name: Bug report title: "[Bug] [Module Name] Bug title" description: Problems and issues with code of seatunnel labels: ["bug"] body: - type: markdown attributes: value: | Please make sure what you are reporting is indeed a bug with reproducible steps. For better global communication, Please write in English. If you feel the description in English is not clear, then you can append description in Chinese, thanks! - type: checkboxes attributes: label: Search before asking description: > Please make sure to search in the [issues](https://github.com/apache/seatunnel/issues?q=is%3Aissue+label%3A%22bug%22) first to see whether the same issue was reported already. options: - label: > I had searched in the [issues](https://github.com/apache/seatunnel/issues?q=is%3Aissue+label%3A%22bug%22) and found no similar issues. required: true - type: textarea attributes: label: What happened description: Describe what happened. 
placeholder: > Please provide the context in which the problem occurred and explain what happened validations: required: true - type: textarea attributes: label: SeaTunnel Version description: Provide SeaTunnel version. placeholder: > Please provide the version of SeaTunnel. validations: required: true - type: textarea attributes: label: SeaTunnel Config description: Provide the SeaTunnel Config; please delete sensitive information to prevent information leakage placeholder: > Please provide the SeaTunnel Config here. render: conf validations: required: true - type: textarea attributes: label: Running Command description: Provide the command you used to start and run the SeaTunnel job. placeholder: > Please provide the running command here. render: shell validations: required: true - type: textarea attributes: label: Error Exception description: Provide the error exception when you run your command. placeholder: > Please provide the error exception here. render: log validations: required: true - type: textarea attributes: label: Zeta or Flink or Spark Version description: Provide Zeta or Flink or Spark Version. placeholder: > Please provide the version of Zeta or Flink or Spark. validations: required: false - type: textarea attributes: label: Java or Scala Version description: Provide Java or Scala Version. placeholder: > Please provide the version of Java or Scala. validations: required: false - type: textarea attributes: label: Screenshots description: Provide the screenshots if necessary. placeholder: > Please copy-paste the screenshots here. validations: required: false - type: checkboxes attributes: label: Are you willing to submit a PR? description: > This is absolutely not required, but we are happy to guide you in the contribution process especially if you already have a good understanding of how to implement the fix. seatunnel is a totally community-driven project and we love to bring new contributors in. options: - label: Yes I am willing to submit a PR!
- type: checkboxes attributes: label: Code of Conduct description: | The Code of Conduct helps create a safe space for everyone. We require that everyone agrees to it. options: - label: > I agree to follow this project's [Code of Conduct](https://www.apache.org/foundation/policies/conduct) required: true - type: markdown attributes: value: "Thanks for completing our form, and we will reply to you as soon as possible." ================================================ FILE: .github/ISSUE_TEMPLATE/feature-request.yml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # name: Feature request description: Suggest an idea for seatunnel title: "[Feature][Module Name] Feature title" labels: ["Feature"] body: - type: markdown attributes: value: | For better global communication, please write in English. If you feel the description in English is not clear, then you can append description in Chinese, thanks! - type: checkboxes attributes: label: Search before asking description: > Please make sure to search in the [feature](https://github.com/apache/seatunnel/issues?q=is%3Aissue+label%3A%22Feature%22) first to see whether the same feature was requested already.
options: - label: > I have searched in the [feature](https://github.com/apache/seatunnel/issues?q=is%3Aissue+label%3A%22Feature%22) and found no similar feature requirement. required: true - type: textarea attributes: label: Description description: Please describe the function you want in as much detail as possible. placeholder: > Rather than telling us how you might implement this feature, try to take a step back and describe what you are trying to achieve. validations: required: true - type: textarea attributes: label: Usage Scenario description: Please describe the usage scenario of this feature. - type: textarea attributes: label: Related issues description: Is there currently another issue associated with this? - type: checkboxes attributes: label: Are you willing to submit a PR? description: > This is absolutely not required, but we are happy to guide you in the contribution process especially if you already have a good understanding of how to implement the feature. seatunnel is a totally community-driven project and we love to bring new contributors in. options: - label: Yes I am willing to submit a PR! - type: checkboxes attributes: label: Code of Conduct description: | The Code of Conduct helps create a safe space for everyone. We require that everyone agrees to it. options: - label: | I agree to follow this project's [Code of Conduct](https://www.apache.org/foundation/policies/conduct) required: true - type: markdown attributes: value: "Thanks for completing our form, and we will reply to you as soon as possible." ================================================ FILE: .github/ISSUE_TEMPLATE/umbrella.yml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # name: Umbrella title: "[Umbrella] " description: An umbrella issue with multiple sub-tasks labels: [ "umbrella" ] body: - type: checkboxes attributes: label: Code of Conduct description: The Code of Conduct helps create a safe space for everyone. We require that everyone agrees to it. options: - label: > I agree to follow this project's [Code of Conduct](https://www.apache.org/foundation/policies/conduct) required: true - type: checkboxes attributes: label: Search before asking description: > Please make sure to search in the [issues](https://github.com/apache/seatunnel/issues?q=is%3Aissue+label%3A%22bug%22) first to see whether the same issue was reported already. options: - label: > I had searched in the [issues](https://github.com/apache/seatunnel/issues?q=is%3Aissue+label%3A%22bug%22) and found no similar issues. required: true - type: textarea attributes: label: Describe the proposal placeholder: > Please describe the content of the proposal clearly. validations: required: true - type: textarea attributes: label: Task list description: > For more details, please refer to [github docs](https://docs.github.com/en/issues/tracking-your-work-with-issues/about-task-lists). placeholder: > Please create sub-tasks with the pre-create issues here and @ the assignees if you know any of them. 
A simple example is as follows: - [ ] #1 - [ ] #2 @user1 - [ ] #3 - [ ] #2 @user2 - [ ] #3 validations: required: true - type: checkboxes attributes: label: Are you willing to submit a PR? description: > This is absolutely not required, but we are happy to guide you in the contribution process especially if you already have a good understanding of how to implement the fix. seatunnel is a totally community-driven project and we love to bring new contributors in. options: - label: Yes I am willing to submit a PR! - type: markdown attributes: value: "Thanks for taking the time to propose an umbrella issue!" ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ### Purpose of this pull request ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? ### Check list * [ ] If any new Jar binary package is added in your PR, please add a License Notice according to the [New License Guide](https://github.com/apache/seatunnel/blob/dev/docs/en/contribution/new-license.md) * [ ] If necessary, please update the documentation to describe the new feature. https://github.com/apache/seatunnel/tree/dev/docs * [ ] If necessary, please update `incompatible-changes.md` to describe the incompatibility caused by this PR. * [ ] If you are contributing the connector code, please check that the following files are updated: 1. Update [plugin-mapping.properties](https://github.com/apache/seatunnel/blob/dev/plugin-mapping.properties) and add new connector information in it 2. Update the pom file of [seatunnel-dist](https://github.com/apache/seatunnel/blob/dev/seatunnel-dist/pom.xml) 3. Add ci label in [label-scope-conf](https://github.com/apache/seatunnel/blob/dev/.github/workflows/labeler/label-scope-conf.yml) 4. Add e2e testcase in [seatunnel-e2e](https://github.com/apache/seatunnel/tree/dev/seatunnel-e2e/seatunnel-connector-v2-e2e/) 5.
Update connector [plugin_config](https://github.com/apache/seatunnel/blob/dev/config/plugin_config) ================================================ FILE: .github/workflows/add-label.yml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the 'License'); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # name: Pull Request Labeler on: pull_request_target: types: [opened, reopened, synchronize] jobs: labeler: permissions: contents: read pull-requests: write runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/labeler@v5 with: repo-token: ${{ secrets.GITHUB_TOKEN }} configuration-path: '.github/workflows/labeler/label-scope-conf.yml' sync-labels: true ================================================ FILE: .github/workflows/approve-label-trigger.yml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. # name: Label-when-reviewed on: pull_request_review jobs: label-when-reviewed: name: "Label PRs when reviewed" runs-on: ubuntu-latest steps: - name: "Do nothing. Only trigger corresponding workflow_run event" run: echo ================================================ FILE: .github/workflows/approve-label.yml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
# name: "Label when approved workflow run" on: workflow_run: workflows: [Label-when-reviewed] types: [requested] permissions: # All other permissions are set to none checks: write contents: read pull-requests: write jobs: label-when-approved: name: "Label when approved" runs-on: ubuntu-latest outputs: isApprovedByCommiters: ${{ steps.label-when-approved-by-commiters.outputs.isApproved }} isApprovedByAnyone: ${{ steps.label-when-approved-by-anyone.outputs.isApproved }} steps: - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" uses: actions/checkout@v2 with: persist-credentials: false submodules: recursive - name: "Get information about the original trigger of the run" uses: ./.github/actions/get-workflow-origin id: source-run-info with: token: ${{ secrets.GITHUB_TOKEN }} sourceRunId: ${{ github.event.workflow_run.id }} - name: Label when approved by commiters uses: ./.github/actions/label-when-approved-action id: label-when-approved-by-commiters with: token: ${{ secrets.GITHUB_TOKEN }} label: 'approved' require_committers_approval: 'true' remove_label_when_approval_missing: 'true' pullRequestNumber: ${{ steps.source-run-info.outputs.pullRequestNumber }} - name: Label when approved by anyone uses: ./.github/actions/label-when-approved-action id: label-when-approved-by-anyone with: token: ${{ secrets.GITHUB_TOKEN }} label: 'reviewed' pullRequestNumber: ${{ steps.source-run-info.outputs.pullRequestNumber }} remove_label_when_approval_missing: 'true' ================================================ FILE: .github/workflows/backend.yml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the 'License'); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # name: Backend on: workflow_call: inputs: TEST_IN_PR: required: false type: string default: 'true' concurrency: group: backend-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true env: TEST_IN_PR: ${{ inputs.TEST_IN_PR }} jobs: license-header: name: License header runs-on: ubuntu-latest timeout-minutes: 10 steps: - uses: actions/checkout@v3 with: submodules: true - name: Check license header uses: apache/skywalking-eyes@v0.5.0 code-style: name: Code style runs-on: ubuntu-latest timeout-minutes: 10 steps: - uses: actions/checkout@v3 with: submodules: true - name: Check code style run: ./mvnw --batch-mode --quiet --no-snapshot-updates clean spotless:check - name: Check code specification run: ./mvnw -B -T 1 clean test -D"license.skipAddThirdParty"=true -pl seatunnel-ci-tools -am --no-snapshot-updates env: MAVEN_OPTS: -Xmx512m - name: Check for .class files in git run: | echo "Checking for .class files tracked by git..." # Find all .class files tracked by git CLASS_FILES=$(git ls-files '*.class') if [ -n "$CLASS_FILES" ]; then echo "ERROR: The following .class files are tracked by git:" echo "$CLASS_FILES" echo "" echo "Please remove .class files from the repository." echo "These files should not be committed. You can remove them using:" echo " git rm --cached .class" echo " git commit -m 'Remove .class files'" echo "" echo "Also, consider adding '*.class' to .gitignore if not already present." exit 1 else echo "No .class files found in git repository." 
fi helm-chart-check: name: Check Helm Chart Syntax needs: [ license-header, code-style] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Setup Helm uses: azure/setup-helm@v4.3.0 id: install - name: Lint Chart run: helm lint deploy/kubernetes/seatunnel # dead-link: # name: Dead links # runs-on: ubuntu-latest # timeout-minutes: 150 # # Temporarily ignore this job to avoid blocking PRs # continue-on-error: true # steps: # - uses: actions/checkout@v2 # - run: sudo npm install -g markdown-link-check@3.8.7 # - run: | # for file in $(find . -name "*.md"); do # markdown-link-check -c .dlc.json -q "$file" # done sanity-check: name: Sanity check results needs: [ license-header, code-style ] runs-on: ubuntu-latest timeout-minutes: 10 steps: - name: Check results run: | [[ ${{ needs.license-header.result }} == 'success' ]] || exit 1; [[ ${{ needs.code-style.result }} == 'success' ]] || exit 1; changes: runs-on: ubuntu-latest timeout-minutes: 10 outputs: api: ${{ steps.filter.outputs.api }} engine: ${{ steps.filter.outputs.engine }} engine-e2e: ${{ steps.filter.outputs.engine-e2e }} docs: ${{ steps.filter.outputs.docs }} ut-modules: ${{ steps.ut-modules.outputs.modules }} it-modules: ${{ steps.it-modules.outputs.modules }} steps: - uses: actions/checkout@v4 with: fetch-depth: '2000' - name: checkout apache seatunnel dev branch id: git_init run: | /usr/bin/git remote add apache https://github.com/apache/seatunnel /usr/bin/git -c protocol.version=2 fetch --no-tags --prune --no-recurse-submodules --depth=2000 apache +refs/heads/dev*:refs/remotes/apache/dev* +refs/tags/dev*:refs/tags/dev* /usr/bin/git checkout apache/dev /usr/bin/git checkout '${{ github.ref }}' echo "branch=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> $GITHUB_OUTPUT - uses: actions/setup-python@v4 with: python-version: '3.11.0' - name: Check for file changes by python id: filter run: | current_branch='${{ steps.git_init.outputs.branch }}' pip install GitPython workspace="${GITHUB_WORKSPACE}" 
repository_owner="${GITHUB_REPOSITORY_OWNER}" cv2_files=`python tools/update_modules_check/check_file_updates.py ua $workspace apache/dev origin/$current_branch "seatunnel-connectors-v2/**"` true_or_false=${cv2_files%%$'\n'*} file_list=${cv2_files#*$'\n'} echo "cv2=$true_or_false" >> $GITHUB_OUTPUT echo "cv2_files=$file_list" >> $GITHUB_OUTPUT cv2_e2e_files=`python tools/update_modules_check/check_file_updates.py ua $workspace apache/dev origin/$current_branch "seatunnel-e2e/seatunnel-connector-v2-e2e/**"` true_or_false=${cv2_e2e_files%%$'\n'*} file_list=${cv2_e2e_files#*$'\n'} echo "cv2-e2e=$true_or_false" >> $GITHUB_OUTPUT echo "cv2-e2e_files=$file_list" >> $GITHUB_OUTPUT engine_files=`python tools/update_modules_check/check_file_updates.py ua $workspace apache/dev origin/$current_branch "seatunnel-engine/**"` true_or_false=${engine_files%%$'\n'*} file_list=${engine_files#*$'\n'} echo "engine=$true_or_false" >> $GITHUB_OUTPUT echo "engine_files=$file_list" >> $GITHUB_OUTPUT deleted_poms_files=`python tools/update_modules_check/check_file_updates.py d $workspace apache/dev origin/$current_branch "**/pom.xml"` true_or_false=${deleted_poms_files%%$'\n'*} file_list=${deleted_poms_files#*$'\n'} echo "deleted-poms=$true_or_false" >> $GITHUB_OUTPUT echo "deleted-poms_files=$file_list" >> $GITHUB_OUTPUT doc_files=`python tools/update_modules_check/check_file_updates.py ua $workspace apache/dev origin/$current_branch "docs/**"` true_or_false=${doc_files%%$'\n'*} file_list=${doc_files#*$'\n'} echo "docs=$true_or_false" >> $GITHUB_OUTPUT echo "docs_files=$file_list" >> $GITHUB_OUTPUT engine_e2e_files=`python tools/update_modules_check/check_file_updates.py ua $workspace apache/dev origin/$current_branch "seatunnel-e2e/seatunnel-engine-e2e/**"` true_or_false=${engine_e2e_files%%$'\n'*} file_list=${engine_e2e_files#*$'\n'} echo "engine-e2e=$true_or_false" >> $GITHUB_OUTPUT echo "engine-e2e_files=$file_list" >> $GITHUB_OUTPUT api_files=`python 
tools/update_modules_check/check_file_updates.py ua $workspace apache/dev origin/$current_branch "seatunnel-api/**" "seatunnel-common/**" "seatunnel-config/**" "seatunnel-core/**" "seatunnel-e2e/seatunnel-e2e-common/**" "seatunnel-formats/**" "seatunnel-plugin-discovery/**" "seatunnel-transforms-v2/**" "seatunnel-translation/**" "seatunnel-e2e/seatunnel-transforms-v2-e2e/**" "pom.xml" "**/workflows/**" "tools/**" "seatunnel-dist/**"` true_or_false=${api_files%%$'\n'*} file_list=${api_files#*$'\n'} if [[ $repository_owner == 'apache' ]];then true_or_false='true' fi echo "api=$true_or_false" >> $GITHUB_OUTPUT echo "api_files=$file_list" >> $GITHUB_OUTPUT - name: Check Connector V2 Update id: cv2-modules if: ${{ steps.filter.outputs.cv2 == 'true' }} run: | update_files='${{ steps.filter.outputs.cv2_files }}' modules=`python tools/update_modules_check/update_modules_check.py cv2 "$update_files"` echo $modules echo "modules=$modules" >> $GITHUB_OUTPUT - name: Check Connector V2 E2E Update id: cv2-e2e-modules if: ${{ steps.filter.outputs.cv2-e2e == 'true' }} run: | update_files='${{ steps.filter.outputs.cv2-e2e_files }}' modules=`python tools/update_modules_check/update_modules_check.py cv2-e2e "$update_files"` echo $modules echo "modules=$modules" >> $GITHUB_OUTPUT - name: Check Engine Update id: engine-modules if: ${{ steps.filter.outputs.engine == 'true' }} run: | update_files='${{ steps.filter.outputs.engine_files }}' modules=`python tools/update_modules_check/update_modules_check.py engine "$update_files"` echo $modules echo "modules=$modules" >> $GITHUB_OUTPUT - name: Check Engine E2E Update id: engine-e2e-modules if: ${{ steps.filter.outputs.engine-e2e == 'true' }} run: | update_files='${{ steps.filter.outputs.engine-e2e_files }}' modules=`python tools/update_modules_check/update_modules_check.py engine-e2e "$update_files"` echo $modules echo "modules=$modules" >> $GITHUB_OUTPUT - name: Check Deleted Modules id: deleted-modules if: ${{ 
steps.filter.outputs.deleted-poms == 'true' }} run: | update_files='${{ steps.filter.outputs.deleted-poms_files }}' modules=`python tools/update_modules_check/update_modules_check.py delete "$update_files"` echo $modules echo "modules=$modules" >> $GITHUB_OUTPUT - name: Make unit test modules id: ut-modules timeout-minutes: 60 if: ${{ steps.filter.outputs.api == 'false' && (steps.engine-modules.outputs.modules != '' || steps.cv2-modules.outputs.modules != '') }} run: | modules='${{ steps.engine-modules.outputs.modules }}${{ steps.cv2-modules.outputs.modules }}' modules=${modules: 1} pl_modules=`python tools/update_modules_check/update_modules_check.py replace "$modules"` # remove deleted modules delete_modules='${{ steps.deleted-modules.outputs.modules }}' if [[ "zz"$delete_modules != "zz" ]];then pl_modules=`python tools/update_modules_check/update_modules_check.py rm "$pl_modules" "$delete_modules"` fi if [[ "zz"$pl_modules == "zz" ]];then exit 0 fi ./mvnw help:evaluate -Dexpression=project.modules -q -DforceStdout -pl $pl_modules > /tmp/sub_module.txt sub_modules=`python tools/update_modules_check/update_modules_check.py sub /tmp/sub_module.txt` tree_modules="$modules$sub_modules" includes=`python tools/update_modules_check/update_modules_check.py tree "$tree_modules"` ./mvnw -Pci -D"e2e.dependency.skip"=false dependency:tree $includes -DoutputType=text -DoutputFile=/tmp/tree_out.txt build_modules=`python tools/update_modules_check/update_modules_check.py final_ut /tmp/tree_out.txt` if [[ "zz"$build_modules == "zz" ]];then build_modules=$pl_modules fi echo $build_modules echo "modules=$build_modules" >> $GITHUB_OUTPUT - name: Make integration test modules id: it-modules timeout-minutes: 60 if: ${{ steps.filter.outputs.api == 'false' && (steps.engine-modules.outputs.modules != '' || steps.cv2-modules.outputs.modules != '' || steps.cv2-e2e-modules.outputs.modules != '' || steps.cv2-flink-e2e-modules.outputs.modules != '' || 
steps.cv2-spark-e2e-modules.outputs.modules != '') }} run: | modules='${{ steps.cv2-e2e-modules.outputs.modules }}${{ steps.cv2-flink-e2e-modules.outputs.modules }}${{ steps.cv2-spark-e2e-modules.outputs.modules }}${{ steps.engine-e2e-modules.outputs.modules }}${{ steps.engine-modules.outputs.modules }}${{ steps.cv2-modules.outputs.modules }}' modules=${modules: 1} pl_modules=`python tools/update_modules_check/update_modules_check.py replace "$modules"` # remove deleted modules delete_modules='${{ steps.deleted-modules.outputs.modules }}' if [[ "zz"$delete_modules != "zz" ]];then pl_modules=`python tools/update_modules_check/update_modules_check.py rm "$pl_modules" "$delete_modules"` fi if [[ "zz"$pl_modules == "zz" ]];then exit 0 fi ./mvnw help:evaluate -Dexpression=project.modules -q -DforceStdout -pl $pl_modules > /tmp/sub_module.txt sub_modules=`python tools/update_modules_check/update_modules_check.py sub /tmp/sub_module.txt` tree_modules="$modules$sub_modules" includes=`python tools/update_modules_check/update_modules_check.py tree "$tree_modules"` ./mvnw -Pci -D"e2e.dependency.skip"=false dependency:tree $includes -DoutputType=text -DoutputFile=/tmp/tree_out.txt build_modules=`python tools/update_modules_check/update_modules_check.py final_it /tmp/tree_out.txt` echo $build_modules echo "modules=$build_modules" >> $GITHUB_OUTPUT dependency-license: if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' name: Dependency licenses needs: [ changes, sanity-check ] runs-on: ubuntu-latest timeout-minutes: 60 steps: - uses: actions/checkout@v3 with: submodules: true - uses: actions/setup-java@v3 with: distribution: 'temurin' java-version: '8' cache: 'maven' - name: Install uses: nick-fields/retry@v2 with: timeout_minutes: 40 max_attempts: 3 retry_on: error command: | ./mvnw -B install -DskipTests -D"maven.test.skip"=true -D"maven.javadoc.skip"=true -D"license.skipAddThirdParty" -D"skip.ui"=true - name: Check Dependencies Licenses run: 
tools/dependencies/checkLicense.sh document: if: needs.changes.outputs.api == 'true' || needs.changes.outputs.docs == 'true' needs: [ changes, sanity-check ] name: Build website runs-on: ubuntu-latest timeout-minutes: 90 steps: - name: Checkout PR uses: actions/checkout@v3 with: path: seatunnel-pr - name: Checkout website repo uses: actions/checkout@v3 with: repository: apache/seatunnel-website path: seatunnel-website - name: Sync PR changes to website run: | bash seatunnel-pr/tools/documents/sync.sh seatunnel-pr seatunnel-website - uses: actions/setup-node@v3 with: node-version: 18.20.7 - name: Run docusaurus build run: | cd seatunnel-website npm set strict-ssl false npm install npm run build seatunnel-ui: if: needs.changes.outputs.api == 'true' needs: [ changes, sanity-check ] name: Build SeaTunnel UI runs-on: ubuntu-latest timeout-minutes: 60 steps: - name: Checkout PR uses: actions/checkout@v3 - uses: actions/setup-node@v3 with: node-version: 20.x - name: Install Dependencies and Check Code Style run: | cd seatunnel-engine/seatunnel-engine-ui/ npm install npm run lint - name: Run unit tests run: | cd seatunnel-engine/seatunnel-engine-ui/ npm run test:unit - name: Build SeaTunnel UI run: | cd seatunnel-engine/seatunnel-engine-ui/ npm run build unit-test: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || (needs.changes.outputs.api == 'false' && needs.changes.outputs.ut-modules != '') runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest', 'windows-latest' ] timeout-minutes: 90 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: run all modules unit test run: | ./mvnw -B -T 1 clean verify -DskipUT=false -DskipIT=true -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates env: MAVEN_OPTS: -Xmx4096m updated-modules-integration-test-part-1: needs: [ changes,
sanity-check ] if: needs.changes.outputs.api == 'false' && needs.changes.outputs.engine == 'false' && needs.changes.outputs.it-modules != '' runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 180 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run updated modules integration test (part-1) run: | sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 8 0` if [ ! -z "$sub_modules" ]; then echo "$sub_modules" ./mvnw -T 1 -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl "$sub_modules" -am -Pci else echo "sub modules is empty, skipping" fi env: MAVEN_OPTS: -Xmx2048m updated-modules-integration-test-part-2: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'false' && needs.changes.outputs.engine == 'false' && needs.changes.outputs.it-modules != '' runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 180 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run updated modules integration test (part-2) run: | sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 8 1` if [ !
-z "$sub_modules" ]; then echo "$sub_modules" ./mvnw -T 1 -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl "$sub_modules" -am -Pci else echo "sub modules is empty, skipping" fi env: MAVEN_OPTS: -Xmx4096m updated-modules-integration-test-part-3: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'false' && needs.changes.outputs.engine == 'false' && needs.changes.outputs.it-modules != '' runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 210 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run updated modules integration test (part-3) run: | sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 8 2` if [ !
-z "$sub_modules" ]; then echo "$sub_modules" ./mvnw -T 1 -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl "$sub_modules" -am -Pci else echo "sub modules is empty, skipping" fi env: MAVEN_OPTS: -Xmx2048m updated-modules-integration-test-part-4: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'false' && needs.changes.outputs.engine == 'false' && needs.changes.outputs.it-modules != '' runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 200 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run updated modules integration test (part-4) run: | sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 8 3` if [ !
-z "$sub_modules" ]; then echo "$sub_modules" ./mvnw -T 1 -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl "$sub_modules" -am -Pci else echo "sub modules is empty, skipping" fi env: MAVEN_OPTS: -Xmx4096m updated-modules-integration-test-part-5: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'false' && needs.changes.outputs.engine == 'false' && needs.changes.outputs.it-modules != '' runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 180 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run updated modules integration test (part-5) run: | sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 8 4` if [ !
-z "$sub_modules" ]; then echo "$sub_modules" ./mvnw -T 1 -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl "$sub_modules" -am -Pci else echo "sub modules is empty, skipping" fi env: MAVEN_OPTS: -Xmx2048m updated-modules-integration-test-part-6: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'false' && needs.changes.outputs.engine == 'false' && needs.changes.outputs.it-modules != '' runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 210 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run updated modules integration test (part-6) run: | sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 8 5` if [ !
-z "$sub_modules" ]; then echo "$sub_modules" ./mvnw -T 1 -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl "$sub_modules" -am -Pci else echo "sub modules is empty, skipping" fi env: MAVEN_OPTS: -Xmx2048m updated-modules-integration-test-part-7: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'false' && needs.changes.outputs.engine == 'false' && needs.changes.outputs.it-modules != '' runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run updated modules integration test (part-7) run: | sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 8 6` if [ !
-z "$sub_modules" ]; then echo "$sub_modules" ./mvnw -T 1 -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl "$sub_modules" -am -Pci else echo "sub modules is empty, skipping" fi env: MAVEN_OPTS: -Xmx2048m updated-modules-integration-test-part-8: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'false' && needs.changes.outputs.engine == 'false' && needs.changes.outputs.it-modules != '' runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run updated modules integration test (part-8) run: | sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 8 7` if [ !
-z "$sub_modules" ]; then echo "$sub_modules" ./mvnw -T 1 -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl "$sub_modules" -am -Pci else echo "sub modules is empty, skipping" fi env: MAVEN_OPTS: -Xmx2048m engine-v2-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' || needs.changes.outputs.engine-e2e == 'true' runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run seatunnel zeta integration test run: | ./mvnw -T 1 -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl :connector-seatunnel-e2e-base,:connector-console-seatunnel-e2e -am -Pci env: MAVEN_OPTS: -Xmx4096m engine-k8s-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || contains(needs.changes.outputs.it-modules, 'seatunnel-engine-k8s-e2e') runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 45 steps: - name: install k8s run: | curl -sfL https://get.k3s.io | K3S_KUBECONFIG_MODE=777 sh -s - --docker cat /etc/rancher/k3s/k3s.yaml mkdir -p ~/.kube cp /etc/rancher/k3s/k3s.yaml ~/.kube/config env: KUBECONFIG: /etc/rancher/k3s/k3s.yaml - uses: actions/checkout@v3 - name: free disk space run: tools/github/free_disk_space.sh - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: run seatunnel zeta on k8s test run: | ./mvnw -T 1 -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :seatunnel-engine-k8s-e2e -am
-Pci env: MAVEN_OPTS: -Xmx4096m KUBECONFIG: /etc/rancher/k3s/k3s.yaml transform-v2-it-part-1: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run transform-v2 integration test (part-1) run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :seatunnel-transforms-v2-e2e-part-1 -am -Pci env: MAVEN_OPTS: -Xmx4096m transform-v2-it-part-2: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 150 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run transform-v2 integration test (part-2) run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :seatunnel-transforms-v2-e2e-part-2 -am -Pci env: MAVEN_OPTS: -Xmx4096m all-connectors-it-1: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{
needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 210 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run connector-v2 integration test (part-1) run: | ./mvnw help:evaluate -Dexpression=project.modules -q -DforceStdout -pl :seatunnel-connector-v2-e2e > /tmp/sub_module.txt sub_modules=`python tools/update_modules_check/update_modules_check.py sub /tmp/sub_module.txt` run_it_modules=`python tools/update_modules_check/update_modules_check.py sub_it_module "$sub_modules" 7 0` ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl $run_it_modules -am -Pci env: MAVEN_OPTS: -Xmx4096m all-connectors-it-2: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 150 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run connector-v2 integration test (part-2) run: | ./mvnw help:evaluate -Dexpression=project.modules -q -DforceStdout -pl :seatunnel-connector-v2-e2e > /tmp/sub_module.txt sub_modules=`python tools/update_modules_check/update_modules_check.py sub /tmp/sub_module.txt` run_it_modules=`python tools/update_modules_check/update_modules_check.py sub_it_module "$sub_modules" 7 1` ./mvnw -B -T 1 verify -DskipUT=true
-DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl $run_it_modules -am -Pci env: MAVEN_OPTS: -Xmx4096m all-connectors-it-3: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 210 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run connector-v2 integration test (part-3) run: | ./mvnw help:evaluate -Dexpression=project.modules -q -DforceStdout -pl :seatunnel-connector-v2-e2e > /tmp/sub_module.txt sub_modules=`python tools/update_modules_check/update_modules_check.py sub /tmp/sub_module.txt` run_it_modules=`python tools/update_modules_check/update_modules_check.py sub_it_module "$sub_modules" 7 2` ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl $run_it_modules -am -Pci env: MAVEN_OPTS: -Xmx4096m all-connectors-it-4: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 210 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run connector-v2 integration test (part-4) run: | ./mvnw help:evaluate
-Dexpression=project.modules -q -DforceStdout -pl :seatunnel-connector-v2-e2e > /tmp/sub_module.txt sub_modules=`python tools/update_modules_check/update_modules_check.py sub /tmp/sub_module.txt` run_it_modules=`python tools/update_modules_check/update_modules_check.py sub_it_module "$sub_modules" 7 3` ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl $run_it_modules -am -Pci env: MAVEN_OPTS: -Xmx4096m all-connectors-it-5: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 180 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run connector-v2 integration test (part-5) run: | ./mvnw help:evaluate -Dexpression=project.modules -q -DforceStdout -pl :seatunnel-connector-v2-e2e > /tmp/sub_module.txt sub_modules=`python tools/update_modules_check/update_modules_check.py sub /tmp/sub_module.txt` run_it_modules=`python tools/update_modules_check/update_modules_check.py sub_it_module "$sub_modules" 7 4` ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl $run_it_modules -am -Pci env: MAVEN_OPTS: -Xmx4096m all-connectors-it-6: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ]
timeout-minutes: 210 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run connector-v2 integration test (part-6) run: | ./mvnw help:evaluate -Dexpression=project.modules -q -DforceStdout -pl :seatunnel-connector-v2-e2e > /tmp/sub_module.txt sub_modules=`python tools/update_modules_check/update_modules_check.py sub /tmp/sub_module.txt` run_it_modules=`python tools/update_modules_check/update_modules_check.py sub_it_module "$sub_modules" 7 5` ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl $run_it_modules -am -Pci env: MAVEN_OPTS: -Xmx4096m all-connectors-it-7: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 210 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run connector-v2 integration test (part-7) run: | ./mvnw help:evaluate -Dexpression=project.modules -q -DforceStdout -pl :seatunnel-connector-v2-e2e > /tmp/sub_module.txt sub_modules=`python tools/update_modules_check/update_modules_check.py sub /tmp/sub_module.txt` run_it_modules=`python tools/update_modules_check/update_modules_check.py sub_it_module "$sub_modules" 7 6` ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl $run_it_modules -am -Pci env: MAVEN_OPTS: -Xmx4096m
jdbc-connectors-it-part-1: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run jdbc connectors integration test (part-1) run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-jdbc-e2e-part-1 -am -Pci env: MAVEN_OPTS: -Xmx4096m jdbc-connectors-it-part-2: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run jdbc connectors integration test (part-2) run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-jdbc-e2e-part-2 -am -Pci env: MAVEN_OPTS: -Xmx4096m jdbc-connectors-it-part-3: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }}
strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run jdbc connectors integration test (part-3) run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-jdbc-e2e-part-3 -am -Pci env: MAVEN_OPTS: -Xmx4096m jdbc-connectors-it-part-4: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run jdbc connectors integration test (part-4) run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-jdbc-e2e-part-4 -am -Pci env: MAVEN_OPTS: -Xmx4096m jdbc-connectors-it-part-5: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space
run: tools/github/free_disk_space.sh - name: run jdbc connectors integration test (part-5) run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-jdbc-e2e-part-5 -am -Pci env: MAVEN_OPTS: -Xmx4096m jdbc-connectors-it-part-6: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run jdbc connectors integration test (part-6) run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-jdbc-e2e-part-6 -am -Pci env: MAVEN_OPTS: -Xmx4096m jdbc-connectors-it-part-7: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run jdbc connectors integration test (part-7) run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-jdbc-e2e-part-7 -am -Pci env: MAVEN_OPTS:
-Xmx4096m jdbc-connectors-it-ddl: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' runs-on: ${{ matrix.os }} env: RUN_ALL_CONTAINER: ${{ needs.changes.outputs.api }} RUN_ZETA_CONTAINER: ${{ needs.changes.outputs.engine }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 180 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run jdbc connectors integration test (sink ddl) run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-jdbc-e2e-ddl -am -Pci env: MAVEN_OPTS: -Xmx4096m kudu-connector-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' || contains(needs.changes.outputs.it-modules, 'connector-kudu-e2e') runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 60 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run kudu connector integration test run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-kudu-e2e -am -Pci env: MAVEN_OPTS: -Xmx4096m amazonSqs-connector-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' || contains(needs.changes.outputs.it-modules, 'connector-amazonsqs-e2e') runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses:
actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run amazonsqs connector integration test run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-amazonsqs-e2e -am -Pci env: MAVEN_OPTS: -Xmx4096m kafka-connector-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' || contains(needs.changes.outputs.it-modules, 'connector-kafka-e2e') runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 210 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run kafka connector integration test run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-kafka-e2e -am -Pci env: MAVEN_OPTS: -Xmx4096m rocketmq-connector-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' || contains(needs.changes.outputs.it-modules, 'connector-rocketmq-e2e') runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses: actions/checkout@v3 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run rocket connector integration test run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates
-pl :connector-rocketmq-e2e -am -Pci env: MAVEN_OPTS: -Xmx4096m doris-connector-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' || contains(needs.changes.outputs.it-modules, 'connector-doris-e2e') runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 180 steps: - uses: actions/checkout@v2 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run doris connector integration test run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-doris-e2e -am -Pci env: MAVEN_OPTS: -Xmx4096m paimon-connector-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' || contains(needs.changes.outputs.it-modules, 'connector-paimon-e2e') runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 180 steps: - uses: actions/checkout@v2 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run paimon connector integration test run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-paimon-e2e -am -Pci env: MAVEN_OPTS: -Xmx4096m oracle-cdc-connector-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' || contains(needs.changes.outputs.it-modules, 'connector-cdc-oracle-e2e') runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 210 steps: - name: Checkout 
repository uses: actions/checkout@v2 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run oracle cdc connector integration test uses: nick-fields/retry@v3 with: timeout_seconds: 9000 max_attempts: 3 retry_on: error command: | echo 'running oracle cdc connector integration test...' && \ ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-cdc-oracle-e2e -am -Pci connector-file-local-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' || contains(needs.changes.outputs.it-modules, 'connector-file-local-e2e') runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses: actions/checkout@v2 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run file local connector integration test run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-file-local-e2e -am -Pci env: MAVEN_OPTS: -Xmx4096m connector-file-sftp-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' || contains(needs.changes.outputs.it-modules, 'connector-file-sftp-e2e') runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 120 steps: - uses: actions/checkout@v2 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run 
file sftp connector integration test run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-file-sftp-e2e -am -Pci env: MAVEN_OPTS: -Xmx4096m connector-redis-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true' || contains(needs.changes.outputs.it-modules, 'connector-redis-e2e') runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 210 steps: - uses: actions/checkout@v2 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run redis connector integration test run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-redis-e2e -am -Pci env: MAVEN_OPTS: -Xmx4096m connector-sensorsdata-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' || contains(needs.changes.outputs.it-modules, 'connector-sensorsdata-e2e') runs-on: ${{ matrix.os }} strategy: matrix: java: [ '8', '11' ] os: [ 'ubuntu-latest' ] timeout-minutes: 180 steps: - uses: actions/checkout@v2 - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v3 with: java-version: ${{ matrix.java }} distribution: 'temurin' cache: 'maven' - name: free disk space run: tools/github/free_disk_space.sh - name: run sensorsdata connector integration test run: | ./mvnw -B -T 1 verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true -D"skip.ui"=true --no-snapshot-updates -pl :connector-sensorsdata-e2e -am -Pci env: MAVEN_OPTS: -Xmx4096m ================================================ FILE: .github/workflows/build_main.yml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # 
or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. # name: "Build" on: push: branches: - '**' jobs: call-build-and-test: permissions: packages: write name: Run uses: ./.github/workflows/backend.yml ================================================ FILE: .github/workflows/codeql.yaml ================================================ # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
name: "CodeQL"

on:
  schedule:
    # 00:00 UTC on the 12th day of every month.
    - cron: '0 0 12 * *'

jobs:
  analyze:
    name: Analyze
    runs-on: ubuntu-latest
    timeout-minutes: 120
    # Explicit least-privilege token scopes. The analyze step uploads its
    # results to code scanning, which needs security-events: write; without
    # this block the job depends on the repository's default token scope.
    permissions:
      actions: read
      contents: read
      security-events: write
    env:
      # JVM flags applied to the Java processes started by the build.
      JAVA_TOOL_OPTIONS: -Xmx2G -Xms2G -Dhttp.keepAlive=false -Dmaven.test.skip=true -Dlicense.skipAddThirdParty=true -Dmaven.wagon.http.pool=false -Dmaven.wagon.http.retryHandler.count=3 -Dmaven.wagon.httpconnectionManager.ttlSeconds=120
    strategy:
      fail-fast: false
      matrix:
        language: ['java']
    steps:
      - name: Checkout repository
        uses: actions/checkout@v2
        with:
          submodules: true

      - name: Set up JDK 1.8
        uses: actions/setup-java@v2
        with:
          java-version: 8
          distribution: 'adopt'

      - name: Cache local Maven repository
        uses: actions/cache@v4
        with:
          path: ~/.m2/repository
          key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
          restore-keys: |
            ${{ runner.os }}-maven-

      # CodeQL Action v2 is deprecated; v3 is the supported replacement.
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v3
        with:
          languages: ${{ matrix.language }}

      - name: Autobuild
        uses: github/codeql-action/autobuild@v3

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v3

================================================
FILE: .github/workflows/labeler/label-scope-conf.yml
================================================
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the 'License'); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# CI&CD: - changed-files: - any-glob-to-any-file: - .github/** Zeta: - changed-files: - any-glob-to-any-file: seatunnel-engine/** e2e: - changed-files: - any-glob-to-any-file: seatunnel-e2e/** document: - changed-files: - any-glob-to-any-file: docs/** flink: - changed-files: - any-glob-to-any-file: - seatunnel-translation/seatunnel-translation-flink/** spark: - changed-files: - any-glob-to-any-file: - seatunnel-translation/seatunnel-translation-spark/** Zeta Rest API: - changed-files: - any-glob-to-any-file: seatunnel-engine/**/server/rest/** api: - changed-files: - any-glob-to-any-file: - seatunnel-api/** - seatunnel-common/** core: - changed-files: - any-glob-to-any-file: - seatunnel-core/** - seatunnel-config/** - seatunnel-dist/** - seatunnel-plugin-discovery/** - seatunnel-shade/** format: - changed-files: - any-glob-to-any-file: seatunnel-formats/** dependencies: - changed-files: - any-glob-to-any-file: tools/dependencies/** connectors-v2: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/** transform-v2: - changed-files: - any-glob-to-any-file: seatunnel-transforms-v2/** # Connectors amazondynamodb: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-amazondynamodb/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(amazondynamodb)/**' amazonsqs: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-amazonsqs/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(amazonsqs)/**' cassandra: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-cassandra/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(cassandra)/**' cdc: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-cdc/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(cdc)/**' clickhouse: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-clickhouse/** - all-globs-to-all-files: 
'!seatunnel-connectors-v2/connector-!(clickhouse)/**' databend: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-databend/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(databend)/**' datahub: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-datahub/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(datahub)/**' dingtalk: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-dingtalk/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(dingtalk)/**' doris: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-doris/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(doris)/**' druid: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-druid/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(druid)/**' easysearch: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-easysearch/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(easysearch)/**' elasticsearch: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-elasticsearch/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(elasticsearch)/**' email: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-email/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(email)/**' file: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-file/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(file)/**' google-firestore: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-google-firestore/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(google-firestore)/**' google-sheets: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-google-sheets/** - 
all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(google-sheets)/**' graphql: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-graphql/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(graphql)/**' hbase: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-hbase/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(hbase)/**' hive: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-hive/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(hive)/**' http: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-http/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(http)/**' prometheus: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-prometheus/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(prometheus)/**' hudi: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-hudi/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(hudi)/**' iceberg: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-iceberg/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(iceberg)/**' influxdb: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-influxdb/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(influxdb)/**' iotdb: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-iotdb/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(iotdb)/**' jdbc: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-jdbc/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(jdbc)/**' kafka: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-kafka/** - all-globs-to-all-files: 
'!seatunnel-connectors-v2/connector-!(kafka)/**' maxcompute: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-maxcompute/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(maxcompute)/**' mongodb: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-mongodb/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(mongodb)/**' neo4j: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-neo4j/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(neo4j)/**' openmldb: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-openmldb/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(openmldb)/**' paimon: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-paimon/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(paimon)/**' pulsar: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-pulsar/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(pulsar)/**' rabbitmq: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-rabbitmq/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(rabbitmq)/**' redis: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-redis/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(redis)/**' rocketmq: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-rocketmq/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(rocketmq)/**' s3-redshift: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-s3-redshift/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(s3-redshift)/**' selectdb-cloud: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-selectdb-cloud/** - all-globs-to-all-files: 
'!seatunnel-connectors-v2/connector-!(selectdb-cloud)/**' sentry: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-sentry/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(sentry)/**' socket: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-socket/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(socket)/**' starrocks: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-starrocks/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(starrocks)/**' tablestore: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-tablestore/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(tablestore)/**' tdengine: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-tdengine/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(tdengine)/**' web3j: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-web3j/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(web3j)/**' Milvus: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-milvus/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(milvus)/**' activemq: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-activemq/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(activemq)/**' qdrant: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-qdrant/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(qdrant)/**' typesense: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-typesense/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(typesense)/**' sls: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-sls/** - all-globs-to-all-files: 
'!seatunnel-connectors-v2/connector-!(sls)/**' aerospike: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-aerospike/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(aerospike)/**' sensorsdata: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-sensorsdata/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(sensorsdata)/**' hugegraph: - all: - changed-files: - any-glob-to-any-file: seatunnel-connectors-v2/connector-hugegraph/** - all-globs-to-all-files: '!seatunnel-connectors-v2/connector-!(hugegraph)/**' ================================================ FILE: .github/workflows/notify_test_workflow.yml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. # # Intentionally has a general name. # because the test status check created in GitHub Actions # currently randomly picks any associated workflow. # So, the name was changed to make sense in that context too. 
# See also https://github.community/t/specify-check-suite-when-creating-a-checkrun/118380/10 name: On pull request update on: pull_request_target: types: [opened, reopened, synchronize] jobs: notify: name: Notify test workflow runs-on: ubuntu-latest permissions: actions: read checks: write steps: - name: "Notify test workflow" uses: actions/github-script@v6 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | const endpoint = 'GET /repos/:owner/:repo/actions/workflows/:id/runs?&branch=:branch' const check_run_endpoint = 'GET /repos/:owner/:repo/commits/:ref/check-runs?per_page=100' // TODO: Should use pull_request.user and pull_request.user.repos_url? // If a different person creates a commit to another forked repo, // it wouldn't be able to detect. const params = { owner: context.payload.pull_request.head.repo.owner.login, repo: context.payload.pull_request.head.repo.name, id: 'build_main.yml', branch: context.payload.pull_request.head.ref, } const check_run_params = { owner: context.payload.pull_request.head.repo.owner.login, repo: context.payload.pull_request.head.repo.name, ref: context.payload.pull_request.head.ref, } console.log('Ref: ' + context.payload.pull_request.head.ref) console.log('SHA: ' + context.payload.pull_request.head.sha) // Wait 3 seconds to make sure the fork repository triggered a workflow. await new Promise(r => setTimeout(r, 3000)) let runs try { runs = await github.request(endpoint, params) } catch (error) { console.error(error) // Assume that runs were not found. 
} const name = 'Build' const head_sha = context.payload.pull_request.head.sha let status = 'queued' console.log('runs: ' + JSON.stringify(runs)) if (!runs || runs.data.workflow_runs.length === 0) { status = 'completed' const conclusion = 'action_required' await github.rest.checks.create({ owner: context.repo.owner, repo: context.repo.repo, name: name, head_sha: head_sha, status: status, conclusion: conclusion, output: { title: 'Workflow run detection failed', summary: ` Unable to detect the workflow run for testing the changes in your PR. 1. If you did not enable GitHub Actions in your forked repository, please enable it by clicking the button as shown in the image below. See also [Disabling or limiting GitHub Actions for a repository](https://docs.github.com/en/github/administering-a-repository/disabling-or-limiting-github-actions-for-a-repository) for more details. 2. Create and push an empty commit to trigger the workflow. 3. It is possible your branch is based on the old \`dev\` branch in Apache SeaTunnel, please sync your branch to the latest dev branch. For example as below: \`\`\`bash git fetch upstream git rebase upstream/dev git push origin YOUR_BRANCH --force \`\`\``, images: [ { alt: 'enabling workflows button', image_url: 'https://raw.githubusercontent.com/apache/spark/master/.github/workflows/images/workflow-enable-button.png' } ] } }) } else { const run_id = runs.data.workflow_runs[0].id if (runs.data.workflow_runs[0].head_sha != context.payload.pull_request.head.sha) { throw new Error('There was a new unsynced commit pushed. Please retrigger the workflow.'); } // Here we get check run ID to provide Check run view instead of Actions view, see also SPARK-37879. 
const check_runs = await github.request(check_run_endpoint, check_run_params) console.log('check_runs: ' + JSON.stringify(check_runs)) const check_run_head = check_runs.data.check_runs.filter(r => r.name === "Run / License header")[0] console.log('check_run_head: ' + JSON.stringify(check_run_head)) if (check_run_head.head_sha != context.payload.pull_request.head.sha) { throw new Error('There was a new unsynced commit pushed. Please retrigger the workflow.'); } const check_run_url = 'https://github.com/' + context.payload.pull_request.head.repo.full_name + '/runs/' + check_run_head.id const actions_url = 'https://github.com/' + context.payload.pull_request.head.repo.full_name + '/actions/runs/' + run_id await github.rest.checks.create({ owner: context.repo.owner, repo: context.repo.repo, name: name, head_sha: head_sha, status: status, output: { title: 'Test results', summary: '[See test results](' + check_run_url + ')', text: JSON.stringify({ owner: context.payload.pull_request.head.repo.owner.login, repo: context.payload.pull_request.head.repo.name, run_id: run_id }) }, details_url: actions_url, }) } ================================================ FILE: .github/workflows/publish-docker.yaml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. name: publish-docker on: push: tags: - '*' paths-ignore: - 'docs/**' - '**/*.md' env: DOCKER_USERNAME: ${{ secrets.DOCKERHUB_USER }} DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} jobs: build: if: github.repository == 'apache/seatunnel' runs-on: ubuntu-latest permissions: contents: read packages: write timeout-minutes: 60 steps: - uses: actions/checkout@v4 with: submodules: true - name: free disk space run: tools/github/free_disk_space.sh - uses: actions/checkout@v4 - name: Cache local Maven repository uses: actions/cache@v4 with: path: ~/.m2/repository key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | ${{ runner.os }}-maven- - name: Set up JDK 1.8 uses: actions/setup-java@v2 with: java-version: 8 distribution: 'adopt' - name: Log in to the Container registry uses: docker/login-action@v3 with: username: ${{ env.DOCKER_USERNAME }} password: ${{ env.DOCKER_PASSWORD }} - name: Set up QEMU uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Build and push docker images env: MAVEN_OPTS: -Xmx4096m run: | ./mvnw -B clean install \ -Dmaven.test.skip=true \ -Dmaven.javadoc.skip=true \ -Dlicense.skipAddThirdParty=true \ -D"docker.build.skip"=false \ -D"docker.verify.skip"=false \ -D"docker.push.skip"=false \ -D"skip.spotless"=true \ -Dmaven.deploy.skip \ --no-snapshot-updates \ -Pdocker,seatunnel ================================================ FILE: .github/workflows/publish-helm-chart.yaml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. name: publish-helm-chart on: push: tags: - '*' paths-ignore: - 'docs/**' - '**/*.md' env: DOCKER_USERNAME: ${{ secrets.DOCKERHUB_USER }} DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} DOCKER_REGISTRY: docker.io HUB: registry-1.docker.io/apache jobs: build: if: github.repository == 'apache/seatunnel' runs-on: ubuntu-latest permissions: contents: read packages: write timeout-minutes: 30 steps: - uses: actions/checkout@v4 - name: Log in to the Container registry uses: docker/login-action@v3 with: registry: ${{ env.DOCKER_REGISTRY }} username: ${{ env.DOCKER_USERNAME }} password: ${{ env.DOCKER_PASSWORD }} - name: Publish Helm Chart working-directory: deploy/kubernetes run: | helm dep up seatunnel helm package seatunnel helm push seatunnel-helm-*.tgz oci://${{ env.HUB }} ================================================ FILE: .github/workflows/schedule_backend.yml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the 'License'); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # name: Schedule Backend on: schedule: - cron: '0 16 * * *' concurrency: group: schedule-backend-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: false jobs: call-build-and-test: permissions: packages: write name: Run uses: ./.github/workflows/backend.yml with: TEST_IN_PR: false ================================================ FILE: .github/workflows/stale.yml ================================================ # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# https://github.com/actions/stale name: 'Close stale issues and PRs' on: schedule: - cron: '0 0 * * *' permissions: # Stale recommended permissions pull-requests: write issues: write jobs: stale: runs-on: ubuntu-latest steps: - uses: actions/stale@v4 with: # Stale Issues days-before-issue-stale: -1 days-before-issue-close: -1 # We do not stale Issues with label `Waiting for reply`, `Waiting for code update`,`Waiting for users feedback`, `New feature` and `STIP` exempt-issue-labels: 'Waiting for reply,Waiting for code update,Waiting for users feedback,New feature,STIP,security' stale-issue-message: > This issue has been automatically marked as stale because it has not had recent activity for 30 days. It will be closed in next 7 days if no further activity occurs. close-issue-message: > This issue has been closed because it has not received response for too long time. You could reopen it if you encountered similar problems in the future. # Stale PRs days-before-pr-stale: 120 days-before-pr-close: 7 stale-pr-message: > This pull request has been automatically marked as stale because it has not had recent activity for 120 days. It will be closed in 7 days if no further activity occurs. close-pr-message: > This pull request has been closed because it has not had recent activity. You could reopen it if you try to continue your work, and anyone who are interested in it are encouraged to continue work on this pull request. remove-pr-stale-when-updated: true remove-issue-stale-when-updated: true operations-per-run: 1000 ================================================ FILE: .github/workflows/update_build_status.yml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. # name: Update build status workflow on: schedule: - cron: "*/15 * * * *" jobs: update: name: Update build status runs-on: ubuntu-latest permissions: actions: read checks: write steps: - name: "Update build status" uses: actions/github-script@v6 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | const endpoint = 'GET /repos/:owner/:repo/pulls?state=:state' const params = { owner: context.repo.owner, repo: context.repo.repo, state: 'open' } // See https://docs.github.com/en/graphql/reference/enums#mergestatestatus const maybeReady = ['behind', 'clean', 'draft', 'has_hooks', 'unknown', 'unstable']; // Iterate open PRs for await (const prs of github.paginate.iterator(endpoint,params)) { // Each page for await (const pr of prs.data) { console.log('SHA: ' + pr.head.sha) console.log(' Mergeable status: ' + pr.mergeable_state) if (pr.mergeable_state == null || maybeReady.includes(pr.mergeable_state)) { const checkRuns = await github.request('GET /repos/{owner}/{repo}/commits/{ref}/check-runs', { owner: context.repo.owner, repo: context.repo.repo, ref: pr.head.sha }) // Iterate over the GitHub Checks in the PR for await (const cr of checkRuns.data.check_runs) { if (cr.name == 'Build' && cr.conclusion != "action_required") { // text contains parameters to make request in JSON.
const params = JSON.parse(cr.output.text) // Get the workflow run in the forked repository let run try { run = await github.request('GET /repos/{owner}/{repo}/actions/runs/{run_id}', params) } catch (error) { console.error(error) // Run not found. This can happen when the PR author removes GitHub Actions runs or // disalbes GitHub Actions. continue } // Keep syncing the status of the checks try { if (run.data.status == 'completed') { console.log(' Run ' + cr.id + ': set status (' + run.data.status + ') and conclusion (' + run.data.conclusion + ')') const response = await github.request('PATCH /repos/{owner}/{repo}/check-runs/{check_run_id}', { owner: context.repo.owner, repo: context.repo.repo, check_run_id: cr.id, output: cr.output, status: run.data.status, conclusion: run.data.conclusion, details_url: run.data.details_url }) } else { console.log(' Run ' + cr.id + ': set status (' + run.data.status + ')') const response = await github.request('PATCH /repos/{owner}/{repo}/check-runs/{check_run_id}', { owner: context.repo.owner, repo: context.repo.repo, check_run_id: cr.id, output: cr.output, status: run.data.status, details_url: run.data.details_url }) } } catch (error) { console.error(error) continue } break } } } } } ================================================ FILE: .gitignore ================================================ # Package Files # *.jar *.class *.zip *.tar.gz # see JDK-8214300 .attach_pid* # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml hs_err_pid* # build targets target/ # Log file *.log /logs logs.zip # Intellij Idea files .idea/ *.iml .idea/* .DS_Store metastore_db/ work_dir all-dependencies.txt self-modules.txt third-party-dependencies.txt *.keytab /derby.log dependency-reduced-pom.xml apidoc # Python *.py[cod] Test.java Test.scala test.conf spark-warehouse *.flattened-pom.xml seatunnel-examples # vscode .vscode /lib/* version.properties node/ dist/ seatunnel-engine/seatunnel-engine-server/**/ui/* 
================================================ FILE: .gitmodules ================================================ [submodule ".github/actions/get-workflow-origin"] path = .github/actions/get-workflow-origin url = https://github.com/potiuk/get-workflow-origin.git [submodule ".github/actions/label-when-approved-action"] path = .github/actions/label-when-approved-action url = https://github.com/TobKed/label-when-approved-action ================================================ FILE: .licenserc.yaml ================================================ # Licensed to Apache Software Foundation (ASF) under one or more contributor # license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright # ownership. Apache Software Foundation (ASF) licenses this file to you under # the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
header: license: spdx-id: Apache-2.0 copyright-owner: Apache Software Foundation paths-ignore: - seatunnel-dist - NOTICE - LICENSE - DISCLAIMER - mvnw.cmd - .mvn - .gitmodules - .gitattributes - .github/actions - '**/known-dependencies-*.txt' - '**/*.md' - '**/*.mdx' - '**/*.json' - '**/*.iml' - '**/*.ini' - '**/*.svg' - '**/*.txt' - '**/*.csv' - '**/.gitignore' - '**/LICENSE' - '**/NOTICE' - '**/.gitkeep' - '**/com/typesafe/config/**' - 'seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/ConfigProvider.java' - 'seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/SeaTunnelConfigSections.java' - 'seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/YamlSeaTunnelConfigBuilder.java' - 'seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/utils/ExceptionUtil.java' - 'seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/operation/AsyncOperation.java' - 'seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/protocol/task/AbstractSeaTunnelMessageTask.java' - 'seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/utils/PassiveCompletableFuture.java' - 'seatunnel-connectors-v2/connector-cdc/connector-cdc-postgres/src/main/java/io/debezium/connector/postgresql/connection/PostgresReplicationConnection.java' - 'seatunnel-shade/seatunnel-hazelcast/seatunnel-hazelcast-shade/src/main/java/com/hazelcast/**' comment: on-failure ================================================ FILE: .mvn/wrapper/maven-wrapper.properties ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.4/apache-maven-3.8.4-bin.zip wrapperUrl=https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar ================================================ FILE: AGENTS.md ================================================ # LLM Context Guide for Apache SeaTunnel This guide helps AI assistants (LLMs / Agents) make **safe, consistent, and verifiable** changes to the Apache SeaTunnel codebase. It mirrors practices from mature Apache projects and adapts them to SeaTunnel’s **build, testing, architecture, and documentation conventions**. ## ⚠️ CRITICAL: Validate Before Proposing Changes **Agents MUST run verification commands locally before suggesting or finalizing changes.** ```bash # Format code (mandatory) ./mvnw spotless:apply # Quick verification (mandatory) ./mvnw -q -DskipTests verify # Unit tests (strongly recommended) ./mvnw test ``` Failure to meet these requirements will likely result in PR rejection. ## Git Commit Message Convention SeaTunnel follows a **strict commit message format** to maintain a clean and searchable history. 
**Format**: ``` [Type][Module] Description ``` ### Types * `Feature` – New features * `Fix` – Bug fixes * `Improve` – Improvements to existing behavior * `Docs` – Documentation-only changes * `Test` – Test cases or test framework changes * `Chore` – Build, dependency, or maintenance tasks ### Modules * `Connector-V2` – seatunnel-connectors-v2 * `Zeta` – seatunnel-engine (Zeta engine) * `Core` – seatunnel-core * `API` – seatunnel-api * `Transform-V2` – seatunnel-transforms-v2 * `Format` – seatunnel-formats * `Translation` – seatunnel-translation * `E2E` – seatunnel-e2e ### Examples * `[Fix][Connector-V2] Fix MySQL source split enumeration bug` * `[Fix][Zeta] Fix checkpoint timeout under heavy backpressure` * `[Feature][Transform-V2] Add LLM transform plugin` * `[Improve][Core] Optimize jar package loading speed` * `[Docs] Update quick start guide` ## Repository Structure ```text seatunnel/ ├── seatunnel-api/ # Core API definitions ├── seatunnel-connectors-v2/ # Source & Sink connectors (main contribution area) ├── seatunnel-transforms-v2/ # Transform plugins (including LLM) ├── seatunnel-engine/ # Zeta engine & Web UI ├── seatunnel-core/ # Job submission & CLI entry points ├── seatunnel-translation/ # Flink & Spark adapters ├── seatunnel-formats/ # Data formats (JSON, Avro, etc.) ├── seatunnel-e2e/ # End-to-End integration tests ├── docs/ # Documentation (en & zh) └── config/ # Default configurations ``` ## Code Standards ### Java Backend * **Formatting**: Google Java Format (AOSP style), enforced by Spotless * **Imports**: * No wildcard imports * Use shaded dependencies: `org.apache.seatunnel.shade.*` * **Nullability**: Avoid implicit null assumptions * **Visibility**: Keep APIs minimal; prefer package-private when possible * **Comments**: Add comments for important methods (public APIs, complex logic). Important methods include public APIs, lifecycle hooks (initialization, start/stop, checkpoint), and complex or performance-critical logic. 
Example: ```java /** * Enumerates source splits for parallel reading. * Called once during job initialization. * * @param context Split enumeration context * @return Collection of discovered splits */ @Override public List enumerateSplits(SplitEnumerationContext context) { // Implementation } ``` ### Apache License Header (MANDATORY) All **new files** MUST include the ASF license header: ```java /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ ``` ## 🚨 Backward Compatibility (VERY IMPORTANT) Agents MUST treat backward compatibility as a **hard constraint**. 
* DO NOT remove or rename existing config options * DO NOT change default values casually * DO NOT break public APIs or SPI contracts Any incompatible change MUST: * Be explicitly documented * Be documented in `docs/en/introduction/concepts/incompatible-changes.md` * Include migration guidance * Be clearly explained in the PR description ## Dependency Rules * DO NOT introduce new dependencies unless absolutely necessary * Prefer existing shaded dependencies under `org.apache.seatunnel.shade.*` * Any new dependency MUST: * Be justified in the PR description * Consider shading, size, and conflict risks ## Architecture Guidelines ### Connector (V2) * Implement `SeaTunnelSource` or `SeaTunnelSink` * Define configs using `Option` * Support parallelism via `SourceSplitEnumerator` * Avoid connector-specific logic leaking into engine or core ### Zeta Engine * **Client**: Submits job config * **Master**: Schedules & coordinates * **Worker**: Executes tasks (Source → Transform → Sink) Respect task boundaries and lifecycle semantics. 
## Configuration (Option) Rules * All user-facing configs MUST be defined using `Option` * Each option MUST include: * name * type * default value (if applicable) * clear description * Option names are **stable contracts** and must not be renamed lightly ## Error Handling & Logging * Exceptions MUST include sufficient context (table, task, config key) * Avoid swallowing exceptions * Use proper log levels: * INFO – lifecycle events * WARN – recoverable issues * ERROR – task-failing errors * NEVER log sensitive information (passwords, tokens, credentials) ## Documentation Rules * Any user-visible change MUST update: * `docs/en` * `docs/zh` * Config names, defaults, and examples MUST match the code exactly * Documentation is part of the feature, not an afterthought ## Testing Guidelines ### Unit Tests * Located under `src/test/java` * Validate behavior, not implementation details * Prefer deterministic and minimal tests Command: ```bash ./mvnw test ``` ### E2E Tests * Located in `seatunnel-e2e` * Uses Testcontainers * Extend `TestSuiteBase` Command: ```bash ./mvnw -DskipUT -DskipIT=false verify ``` ## Performance Awareness Agents MUST consider performance implications: * Avoid unnecessary object creation in hot paths * Be cautious with large in-memory buffers * Consider parallelism and resource usage ## PR Scope Rule * Keep changes minimal and focused * Avoid unrelated refactors or formatting-only changes * One PR should solve **one problem** ## Running & Debugging ### Build from Source ```bash ./mvnw clean install -DskipTests -Dskip.spotless=true ``` ### Install Connectors ```bash sh bin/install-plugin.sh $current_version ``` ### Run Job (Zeta) ```bash sh bin/seatunnel.sh --config config/v2.batch.config.template -e local ``` ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 
1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright {yyyy} {name of copyright owner} Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ======================================================================== Apache 2.0 licenses ======================================================================== The following components are provided under the Apache License. See project link for details. The text of each license is the standard Apache 2.0 license. 
tools/dependencies/checkLicense.sh files from https://github.com/apache/skywalking mvnw files from https://github.com/apache/maven-wrapper Apache 2.0 seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/RowKind.java from https://github.com/apache/flink seatunnel-api/src/main/java/org/apache/seatunnel/api/state/CheckpointListener.java from https://github.com/apache/flink seatunnel-config/seatunnel-config-shade/src/main/java/org/apache/seatunnel/shade/com/typesafe/config/ from https://github.com/lightbend/config seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/ from https://github.com/apache/flink seatunnel-connectors-v2/connector-common/src/main/java/org/apache/seatunnel/connectors/seatunnel/common/source/reader/ from https://github.com/apache/flink seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/ from https://github.com/apache/iceberg seatunnel-connectors-v2/connector-cdc/connector-base/src/main/java/org/apache/seatunnel/connectors/cdc/base from https://github.com/ververica/flink-cdc-connectors seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql from https://github.com/ververica/flink-cdc-connectors seatunnel-connectors-v2/connector-cdc/connector-base/src/main/java/org/apache/seatunnel/connectors/cdc/debezium from https://github.com/ververica/flink-cdc-connectors seatunnel-connectors-v2/connector-cdc/connector-cdc-sqlserver/src/main/java/io/debezium/connector/sqlserver/SqlServerStreamingChangeEventSource.java from https://github.com/debezium/debezium seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb from https://github.com/ververica/flink-cdc-connectors 
seatunnel-connectors-v2/connector-cdc/connector-cdc-postgres/src/main/java/io/debezium/connector/postgresql/connection/PostgresReplicationConnection.java from https://github.com/debezium/debezium generate_client_protocol.sh from https://github.com/hazelcast/hazelcast seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/utils/ExceptionUtil.java from https://github.com/hazelcast/hazelcast seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/protocol/task/AbstractSeaTunnelMessageTask.java from https://github.com/hazelcast/hazelcast seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/operation/AsyncOperation.java from https://github.com/hazelcast/hazelcast seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/operation/AbstractJobAsyncOperation.java from https://github.com/hazelcast/hazelcast seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/ConfigProvider.java from https://github.com/hazelcast/hazelcast seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/SeaTunnelConfigSections.java from https://github.com/hazelcast/hazelcast seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/YamlSeaTunnelConfigBuilder.java from https://github.com/hazelcast/hazelcast seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/job/JobStatus.java from https://github.com/apache/flink seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/execution/ExecutionState.java from https://github.com/apache/flink seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/utils/PassiveCompletableFuture.java from https://github.com/hazelcast/hazelcast 
seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/CheckpointException.java from https://github.com/apache/flink seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/checkpoint/CheckpointIDCounter.java from https://github.com/apache/flink seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/checkpoint/InternalCheckpointListener.java from https://github.com/apache/flink seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/StandaloneCheckpointIDCounter.java from https://github.com/apache/flink seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/metrics from https://github.com/hazelcast/hazelcast seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics from https://github.com/hazelcast/hazelcast seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sqlengine/zeta/ZetaSQLEngine.java from https://github.com/JSQLParser/JSqlParser seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sqlengine/zeta/ZetaSQLType.java from https://github.com/JSQLParser/JSqlParser seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sqlengine/zeta/ZetaSQLFilter.java from https://github.com/JSQLParser/JSqlParser seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sqlengine/zeta/ZetaSQLFunction.java from https://github.com/JSQLParser/JSqlParser seatunnel-shade/seatunnel-hazelcast/seatunnel-hazelcast-shade/src/main/java/com/hazelcast/** from https://github.com/hazelcast/hazelcast seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/LiteNodeDropOutTcpIpJoiner.java from https://github.com/hazelcast/hazelcast ================================================ FILE: NOTICE ================================================ Apache SeaTunnel Copyright 2021-2024 The Apache Software Foundation This 
product includes software developed at The Apache Software Foundation (http://www.apache.org/). // ------------------------------------------------------------------ // NOTICE file corresponding to the section 4d of The Apache License, // Version 2.0, in this case for Apache Flink // ------------------------------------------------------------------ Apache Flink Copyright 2006-2022 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). Flink : Connectors : JDBC Copyright 2014-2022 The Apache Software Foundation // ------------------------------------------------------------------ // NOTICE file corresponding to the section 4d of The Apache License, // Version 2.0, in this case for Apache Iceberg // ------------------------------------------------------------------ Apache Iceberg Copyright 2017-2022 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). Iceberg : Flink Copyright 2017-2022 The Apache Software Foundation // ------------------------------------------------------------------ // NOTICE file corresponding to the section 4d of The Apache License, // Version 2.0, in this case for Apache Iceberg // ------------------------------------------------------------------ ----------------------------------------------------------------------- This product contains code form the Apache Maven Wrapper Project: ----------------------------------------------------------------------- Apache Maven Wrapper Copyright 2013-2022 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). The original idea and initial implementation of the maven-wrapper module is derived from the Gradle Wrapper which was written originally by Hans Dockter and Adam Murdoch. Copyright 2007 the original author or authors. 
This product contains code from the Hazelcast Project:
Contains Aerospike Client Library (https://www.aerospike.com/) which is licensed under the AGPL 3.0 License (https://www.aerospike.com/terms/download/3rd-party-licenses) ================================================ FILE: README.md ================================================ # Apache SeaTunnel SeaTunnel Logo [![Build Workflow](https://github.com/apache/seatunnel/actions/workflows/build_main.yml/badge.svg?branch=dev)](https://github.com/apache/seatunnel/actions/workflows/build_main.yml) [![Join Slack](https://img.shields.io/badge/slack-%23seatunnel-4f8eba?logo=slack)](https://s.apache.org/seatunnel-slack) [![Twitter Follow](https://img.shields.io/twitter/follow/ASFSeaTunnel.svg?label=Follow&logo=twitter)](https://twitter.com/ASFSeaTunnel) [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/apache/seatunnel) ## Overview SeaTunnel is a multimodal, high-performance, distributed data integration tool, capable of synchronizing vast amounts of data daily. It's trusted by numerous companies for its efficiency and stability. ## Why Choose SeaTunnel SeaTunnel addresses common data integration challenges: - **Diverse Data Sources**: Seamlessly integrates with hundreds of evolving data sources. - **Multimodal Data Integration**: Supports the integration of video, images, binary files, structured and unstructured text data. - **Complex Synchronization Scenarios**: Supports various synchronization methods, including real-time, CDC, and full database synchronization. - **Resource Efficiency**: Minimizes computing resources and JDBC connections for real-time synchronization. - **Quality and Monitoring**: Provides data quality and monitoring to prevent data loss or duplication. ## Key Features - **Diverse Connectors**: Offers support for over 160 connectors, with ongoing expansion. - **Batch-Stream Integration**: Easily adaptable connectors simplify data integration management. 
- **Distributed Snapshot Algorithm**: Ensures data consistency across synchronized data. - **Multi-Engine Support**: Works with SeaTunnel Zeta Engine, Flink, and Spark. - **JDBC Multiplexing and Log Parsing**: Efficiently synchronizes multi-tables and databases. - **High Throughput and Low Latency**: Provides high-throughput data synchronization with low latency. - **Real-Time Monitoring**: Offers detailed insights during synchronization. ## SeaTunnel Workflow ![SeaTunnel Workflow](docs/images/architecture_diagram.png) Configure jobs, select execution engines, and parallelize data using Source Connectors. Easily develop and extend connectors to meet your needs. ## Supported Connectors - [Source Connectors](https://seatunnel.apache.org/docs/connectors/source) - [Sink Connectors](https://seatunnel.apache.org/docs/connectors/sink) - [Transform Connectors](https://seatunnel.apache.org/docs/transforms) ## Getting Started Download SeaTunnel from the [Official Website](https://seatunnel.apache.org/download). Choose your runtime execution engine: - [SeaTunnel Zeta Engine](https://seatunnel.apache.org/docs/getting-started/locally/quick-start-seatunnel-engine) - [Spark](https://seatunnel.apache.org/docs/getting-started/locally/quick-start-spark) - [Flink](https://seatunnel.apache.org/docs/getting-started/locally/quick-start-flink) ## Multimodal Data Integration - Most data integration tools support structured and unstructured text data, and SeaTunnel does as well. Simply refer to the desired Source/Sink to use. - For integrating video, images, and binary files with SeaTunnel, please refer to the documentation for detailed instructions. ## Apache SeaTunnel Tools SeaTunnel Tools provides a range of peripheral tools, including Apache SeaTunnel Mcp Server, etc. Please refer to [SeaTunnel Tools](https://github.com/apache/seatunnel-tools). ## Users Companies and organizations worldwide use SeaTunnel for research, production, and commercial products.
Explore real-world use cases of SeaTunnel, such as JP Morgan, S7, JDT, ByteDance, Tencent Cloud. More use cases can be found on the [SeaTunnel Users](https://seatunnel.apache.org/user). ## Code of Conduct Participate in this project in accordance with the Contributor Covenant [Code of Conduct](https://www.apache.org/foundation/policies/conduct). ## Contributors We appreciate all developers for their contributions. See the [List Of Contributors](https://github.com/apache/seatunnel/graphs/contributors). ## How to Compile Refer to this [Setup](https://seatunnel.apache.org/docs/developer/setup) for compilation instructions. ## Contact Us - Mail list: **dev@seatunnel.apache.org**. Subscribe by sending an email to `dev-subscribe@seatunnel.apache.org`. - Slack: [Join SeaTunnel Slack](https://s.apache.org/seatunnel-slack) - Twitter: [ASFSeaTunnel on Twitter](https://twitter.com/ASFSeaTunnel) ## Landscapes SeaTunnel enriches the [CNCF CLOUD NATIVE Landscape](https://landscape.cncf.io/?landscape=observability-and-analysis&license=apache-license-2-0). ## License [Apache 2.0 License](LICENSE) ## Frequently Asked Questions ### 1. How do I install SeaTunnel? Follow the [Local Deployment](https://seatunnel.apache.org/docs/getting-started/locally/deployment) guide on the SeaTunnel website to get started quickly. For cluster setups, please refer to the [Cluster Deployment](https://seatunnel.apache.org/docs/engines/zeta/hybrid-cluster-deployment) guide. ### 2. Where can I find documentation and tutorials? [Official Documentation](https://seatunnel.apache.org/docs) includes detailed guides and tutorials to help you get started. ### 3. Is there a community or support channel? You can submit an issue on [GitHub Issues](https://github.com/apache/seatunnel/issues). Join our Slack community [SeaTunnel Slack](https://s.apache.org/seatunnel-slack). For more information, please refer to the [FAQ](https://seatunnel.apache.org/docs/faq). ### 4. How can I contribute to SeaTunnel? We welcome contributions!
Please refer to our [Contribution Guidelines](https://seatunnel.apache.org/docs/developer/coding-guide) for details. ================================================ FILE: bin/install-plugin.cmd ================================================ @echo off REM Licensed to the Apache Software Foundation (ASF) under one or more REM contributor license agreements. See the NOTICE file distributed with REM this work for additional information regarding copyright ownership. REM The ASF licenses this file to You under the Apache License, Version 2.0 REM (the "License"); you may not use this file except in compliance with REM the License. You may obtain a copy of the License at REM REM http://www.apache.org/licenses/LICENSE-2.0 REM REM Unless required by applicable law or agreed to in writing, software REM distributed under the License is distributed on an "AS IS" BASIS, REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. REM See the License for the specific language governing permissions and REM limitations under the License. REM This script is used to download the connector plug-ins required during the running process. REM All are downloaded by default. You can also choose what you need. REM You only need to configure the plug-in name in config\plugin_config. REM Get seatunnel home set "SEATUNNEL_HOME=%~dp0..\" echo Set SEATUNNEL_HOME to [%SEATUNNEL_HOME%] REM Connector default version is 3.0.0, you can also choose a custom version.
e.g. for 3.0.0: install-plugin.cmd 3.0.0 set "version=3.0.0" if not "%~1"=="" set "version=%~1" REM Create the lib directory if not exist "%SEATUNNEL_HOME%\lib" ( mkdir "%SEATUNNEL_HOME%\lib" echo create lib directory ) echo Install SeaTunnel connectors plugins, usage version is %version% REM Create the connectors directory if not exist "%SEATUNNEL_HOME%\connectors" ( mkdir "%SEATUNNEL_HOME%\connectors" echo create connectors directory ) for /f "usebackq delims=" %%a in ("%SEATUNNEL_HOME%\config\plugin_config") do ( set "line=%%a" setlocal enabledelayedexpansion if "!line:~0,1!" neq "-" if "!line:~0,1!" neq "#" ( echo install connector : !line! call "%SEATUNNEL_HOME%\mvnw.cmd" dependency:get -Dtransitive=false -DgroupId="org.apache.seatunnel" -DartifactId="!line!" -Dversion="%version%" -Ddest="%SEATUNNEL_HOME%\connectors" ) endlocal ) ================================================ FILE: bin/install-plugin.sh ================================================ #!/bin/bash # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # #This script is used to download the connector plug-ins required during the running process. #All are downloaded by default. You can also choose what you need. #You only need to configure the plug-in name in config/plugin_config.
# get seatunnel home SEATUNNEL_HOME=$(cd "$(dirname "$0")";cd ../;pwd) # connector default version is 3.0.0, you can also choose a custom version. eg: 3.0.0: sh install-plugin.sh 3.0.0 version=3.0.0 if [ -n "$1" ]; then version="$1" fi echo "Install SeaTunnel connectors plugins, usage version is ${version}" # create the connectors directory (paths are quoted so an install path containing spaces works) if [ ! -d "${SEATUNNEL_HOME}/connectors" ]; then mkdir -p "${SEATUNNEL_HOME}/connectors" echo "create connectors directory" fi # read plugin_config line by line; the extra non-empty test keeps a last line that has no trailing newline while read -r line || [ -n "$line" ]; do first_char=$(echo "$line" | cut -c 1) # skip section delimiters, comment lines and blank lines if [ "$first_char" != "-" ] && [ "$first_char" != "#" ] && [ -n "$first_char" ] then echo "install connector : " "$line" "${SEATUNNEL_HOME}/mvnw" dependency:get -Dtransitive=false -DgroupId=org.apache.seatunnel -DartifactId="${line}" -Dversion="${version}" -Ddest="${SEATUNNEL_HOME}/connectors" fi done < "${SEATUNNEL_HOME}/config/plugin_config" ================================================ FILE: config/hazelcast-client.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
# hazelcast-client: cluster-name: seatunnel properties: hazelcast.logging.type: log4j2 connection-strategy: connection-retry: cluster-connect-timeout-millis: 3000 network: cluster-members: - localhost:5801 ================================================ FILE: config/hazelcast-master.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# hazelcast: cluster-name: seatunnel network: rest-api: enabled: false endpoint-groups: CLUSTER_WRITE: enabled: true DATA: enabled: true join: tcp-ip: enabled: true member-list: - localhost:5801 - localhost:5802 port: auto-increment: false port: 5801 properties: hazelcast.invocation.max.retry.count: 20 hazelcast.tcp.join.port.try.count: 30 hazelcast.logging.type: log4j2 hazelcast.operation.generic.thread.count: 50 hazelcast.heartbeat.failuredetector.type: phi-accrual hazelcast.heartbeat.interval.seconds: 2 hazelcast.max.no.heartbeat.seconds: 180 hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10 hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200 hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100 ================================================ FILE: config/hazelcast-worker.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# hazelcast: cluster-name: seatunnel network: join: tcp-ip: enabled: true member-list: - localhost:5801 - localhost:5802 port: auto-increment: false port: 5802 properties: hazelcast.invocation.max.retry.count: 20 hazelcast.tcp.join.port.try.count: 30 hazelcast.logging.type: log4j2 hazelcast.operation.generic.thread.count: 50 hazelcast.heartbeat.failuredetector.type: phi-accrual hazelcast.heartbeat.interval.seconds: 2 hazelcast.max.no.heartbeat.seconds: 180 hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10 hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200 hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100 ================================================ FILE: config/hazelcast.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# hazelcast: cluster-name: seatunnel network: rest-api: enabled: false endpoint-groups: CLUSTER_WRITE: enabled: true DATA: enabled: true join: tcp-ip: enabled: true member-list: - localhost port: auto-increment: false port: 5801 properties: hazelcast.invocation.max.retry.count: 20 hazelcast.tcp.join.port.try.count: 30 hazelcast.logging.type: log4j2 hazelcast.operation.generic.thread.count: 50 hazelcast.heartbeat.failuredetector.type: phi-accrual hazelcast.heartbeat.interval.seconds: 2 hazelcast.max.no.heartbeat.seconds: 180 hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10 hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200 hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100 ================================================ FILE: config/jvm_client_options ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # JVM Heap -Xms256m -Xmx512m # JVM Dump -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-client ================================================ FILE: config/jvm_master_options ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. 
See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # JVM Heap # -Xms2g # -Xmx2g # JVM Dump -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-server # Metaspace -XX:MaxMetaspaceSize=2g # G1GC -XX:+UseG1GC ================================================ FILE: config/jvm_options ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# # JVM Heap # -Xms2g # -Xmx2g # JVM Dump -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-server # Metaspace -XX:MaxMetaspaceSize=2g # G1GC -XX:+UseG1GC # GC Logging # Uncomment the following options to enable GC logging for troubleshooting and performance analysis. # The GC log directory will be automatically created on startup if it doesn't exist. # -XX:+PrintGCDetails # -XX:+PrintGCDateStamps # -XX:+PrintGCTimeStamps # -Xloggc:/tmp/seatunnel/gc/gc.log # -XX:+UseGCLogFileRotation # -XX:NumberOfGCLogFiles=10 # -XX:GCLogFileSize=200M # -XX:+PrintGCApplicationStoppedTime ================================================ FILE: config/jvm_worker_options ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# # JVM Heap # -Xms2g # -Xmx2g # JVM Dump -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-server # Metaspace -XX:MaxMetaspaceSize=2g # G1GC -XX:+UseG1GC ================================================ FILE: config/log4j2.properties ================================================ ################################################################################ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ # The minimum amount of time, in seconds, that must elapse before the file configuration is checked for changes. 
monitorInterval = 60 property.file_path = ${sys:seatunnel.logs.path:-/tmp/seatunnel/logs} property.file_name = ${sys:seatunnel.logs.file_name:-seatunnel} property.file_split_size = 100MB property.file_count = 100 property.file_ttl = 7d rootLogger.level = INFO logger.zeta.name=org.apache.seatunnel.engine logger.zeta.level=INFO logger.debezium.name=io.debezium.connector logger.debezium.level=WARN ############################ log output to console ############################# #rootLogger.appenderRef.consoleStdout.ref = consoleStdoutAppender #rootLogger.appenderRef.consoleStderr.ref = consoleStderrAppender ############################ log output to console ############################# ############################ log output to file ############################# rootLogger.appenderRef.file.ref = fileAppender ############################ log output to file ############################# appender.consoleStdout.name = consoleStdoutAppender appender.consoleStdout.type = CONSOLE appender.consoleStdout.target = SYSTEM_OUT appender.consoleStdout.layout.type = PatternLayout appender.consoleStdout.layout.pattern = [%X{ST-JID}] %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n appender.consoleStdout.filter.acceptLtWarn.type = ThresholdFilter appender.consoleStdout.filter.acceptLtWarn.level = WARN appender.consoleStdout.filter.acceptLtWarn.onMatch = DENY appender.consoleStdout.filter.acceptLtWarn.onMismatch = ACCEPT appender.consoleStderr.name = consoleStderrAppender appender.consoleStderr.type = CONSOLE appender.consoleStderr.target = SYSTEM_ERR appender.consoleStderr.layout.type = PatternLayout appender.consoleStderr.layout.pattern = [%X{ST-JID}] %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n appender.consoleStderr.filter.acceptGteWarn.type = ThresholdFilter appender.consoleStderr.filter.acceptGteWarn.level = WARN appender.consoleStderr.filter.acceptGteWarn.onMatch = ACCEPT appender.consoleStderr.filter.acceptGteWarn.onMismatch = DENY 
appender.routing.name = routingAppender appender.routing.type = Routing appender.routing.purge.type = IdlePurgePolicy appender.routing.purge.timeToLive = 60 appender.routing.purge.checkInterval = 1 appender.routing.route.type = Routes appender.routing.route.pattern = $${ctx:ST-JID} appender.routing.route.system.type = Route appender.routing.route.system.key = $${ctx:ST-JID} appender.routing.route.system.ref = fileAppender appender.routing.route.job.type = Route appender.routing.route.job.appender.type = File appender.routing.route.job.appender.name = job-${ctx:ST-JID} appender.routing.route.job.appender.fileName = ${file_path}/job-${ctx:ST-JID}.log appender.routing.route.job.appender.layout.type = PatternLayout appender.routing.route.job.appender.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n appender.file.name = fileAppender appender.file.type = RollingFile appender.file.fileName = ${file_path}/${file_name}.log appender.file.filePattern = ${file_path}/${file_name}.log.%d{yyyy-MM-dd}-%i appender.file.append = true appender.file.layout.type = PatternLayout appender.file.layout.pattern = [%X{ST-JID}] %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n appender.file.policies.type = Policies appender.file.policies.time.type = TimeBasedTriggeringPolicy appender.file.policies.time.modulate = true appender.file.policies.size.type = SizeBasedTriggeringPolicy appender.file.policies.size.size = ${file_split_size} appender.file.strategy.type = DefaultRolloverStrategy appender.file.strategy.fileIndex = nomax appender.file.strategy.action.type = Delete appender.file.strategy.action.basepath = ${file_path} appender.file.strategy.action.maxDepth = 1 appender.file.strategy.action.condition.type = IfFileName appender.file.strategy.action.condition.glob = ${file_name}.log* appender.file.strategy.action.condition.nested_condition.type = IfAny appender.file.strategy.action.condition.nested_condition.lastModify.type = IfLastModified 
appender.file.strategy.action.condition.nested_condition.lastModify.age = ${file_ttl} appender.file.strategy.action.condition.nested_condition.fileCount.type = IfAccumulatedFileCount appender.file.strategy.action.condition.nested_condition.fileCount.exceeds = ${file_count} ================================================ FILE: config/log4j2_client.properties ================================================ ################################################################################ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ # The minimum amount of time, in seconds, that must elapse before the file configuration is checked for changes. 
monitorInterval = 60 property.file_path = ${sys:seatunnel.logs.path:-/tmp/seatunnel/logs} property.file_name = ${sys:seatunnel.logs.file_name:-seatunnel} property.file_split_size = 100MB property.file_count = 100 property.file_ttl = 7d rootLogger.level = INFO ############################ log output to console ############################# rootLogger.appenderRef.consoleStdout.ref = consoleStdoutAppender rootLogger.appenderRef.consoleStderr.ref = consoleStderrAppender ############################ log output to console ############################# ############################ log output to file ############################# #rootLogger.appenderRef.file.ref = fileAppender ############################ log output to file ############################# appender.consoleStdout.name = consoleStdoutAppender appender.consoleStdout.type = CONSOLE appender.consoleStdout.target = SYSTEM_OUT appender.consoleStdout.layout.type = PatternLayout appender.consoleStdout.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n appender.consoleStdout.filter.acceptLtWarn.type = ThresholdFilter appender.consoleStdout.filter.acceptLtWarn.level = WARN appender.consoleStdout.filter.acceptLtWarn.onMatch = DENY appender.consoleStdout.filter.acceptLtWarn.onMismatch = ACCEPT appender.consoleStderr.name = consoleStderrAppender appender.consoleStderr.type = CONSOLE appender.consoleStderr.target = SYSTEM_ERR appender.consoleStderr.layout.type = PatternLayout appender.consoleStderr.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n appender.consoleStderr.filter.acceptGteWarn.type = ThresholdFilter appender.consoleStderr.filter.acceptGteWarn.level = WARN appender.consoleStderr.filter.acceptGteWarn.onMatch = ACCEPT appender.consoleStderr.filter.acceptGteWarn.onMismatch = DENY #appender.file.name = fileAppender #appender.file.type = RollingFile #appender.file.fileName = ${file_path}/${file_name}.log #appender.file.filePattern = 
${file_path}/${file_name}.log.%d{yyyy-MM-dd}-%i #appender.file.append = true #appender.file.layout.type = PatternLayout #appender.file.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n #appender.file.policies.type = Policies #appender.file.policies.time.type = TimeBasedTriggeringPolicy #appender.file.policies.time.modulate = true #appender.file.policies.size.type = SizeBasedTriggeringPolicy #appender.file.policies.size.size = ${file_split_size} #appender.file.strategy.type = DefaultRolloverStrategy #appender.file.strategy.fileIndex = nomax #appender.file.strategy.action.type = Delete #appender.file.strategy.action.basepath = ${file_path} #appender.file.strategy.action.maxDepth = 1 #appender.file.strategy.action.condition.type = IfFileName #appender.file.strategy.action.condition.glob = ${file_name}.log* #appender.file.strategy.action.condition.nested_condition.type = IfAny #appender.file.strategy.action.condition.nested_condition.lastModify.type = IfLastModified #appender.file.strategy.action.condition.nested_condition.lastModify.age = ${file_ttl} #appender.file.strategy.action.condition.nested_condition.fileCount.type = IfAccumulatedFileCount #appender.file.strategy.action.condition.nested_condition.fileCount.exceeds = ${file_count} ================================================ FILE: config/plugin_config ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # # This mapping is used to resolve the Jar package name without version (or call artifactId) # # corresponding to the module in the user Config, helping SeaTunnel to load the correct Jar package. # Don't modify the delimiter " -- ", just select the plugin you need --connectors-v2-- connector-amazondynamodb connector-assert connector-cassandra connector-cdc-mysql connector-cdc-mongodb connector-cdc-sqlserver connector-cdc-postgres connector-cdc-oracle connector-cdc-tidb connector-clickhouse connector-datahub connector-databend connector-dingtalk connector-doris connector-elasticsearch connector-email connector-file-ftp connector-file-hadoop connector-file-local connector-file-oss connector-file-jindo-oss connector-file-s3 connector-file-sftp connector-file-obs connector-google-sheets connector-google-firestore connector-graphql connector-hive connector-http-base connector-http-feishu connector-http-gitlab connector-http-github connector-http-jira connector-http-klaviyo connector-http-lemlist connector-http-myhours connector-http-notion connector-http-onesignal connector-http-wechat connector-http-airtable connector-hudi connector-iceberg connector-influxdb connector-iotdb connector-jdbc connector-kafka connector-kudu connector-maxcompute connector-mongodb connector-neo4j connector-openmldb connector-pulsar connector-rabbitmq connector-redis connector-druid connector-s3-redshift connector-sentry connector-slack connector-socket connector-starrocks connector-tablestore connector-selectdb-cloud connector-hbase connector-amazonsqs connector-easysearch connector-paimon 
connector-rocketmq connector-tdengine connector-web3j connector-milvus connector-activemq connector-prometheus connector-sls connector-qdrant connector-typesense connector-cdc-opengauss connector-sensorsdata connector-hugegraph connector-lance ================================================ FILE: config/seatunnel-env.cmd ================================================ @echo off REM Licensed to the Apache Software Foundation (ASF) under one or more REM contributor license agreements. See the NOTICE file distributed with REM this work for additional information regarding copyright ownership. REM The ASF licenses this file to You under the Apache License, Version 2.0 REM (the "License"); you may not use this file except in compliance with REM the License. You may obtain a copy of the License at REM REM http://www.apache.org/licenses/LICENSE-2.0 REM REM Unless required by applicable law or agreed to in writing, software REM distributed under the License is distributed on an "AS IS" BASIS, REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. REM See the License for the specific language governing permissions and REM limitations under the License. REM Home directory of spark distribution. if "%SPARK_HOME%" == "" set "SPARK_HOME=C:\Program Files\spark" REM Home directory of flink distribution. if "%FLINK_HOME%" == "" set "FLINK_HOME=C:\Program Files\flink" REM Whether to enable metalake (true/false). if "%METALAKE_ENABLED%" == "" set "METALAKE_ENABLED=false" REM Type of metalake implementation. 
if "%METALAKE_TYPE%" == "" set "METALAKE_TYPE=gravitino" REM Metalake service URL, format: http://host:port/api/metalakes/{metalake_name}/catalogs/ if "%METALAKE_URL%" == "" set "METALAKE_URL=http://localhost:8090/api/metalakes/default_metalake_name/catalogs/" ================================================ FILE: config/seatunnel-env.sh ================================================ #!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Home directory of spark distribution. SPARK_HOME=${SPARK_HOME:-/opt/spark} # Home directory of flink distribution. FLINK_HOME=${FLINK_HOME:-/opt/flink} # Whether to enable metalake (true/false). METALAKE_ENABLED=${METALAKE_ENABLED:-false} # Type of metalake implementation. METALAKE_TYPE=${METALAKE_TYPE:-gravitino} # Metalake service URL, format: http://host:port/api/metalakes/{metalake_name}/catalogs/. METALAKE_URL=${METALAKE_URL:-http://localhost:8090/api/metalakes/default_metalake_name/catalogs/} ================================================ FILE: config/seatunnel.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. 
See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # seatunnel: engine: classloader-cache-mode: true history-job-expire-minutes: 1440 backup-count: 1 queue-type: blockingqueue print-execution-info-interval: 60 print-job-metrics-info-interval: 60 slot-service: dynamic-slot: true checkpoint: interval: 10000 timeout: 60000 storage: type: hdfs max-retained: 3 plugin-config: namespace: /tmp/seatunnel/checkpoint_snapshot storage.type: hdfs fs.defaultFS: file:///tmp/ # Ensure that the directory has written permission telemetry: metric: enabled: false logs: scheduled-deletion-enable: true http: enable-http: true port: 8080 enable-dynamic-port: false # Uncomment the following lines to enable basic authentication for web UI # enable-basic-auth: true # basic-auth-username: admin # basic-auth-password: admin ================================================ FILE: config/v2.batch.config.template ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ###### ###### This config file is a demonstration of batch processing in SeaTunnel config ###### env { # You can set SeaTunnel environment configuration here parallelism = 2 job.mode = "BATCH" checkpoint.interval = 10000 } source { # This is an example source plugin **only for testing and demonstrating the source plugin feature** FakeSource { parallelism = 2 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } # If you would like to get more information about how to configure SeaTunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/source } sink { Console { } # If you would like to get more information about how to configure SeaTunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink } ================================================ FILE: config/v2.streaming.conf.template ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ###### ###### This config file is a demonstration of streaming processing in SeaTunnel config ###### env { # You can set SeaTunnel environment configuration here parallelism = 2 job.mode = "STREAMING" checkpoint.interval = 2000 } source { # This is an example source plugin **only for testing and demonstrating the source plugin feature** FakeSource { parallelism = 2 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } # If you would like to get more information about how to configure SeaTunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/source } sink { Console { } # If you would like to get more information about how to configure SeaTunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink } ================================================ FILE: deploy/kubernetes/seatunnel/Chart.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # apiVersion: v2 name: seatunnel-helm description: SeaTunnel is a next-generation, high-performance, distributed data integration tool, capable of synchronizing vast amounts of data daily. It's trusted by numerous companies for its efficiency and stability. home: https://seatunnel.apache.org icon: https://seatunnel.apache.org/image/logo.png keywords: - seatunnel - integration # A chart can be either an 'application' or a 'library' chart. # # Application charts are a collection of templates that can be packaged into versioned archives # to be deployed. # # Library charts provide useful utilities or functions for the chart developer. They're included as # a dependency of application charts to inject those utilities and functions into the rendering # pipeline. Library charts do not define any templates and therefore cannot be deployed. type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. version: 2.3.10 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. appVersion: 2.3.10 ================================================ FILE: deploy/kubernetes/seatunnel/conf/hazelcast-client.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. 
# The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # hazelcast-client: cluster-name: {{ include "seatunnel.fullname" . }} properties: hazelcast.logging.type: log4j2 connection-strategy: connection-retry: cluster-connect-timeout-millis: 3000 network: cluster-members: - {{ include "seatunnel.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local:5801 ================================================ FILE: deploy/kubernetes/seatunnel/conf/hazelcast-master.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # hazelcast: cluster-name: {{ include "seatunnel.fullname" . 
}} network: rest-api: enabled: true endpoint-groups: CLUSTER_WRITE: enabled: true DATA: enabled: true join: kubernetes: enabled: true service-dns: {{ include "seatunnel.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local service-port: 5801 port: auto-increment: false port: 5801 properties: hazelcast.invocation.max.retry.count: 20 hazelcast.tcp.join.port.try.count: 30 hazelcast.logging.type: log4j2 hazelcast.operation.generic.thread.count: 50 hazelcast.heartbeat.failuredetector.type: phi-accrual hazelcast.heartbeat.interval.seconds: 2 hazelcast.max.no.heartbeat.seconds: 180 hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10 hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200 hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100 ================================================ FILE: deploy/kubernetes/seatunnel/conf/hazelcast-worker.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # hazelcast: cluster-name: {{ include "seatunnel.fullname" . }} network: rest-api: enabled: true endpoint-groups: CLUSTER_WRITE: enabled: true DATA: enabled: true join: kubernetes: enabled: true service-dns: {{ include "seatunnel.fullname" . 
}}.{{ .Release.Namespace }}.svc.cluster.local service-port: 5801 port: auto-increment: false port: 5801 properties: hazelcast.invocation.max.retry.count: 20 hazelcast.tcp.join.port.try.count: 30 hazelcast.logging.type: log4j2 hazelcast.operation.generic.thread.count: 50 hazelcast.heartbeat.failuredetector.type: phi-accrual hazelcast.heartbeat.interval.seconds: 2 hazelcast.max.no.heartbeat.seconds: 180 hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10 hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200 hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100 member-attributes: rule: type: string value: worker ================================================ FILE: deploy/kubernetes/seatunnel/conf/jvm_client_options ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # JVM Heap -Xms256m -Xmx512m # JVM Dump -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-client ================================================ FILE: deploy/kubernetes/seatunnel/conf/jvm_master_options ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. 
See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # JVM Heap # -Xms2g # -Xmx2g # JVM Dump -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-server # Metaspace -XX:MaxMetaspaceSize=2g # G1GC -XX:+UseG1GC ================================================ FILE: deploy/kubernetes/seatunnel/conf/jvm_worker_options ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# # JVM Heap # -Xms2g # -Xmx2g # JVM Dump -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-server # Metaspace -XX:MaxMetaspaceSize=2g # G1GC -XX:+UseG1GC ================================================ FILE: deploy/kubernetes/seatunnel/conf/log4j2.properties ================================================ ################################################################################ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ # The minimum amount of time, in seconds, that must elapse before the file configuration is checked for changes. 
monitorInterval = 60 property.file_path = ${sys:seatunnel.logs.path:-/tmp/seatunnel/logs} property.file_name = ${sys:seatunnel.logs.file_name:-seatunnel} property.file_split_size = 100MB property.file_count = 100 property.file_ttl = 7d rootLogger.level = INFO logger.zeta.name=org.apache.seatunnel.engine logger.zeta.level=INFO logger.debezium.name=io.debezium.connector logger.debezium.level=WARN ############################ log output to console ############################# rootLogger.appenderRef.consoleStdout.ref = consoleStdoutAppender rootLogger.appenderRef.consoleStderr.ref = consoleStderrAppender ############################ log output to console ############################# ############################ log output to file ############################# #rootLogger.appenderRef.file.ref = fileAppender ############################ log output to file ############################# appender.consoleStdout.name = consoleStdoutAppender appender.consoleStdout.type = CONSOLE appender.consoleStdout.target = SYSTEM_OUT appender.consoleStdout.layout.type = PatternLayout appender.consoleStdout.layout.pattern = [%X{ST-JID}] %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n appender.consoleStdout.filter.acceptLtWarn.type = ThresholdFilter appender.consoleStdout.filter.acceptLtWarn.level = WARN appender.consoleStdout.filter.acceptLtWarn.onMatch = DENY appender.consoleStdout.filter.acceptLtWarn.onMismatch = ACCEPT appender.consoleStderr.name = consoleStderrAppender appender.consoleStderr.type = CONSOLE appender.consoleStderr.target = SYSTEM_ERR appender.consoleStderr.layout.type = PatternLayout appender.consoleStderr.layout.pattern = [%X{ST-JID}] %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n appender.consoleStderr.filter.acceptGteWarn.type = ThresholdFilter appender.consoleStderr.filter.acceptGteWarn.level = WARN appender.consoleStderr.filter.acceptGteWarn.onMatch = ACCEPT appender.consoleStderr.filter.acceptGteWarn.onMismatch = DENY appender.routing.name 
= routingAppender appender.routing.type = Routing appender.routing.purge.type = IdlePurgePolicy appender.routing.purge.timeToLive = 60 appender.routing.route.type = Routes appender.routing.route.pattern = $${ctx:ST-JID} appender.routing.route.system.type = Route appender.routing.route.system.key = $${ctx:ST-JID} appender.routing.route.system.ref = fileAppender appender.routing.route.job.type = Route appender.routing.route.job.appender.type = File appender.routing.route.job.appender.name = job-${ctx:ST-JID} appender.routing.route.job.appender.fileName = ${file_path}/job-${ctx:ST-JID}.log appender.routing.route.job.appender.layout.type = PatternLayout appender.routing.route.job.appender.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n appender.file.name = fileAppender appender.file.type = RollingFile appender.file.fileName = ${file_path}/${file_name}.log appender.file.filePattern = ${file_path}/${file_name}.log.%d{yyyy-MM-dd}-%i appender.file.append = true appender.file.layout.type = PatternLayout appender.file.layout.pattern = [%X{ST-JID}] %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n appender.file.policies.type = Policies appender.file.policies.time.type = TimeBasedTriggeringPolicy appender.file.policies.time.modulate = true appender.file.policies.size.type = SizeBasedTriggeringPolicy appender.file.policies.size.size = ${file_split_size} appender.file.strategy.type = DefaultRolloverStrategy appender.file.strategy.fileIndex = nomax appender.file.strategy.action.type = Delete appender.file.strategy.action.basepath = ${file_path} appender.file.strategy.action.maxDepth = 1 appender.file.strategy.action.condition.type = IfFileName appender.file.strategy.action.condition.glob = ${file_name}.log* appender.file.strategy.action.condition.nested_condition.type = IfAny appender.file.strategy.action.condition.nested_condition.lastModify.type = IfLastModified appender.file.strategy.action.condition.nested_condition.lastModify.age = 
${file_ttl} appender.file.strategy.action.condition.nested_condition.fileCount.type = IfAccumulatedFileCount appender.file.strategy.action.condition.nested_condition.fileCount.exceeds = ${file_count} ================================================ FILE: deploy/kubernetes/seatunnel/conf/seatunnel.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # seatunnel: engine: history-job-expire-minutes: 1440 backup-count: 1 queue-type: blockingqueue print-execution-info-interval: 60 print-job-metrics-info-interval: 60 classloader-cache-mode: true slot-service: dynamic-slot: true http: enable-http: true port: 8080 enable-dynamic-port: false port-range: 100 checkpoint: interval: 300000 timeout: 10000 storage: type: hdfs max-retained: 3 plugin-config: namespace: /tmp/seatunnel/checkpoint_snapshot/ storage.type: hdfs fs.defaultFS: file:///tmp/ telemetry: metric: enabled: true ================================================ FILE: deploy/kubernetes/seatunnel/templates/NOTES.txt ================================================ {{/* Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. 
See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */}} ** Please be patient while the chart seatunnel {{ .Chart.AppVersion }} is being deployed ** Access seatunnel UI URL by: {{- if .Values.ingress.enabled }} seatunnel restapi URL for running jobs: http{{ if .Values.ingress.tls.enabled }}s{{ end }}://{{ .Values.ingress.host }}/running-jobs seatunnel restapi URL for system monitoring information: http{{ if .Values.ingress.tls.enabled }}s{{ end }}://{{ .Values.ingress.host }}/system-monitoring-information For more restapi please refer to https://seatunnel.apache.org/docs/{{ .Chart.AppVersion }}/seatunnel-engine/rest-api-v2 {{- else }} kubectl port-forward -n {{ .Release.Namespace }} svc/{{ template "seatunnel.fullname" . }}-master 8080:8080 seatunnel restapi URL for running jobs: http://127.0.0.1:8080/running-jobs seatunnel restapi URL for system monitoring information: http://127.0.0.1:8080/system-monitoring-information For more restapi please refer to https://seatunnel.apache.org/docs/{{ .Chart.AppVersion }}/seatunnel-engine/rest-api-v2 {{- end }} Or you can just go into master pod, and use local curl command. 
MASTER_POD=$(kubectl get po -n {{ .Release.Namespace }} -l 'app.kubernetes.io/name={{ template "seatunnel.fullname" . }}-master' | sed '1d' | awk '{print $1}' | head -n1) kubectl -n {{ .Release.Namespace }} exec -it $MASTER_POD -- /bin/bash curl http://127.0.0.1:8080/running-jobs curl http://127.0.0.1:8080/system-monitoring-information ================================================ FILE: deploy/kubernetes/seatunnel/templates/_helpers.tpl ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # {{/* vim: set filetype=mustache: */}} {{/* Create a default fully qualified app name. We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). The release name is used as the full name; any trailing "-" left after truncation is trimmed. */}} {{- define "seatunnel.fullname" -}} {{- .Release.Name | trunc 63 | trimSuffix "-" -}} {{- end -}} {{/* Create default docker images' fullname. */}} {{- define "seatunnel.image.fullname.master" -}} {{- .Values.image.registry }}:{{ .Values.image.tag | default .Chart.AppVersion -}} {{- end -}} {{- define "seatunnel.image.fullname.worker" -}} {{- .Values.image.registry }}:{{ .Values.image.tag | default .Chart.AppVersion -}} {{- end -}} {{/* Create a default common labels. 
*/}} {{- define "seatunnel.common.labels" -}} app.kubernetes.io/instance: {{ .Release.Name }} app.kubernetes.io/managed-by: {{ .Release.Service }} app.kubernetes.io/version: {{ .Chart.AppVersion }} {{- end -}} {{/* Create a master labels. */}} {{- define "seatunnel.master.labels" -}} app.kubernetes.io/name: {{ include "seatunnel.fullname" . }}-master app.kubernetes.io/component: master {{ include "seatunnel.common.labels" . }} {{- end -}} {{/* Create a worker labels. */}} {{- define "seatunnel.worker.labels" -}} app.kubernetes.io/name: {{ include "seatunnel.fullname" . }}-worker app.kubernetes.io/component: worker {{ include "seatunnel.common.labels" . }} {{- end -}} {{/* Get the ConfigMap name - either existing or the one to be created. */}} {{- define "seatunnel.configMapName" -}} {{- if .Values.configMap.create -}} {{- include "seatunnel.fullname" . }}-configs {{- else -}} {{- .Values.configMap.existingConfigMapName }} {{- end -}} {{- end -}} ================================================ FILE: deploy/kubernetes/seatunnel/templates/configmap.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# {{- if .Values.configMap.create }} apiVersion: v1 kind: ConfigMap metadata: name: {{ include "seatunnel.fullname" . }}-configs labels: app.kubernetes.io/name: {{ include "seatunnel.fullname" . }}-configs {{- include "seatunnel.master.labels" . | nindent 4 }} data: {{- range $path, $_ := .Files.Glob "conf/*" }} {{- base $path | nindent 2 }}: |- {{- tpl ($.Files.Get $path) $ | nindent 4 -}} {{- end }} {{- end }} ================================================ FILE: deploy/kubernetes/seatunnel/templates/deployment-seatunnel-master.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # apiVersion: apps/v1 kind: Deployment metadata: name: {{ include "seatunnel.fullname" . }}-master labels: {{- include "seatunnel.master.labels" . | nindent 4 }} spec: {{- if .Values.master.strategy }} strategy: {{- toYaml .Values.master.strategy | nindent 4 }} {{- end }} replicas: {{ .Values.master.replicas }} selector: matchLabels: {{- include "seatunnel.master.labels" . | nindent 6 }} template: metadata: {{- if .Values.master.annotations }} annotations: {{- toYaml .Values.master.annotations | nindent 8 }} {{- end }} labels: {{- include "seatunnel.master.labels" . 
| nindent 8 }} spec: serviceAccountName: {{ template "seatunnel.fullname" . }} {{- if .Values.master.affinity }} affinity: {{- toYaml .Values.master.affinity | nindent 8 }} {{- end }} {{- if .Values.master.nodeSelector }} nodeSelector: {{- toYaml .Values.master.nodeSelector | nindent 8 }} {{- end }} {{- if .Values.master.tolerations }} tolerations: {{- toYaml .Values.master.tolerations | nindent 8 }} {{- end }} {{- if .Values.image.pullSecret }} imagePullSecrets: - name: {{ .Values.image.pullSecret }} {{- end }} containers: - name: {{ include "seatunnel.fullname" . }}-master image: {{ include "seatunnel.image.fullname.master" . }} imagePullPolicy: {{ .Values.image.pullPolicy }} ports: - containerPort: 5801 name: "hazelcast-port" - containerPort: 8080 name: "master-port" {{- if .Values.master.command }} {{- /* toYaml keeps a user-supplied command list a real YAML sequence; printing the list directly yields "[a b c]", which YAML reads as a single element. */}} command: {{- toYaml .Values.master.command | nindent 12 }} {{- else }} command: ["/bin/sh","-c","/opt/seatunnel/bin/seatunnel-cluster.sh -r master"] {{- end }} {{- if .Values.master.resources }} resources: {{- toYaml .Values.master.resources | nindent 12 }} {{- end }} {{- if .Values.master.livenessProbe.enabled }} {{- /* "enabled" is only this chart's on/off switch, so it is removed before the map is emitted as the probe definition. */}} livenessProbe: {{- toYaml (omit .Values.master.livenessProbe "enabled") | nindent 12 }} {{- end }} {{- if .Values.env }} env: {{- toYaml .Values.env | nindent 12 }} {{- end }} volumeMounts: # config mount {{- range $path, $_ := .Files.Glob "conf/*" }} - name: seatunnel-configs mountPath: /opt/seatunnel/config/{{ base $path }} subPath: {{ base $path }} {{- end }} volumes: - name: seatunnel-configs configMap: name: {{ include "seatunnel.configMapName" . }} ================================================ FILE: deploy/kubernetes/seatunnel/templates/deployment-seatunnel-worker.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # apiVersion: apps/v1 kind: Deployment metadata: name: {{ include "seatunnel.fullname" . }}-worker labels: {{- include "seatunnel.worker.labels" . | nindent 4 }} spec: {{- if .Values.worker.strategy }} strategy: {{- toYaml .Values.worker.strategy | nindent 4 }} {{- end }} replicas: {{ .Values.worker.replicas }} selector: matchLabels: {{- include "seatunnel.worker.labels" . | nindent 6 }} template: metadata: {{- if .Values.worker.annotations }} annotations: {{- toYaml .Values.worker.annotations | nindent 8 }} {{- end }} labels: {{- include "seatunnel.worker.labels" . | nindent 8 }} spec: serviceAccountName: {{ template "seatunnel.fullname" . }} {{- if .Values.worker.affinity }} affinity: {{- toYaml .Values.worker.affinity | nindent 8 }} {{- end }} {{- if .Values.worker.nodeSelector }} nodeSelector: {{- toYaml .Values.worker.nodeSelector | nindent 8 }} {{- end }} {{- if .Values.worker.tolerations }} tolerations: {{- toYaml .Values.worker.tolerations | nindent 8 }} {{- end }} {{- if .Values.image.pullSecret }} imagePullSecrets: - name: {{ .Values.image.pullSecret }} {{- end }} containers: - name: {{ include "seatunnel.fullname" . }}-worker image: {{ include "seatunnel.image.fullname.worker" . 
}} imagePullPolicy: {{ .Values.image.pullPolicy }} ports: - containerPort: 5801 name: "hazelcast-port" {{- if .Values.worker.command }} {{- /* toYaml keeps a user-supplied command list a real YAML sequence; printing the list directly yields "[a b c]", which YAML reads as a single element. */}} command: {{- toYaml .Values.worker.command | nindent 12 }} {{- else }} command: ["/bin/sh","-c","/opt/seatunnel/bin/seatunnel-cluster.sh -r worker"] {{- end }} {{- if .Values.worker.resources }} resources: {{- toYaml .Values.worker.resources | nindent 12 }} {{- end }} {{- if .Values.worker.livenessProbe.enabled }} {{- /* "enabled" is only this chart's on/off switch, so it is removed before the map is emitted as the probe definition. */}} livenessProbe: {{- toYaml (omit .Values.worker.livenessProbe "enabled") | nindent 12 }} {{- end }} {{- if .Values.env }} env: {{- toYaml .Values.env | nindent 12 }} {{- end }} volumeMounts: # config mount {{- range $path, $_ := .Files.Glob "conf/*" }} - name: seatunnel-configs mountPath: /opt/seatunnel/config/{{ base $path }} subPath: {{ base $path }} {{- end }} volumes: - name: seatunnel-configs configMap: name: {{ include "seatunnel.configMapName" . }} ================================================ FILE: deploy/kubernetes/seatunnel/templates/ingress.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
# {{- if .Values.ingress.enabled }} {{- /* Pick the newest Ingress API group/version the cluster reports; the backend and pathType fields below follow the same check. */}} {{- if .Capabilities.APIVersions.Has "networking.k8s.io/v1/Ingress" }} apiVersion: networking.k8s.io/v1 {{- else if .Capabilities.APIVersions.Has "networking.k8s.io/v1beta1/Ingress" }} apiVersion: networking.k8s.io/v1beta1 {{- else }} apiVersion: extensions/v1beta1 {{- end }} kind: Ingress metadata: name: {{ include "seatunnel.fullname" . }} labels: app.kubernetes.io/name: {{ include "seatunnel.fullname" . }} {{- include "seatunnel.common.labels" . | nindent 4 }} {{- with .Values.ingress.annotations }} annotations: {{- toYaml . | nindent 4 }} {{- end }} spec: {{- if .Values.ingress.className }} ingressClassName: {{ .Values.ingress.className }} {{- end }} rules: - host: "{{ .Values.ingress.host }}" http: paths: - path: {{ .Values.ingress.path }} backend: {{- if .Capabilities.APIVersions.Has "networking.k8s.io/v1/Ingress" }} service: name: {{ include "seatunnel.fullname" . }}-master port: number: 8080 {{- else }} serviceName: {{ include "seatunnel.fullname" . }}-master servicePort: 8080 {{- end }} {{- if .Capabilities.APIVersions.Has "networking.k8s.io/v1/Ingress" }} pathType: Prefix {{- end }} {{- if .Values.ingress.tls.enabled }} tls: - hosts: {{- /* Quoted like the rule host above so a wildcard host such as *.example.com stays a plain string. */}} - {{ .Values.ingress.host | quote }} secretName: {{ .Values.ingress.tls.secretName }} {{- end }} {{- end }} ================================================ FILE: deploy/kubernetes/seatunnel/templates/rbac.yaml ================================================ # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License.
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. apiVersion: v1 kind: ServiceAccount metadata: labels: app: {{ template "seatunnel.fullname" . }} chart: {{ .Chart.Name }}-{{ .Chart.Version }} release: {{ .Release.Name }} name: {{ template "seatunnel.fullname" . }} --- kind: Role apiVersion: rbac.authorization.k8s.io/v1 metadata: name: {{ template "seatunnel.fullname" . }} labels: app: {{ template "seatunnel.fullname" . }} chart: "{{ .Chart.Name }}-{{ .Chart.Version }}" release: "{{ .Release.Name }}" rules: - apiGroups: [""] resources: ["configmaps"] verbs: ["get", "watch", "list"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: {{ template "seatunnel.fullname" . }} labels: app: {{ template "seatunnel.fullname" . }} chart: "{{ .Chart.Name }}-{{ .Chart.Version }}" release: "{{ .Release.Name }}" roleRef: apiGroup: rbac.authorization.k8s.io kind: Role name: {{ template "seatunnel.fullname" . }} subjects: - kind: ServiceAccount name: {{ template "seatunnel.fullname" . }} namespace: {{ .Release.Namespace }} ================================================ FILE: deploy/kubernetes/seatunnel/templates/service-headless.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # use for hazelcast cluster join apiVersion: v1 kind: Service metadata: name: {{ include "seatunnel.fullname" . }} labels: {{- include "seatunnel.common.labels" . | nindent 4 }} {{- /* values.yaml defines no "namespace" key, so fall back to the release namespace; an explicitly supplied .Values.namespace is still honoured. */}} namespace: {{ default .Release.Namespace .Values.namespace }} spec: type: ClusterIP clusterIP: None ports: - name: "hazelcast-port" port: 5801 selector: {{- include "seatunnel.common.labels" . | nindent 4 }} ================================================ FILE: deploy/kubernetes/seatunnel/templates/service-master-headless.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # use for access seatunnel from outside system via rest api apiVersion: v1 kind: Service metadata: name: {{ include "seatunnel.fullname" . }}-master labels: {{- include "seatunnel.master.labels" .
| nindent 4 }} {{- /* values.yaml defines no "namespace" key, so fall back to the release namespace; an explicitly supplied .Values.namespace is still honoured. */}} namespace: {{ default .Release.Namespace .Values.namespace }} spec: clusterIP: "None" ports: - name: "master-port" port: 8080 targetPort: 8080 protocol: TCP selector: {{- include "seatunnel.master.labels" . | nindent 4 }} ================================================ FILE: deploy/kubernetes/seatunnel/values.yaml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Default values for seatunnel-chart. # This is a YAML-formatted file. # Declare variables to be passed into your templates. image: registry: "apache/seatunnel" tag: "" pullPolicy: "IfNotPresent" pullSecret: "" # ConfigMap settings configMap: # If true, create a new ConfigMap. If false, use existingConfigMapName create: true # Name of existing ConfigMap to use (only used when create=false) # The ConfigMap should contain all config files: hazelcast-client.yaml, hazelcast-master.yaml, # hazelcast-worker.yaml, jvm_client_options, jvm_master_options, jvm_worker_options, # log4j2.properties, seatunnel.yaml existingConfigMapName: "" # The env for pod env: - name: TZ value: Asia/Shanghai master: ## The command to start master. command: [] ## The deployment strategy to use to replace existing pods with new ones.
strategy: type: RollingUpdate rollingUpdate: maxUnavailable: 25% maxSurge: 50% ## Replicas is the desired number of replicas of the given Template. replicas: "2" ## You can use annotations to attach arbitrary non-identifying metadata to objects. ## Clients such as tools and libraries can retrieve this metadata. annotations: prometheus.io/path: /hazelcast/rest/instance/metrics prometheus.io/port: "5801" prometheus.io/scrape: "true" prometheus.io/role: "seatunnel-master" ## Affinity is a group of affinity scheduling rules. If specified, the pod's scheduling constraints. ## More info: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.17/#affinity-v1-core affinity: {} ## NodeSelector is a selector which must be true for the pod to fit on a node. ## Selector which must match a node's labels for the pod to be scheduled on that node. ## More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ nodeSelector: {} ## Tolerations are appended (excluding duplicates) to pods running with this RuntimeClass during admission, ## effectively unioning the set of nodes tolerated by the pod and the RuntimeClass. tolerations: [] ## Compute Resources required by this container. Cannot be updated. ## More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container resources: {} # resources: # limits: # memory: "4Gi" # cpu: "4" # requests: # memory: "2Gi" # cpu: "500m" ## Periodic probe of container liveness. Container will be restarted if the probe fails. Cannot be updated. ## More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes livenessProbe: tcpSocket: port: hazelcast-port initialDelaySeconds: 30 periodSeconds: 30 timeoutSeconds: 5 failureThreshold: 3 successThreshold: 1 ## Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. 
## More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes readinessProbe: enabled: true initialDelaySeconds: 30 periodSeconds: 30 timeoutSeconds: 5 failureThreshold: 3 successThreshold: 1 worker: ## The command to start worker. command: [] ## The deployment strategy to use to replace existing pods with new ones. strategy: type: RollingUpdate rollingUpdate: maxUnavailable: 25% maxSurge: 50% ## Replicas is the desired number of replicas of the given Template. replicas: "2" ## You can use annotations to attach arbitrary non-identifying metadata to objects. ## Clients such as tools and libraries can retrieve this metadata. ## Add enable prometheus scrape for metrics collection. annotations: prometheus.io/path: /hazelcast/rest/instance/metrics prometheus.io/port: "5801" prometheus.io/scrape: "true" prometheus.io/role: "seatunnel-worker" ## Affinity is a group of affinity scheduling rules. If specified, the pod's scheduling constraints. ## More info: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.17/#affinity-v1-core affinity: {} ## NodeSelector is a selector which must be true for the pod to fit on a node. ## Selector which must match a node's labels for the pod to be scheduled on that node. ## More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ nodeSelector: {} ## Tolerations are appended (excluding duplicates) to pods running with this RuntimeClass during admission, ## effectively unioning the set of nodes tolerated by the pod and the RuntimeClass. tolerations: [] ## Compute Resources required by this container. Cannot be updated. ## More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container resources: {} # resources: # limits: # memory: "4Gi" # cpu: "4" # requests: # memory: "2Gi" # cpu: "500m" ## Periodic probe of container liveness. Container will be restarted if the probe fails. Cannot be updated. 
## More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes livenessProbe: tcpSocket: port: hazelcast-port initialDelaySeconds: 30 periodSeconds: 30 timeoutSeconds: 5 failureThreshold: 3 successThreshold: 1 ## Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. ## More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes readinessProbe: enabled: true initialDelaySeconds: "30" periodSeconds: "30" timeoutSeconds: "5" failureThreshold: "3" successThreshold: "1" ingress: enabled: false className: "" host: seatunnel.k8s.local path: / annotations: {} tls: enabled: false secretName: "seatunnel-tls" ================================================ FILE: docs/en/architecture/api-design/catalog-table.md ================================================ --- sidebar_position: 4 title: CatalogTable and Metadata Management --- # CatalogTable and Metadata Management ## 1. Overview ### 1.1 Problem Background Data integration requires explicit schema management: - **Schema Definition**: How to define and validate table schemas? - **Schema Propagation**: How to pass schema through Source → Transform → Sink? - **Schema Evolution**: How to handle runtime DDL changes (ADD/DROP columns)? - **Type Mapping**: How to map types between different data sources? - **Metadata Completeness**: How to capture complete table metadata (constraints, partitions)? ### 1.2 Design Goals SeaTunnel's metadata management aims to: 1. **Type Safety**: Explicit schema validation at job submission 2. **Completeness**: Capture all table metadata (columns, constraints, partitions, options) 3. **Evolution Support**: Handle runtime schema changes (DDL synchronization) 4. **Engine Independence**: Schema representation independent of execution engine 5. **Ease of Use**: Simple API for schema creation and transformation ## 2. 
Core Concepts ### 2.1 CatalogTable Complete representation of a table with all metadata. ```java public class CatalogTable implements Serializable { // Table identifier private final TableIdentifier tableId; // Schema definition private final TableSchema tableSchema; // Table options (connector-specific configuration) private final Map<String, String> options; // Partition keys private final List<String> partitionKeys; // Comment private final String comment; // Catalog name private final String catalogName; } ``` **Key Components**: - `TableIdentifier`: Unique table identity (`catalog.database[.schema].table`) - `TableSchema`: Schema with columns, primary key, constraints - `options`: Connector-specific settings (e.g., Kafka topic, JDBC table name) - `partitionKeys`: Partition columns for partitioned tables ### 2.2 TableSchema Schema definition with columns and constraints. ```java public class TableSchema implements Serializable { // Column definitions private final List<Column> columns; // Primary key private final PrimaryKey primaryKey; // Unique/foreign key constraints private final List<ConstraintKey> constraintKeys; } ``` ### 2.3 Column Column definition with type and constraints. ```java public class Column implements Serializable { private final String name; private final SeaTunnelDataType dataType; private final String comment; // Column options private final Map<String, Object> options; // Constraints private final boolean nullable; private final Object defaultValue; } ``` ### 2.4 SeaTunnelDataType Unified type system across connectors.
**Basic Types**: ```java // Numeric DataTypes.TINYINT() DataTypes.SMALLINT() DataTypes.INT() DataTypes.BIGINT() DataTypes.FLOAT() DataTypes.DOUBLE() DataTypes.DECIMAL(precision, scale) // String DataTypes.STRING() DataTypes.CHAR(length) DataTypes.VARCHAR(length) // Binary DataTypes.BYTES() // Date/Time DataTypes.DATE() DataTypes.TIME() DataTypes.TIMESTAMP() // Boolean DataTypes.BOOLEAN() ``` **Complex Types**: ```java // Array DataTypes.ARRAY(elementType) // Map DataTypes.MAP(keyType, valueType) // Row (Struct) DataTypes.ROW(fields) ``` ## 3. Schema Creation ### 3.1 Builder Pattern ```java CatalogTable catalogTable = CatalogTable.of( TableIdentifier.of("my_catalog", "my_db", "my_table"), TableSchema.builder() .column("id", DataTypes.BIGINT()) .column("name", DataTypes.STRING()) .column("age", DataTypes.INT()) .column("created_at", DataTypes.TIMESTAMP()) .primaryKey("id") .build(), Map.of("connector", "jdbc"), Collections.emptyList(), // No partitions "User table" ); ``` ### 3.2 Column Builder ```java Column column = Column.builder() .name("user_id") .dataType(DataTypes.BIGINT()) .nullable(false) .defaultValue(0L) .comment("User identifier") .build(); ``` ### 3.3 Primary Key and Constraints ```java TableSchema schema = TableSchema.builder() .column("id", DataTypes.BIGINT()) .column("email", DataTypes.STRING()) .column("username", DataTypes.STRING()) // Primary key .primaryKey("id") // Unique constraint .constraint(ConstraintKey.of( ConstraintKey.ConstraintType.UNIQUE_KEY, "uk_email", Arrays.asList( ConstraintKey.ConstraintKeyColumn.of("email", null) ) )) .build(); ``` ## 4. 
Schema Propagation ### 4.1 Source → Transform → Sink Flow ``` ┌──────────────┐ │ Source │ │ │ │ produces │ │ CatalogTable │ └──────┬───────┘ │ ▼ (Input Schema) ┌──────────────┐ │ Transform │ │ │ │ modifies │ │ CatalogTable │ └──────┬───────┘ │ ▼ (Output Schema) ┌──────────────┐ │ Sink │ │ │ │ validates │ │ CatalogTable │ └──────────────┘ ``` ### 4.2 Source Schema Production ```java public class JdbcSource implements SeaTunnelSource<...> { @Override public List getProducedCatalogTables() { // Read schema from database metadata DatabaseMetaData metaData = connection.getMetaData(); ResultSet columns = metaData.getColumns(null, schema, table, null); String database = "..."; // Build schema TableSchema.Builder builder = TableSchema.builder(); while (columns.next()) { String columnName = columns.getString("COLUMN_NAME"); int jdbcType = columns.getInt("DATA_TYPE"); SeaTunnelDataType type = JdbcTypeConverter.convert(jdbcType); builder.column(columnName, type); } return Collections.singletonList( CatalogTable.of( TableIdentifier.of(catalog, database, schema, table), builder.build() ) ); } } ``` ### 4.3 Transform Schema Transformation ```java public class SqlTransform implements SeaTunnelTransform { @Override public CatalogTable getProducedCatalogTable() { CatalogTable inputTable = getInputCatalogTable(); // Parse SQL to infer output schema // Example: SELECT id, UPPER(name) as name_upper, age FROM input TableSchema outputSchema = TableSchema.builder() .column("id", inputTable.getColumn("id").getDataType()) .column("name_upper", DataTypes.STRING()) // Transformed .column("age", inputTable.getColumn("age").getDataType()) .build(); return inputTable.copy(outputSchema); } } ``` ### 4.4 Sink Schema Validation ```java public class JdbcSink implements SeaTunnelSink<...> { @Override public Optional getWriteCatalogTable() { // Validate input schema matches target table CatalogTable inputTable = getInputCatalogTable(); CatalogTable targetTable = readTargetTableSchema(); // Check 
column compatibility for (Column inputColumn : inputTable.getColumns()) { Column targetColumn = targetTable.getColumn(inputColumn.getName()); if (targetColumn == null) { throw new SchemaException("Column not found: " + inputColumn.getName()); } if (!isCompatible(inputColumn.getDataType(), targetColumn.getDataType())) { throw new SchemaException("Incompatible types for " + inputColumn.getName()); } } return Optional.of(targetTable); } } ``` ## 5. Schema Evolution ### 5.1 SchemaChangeEvent Represents DDL changes captured by CDC sources. ```java public abstract class SchemaChangeEvent implements Serializable { private final TableIdentifier tableId; } public class AlterTableAddColumnEvent extends SchemaChangeEvent { private final Column column; } public class AlterTableDropColumnEvent extends SchemaChangeEvent { private final String columnName; } public class AlterTableModifyColumnEvent extends SchemaChangeEvent { private final Column column; } ``` ### 5.2 CDC Source Schema Evolution ```java public class MysqlCDCSource { private void handleDDL(String ddl) { // Parse DDL statement if (ddl.contains("ADD COLUMN")) { Column newColumn = parseDDL(ddl); // Create schema change event SchemaChangeEvent event = new AlterTableAddColumnEvent( tableId, newColumn ); // Emit event downstream collector.collect(event); } } } ``` ### 5.3 Transform Schema Evolution Mapping ```java public class SqlTransform { @Override public SchemaChangeEvent mapSchemaChangeEvent(SchemaChangeEvent event) { if (event instanceof AlterTableAddColumnEvent) { AlterTableAddColumnEvent addEvent = (AlterTableAddColumnEvent) event; // Map column through transform logic Column transformedColumn = transformColumn(addEvent.getColumn()); return new AlterTableAddColumnEvent( event.getTableId(), transformedColumn ); } return event; // Pass through } } ``` ### 5.4 Sink Schema Evolution Application ```java public class JdbcSink { private void applySchemaChange(SchemaChangeEvent event) { if (event instanceof 
AlterTableAddColumnEvent) { AlterTableAddColumnEvent addEvent = (AlterTableAddColumnEvent) event; Column column = addEvent.getColumn(); // Generate DDL String ddl = String.format( "ALTER TABLE %s ADD COLUMN %s %s", event.getTableId().getTableName(), column.getName(), toSqlType(column.getDataType()) ); // Execute DDL statement.execute(ddl); LOG.info("Applied schema change: {}", ddl); } } } ``` ## 6. Type Mapping ### 6.1 JDBC Type Mapping ```java public class JdbcTypeConverter { public static SeaTunnelDataType convert(int jdbcType) { switch (jdbcType) { case Types.TINYINT: return DataTypes.TINYINT(); case Types.SMALLINT: return DataTypes.SMALLINT(); case Types.INTEGER: return DataTypes.INT(); case Types.BIGINT: return DataTypes.BIGINT(); case Types.FLOAT: case Types.REAL: return DataTypes.FLOAT(); case Types.DOUBLE: return DataTypes.DOUBLE(); case Types.DECIMAL: case Types.NUMERIC: return DataTypes.DECIMAL(precision, scale); case Types.CHAR: return DataTypes.CHAR(length); case Types.VARCHAR: return DataTypes.VARCHAR(length); case Types.LONGVARCHAR: return DataTypes.STRING(); case Types.DATE: return DataTypes.DATE(); case Types.TIME: return DataTypes.TIME(); case Types.TIMESTAMP: return DataTypes.TIMESTAMP(); case Types.BOOLEAN: return DataTypes.BOOLEAN(); case Types.BINARY: case Types.VARBINARY: case Types.LONGVARBINARY: return DataTypes.BYTES(); default: throw new UnsupportedTypeException("Unsupported JDBC type: " + jdbcType); } } } ``` ### 6.2 Kafka (Avro) Type Mapping ```java public class AvroTypeConverter { public static SeaTunnelDataType convert(Schema avroSchema) { switch (avroSchema.getType()) { case INT: return DataTypes.INT(); case LONG: return DataTypes.BIGINT(); case FLOAT: return DataTypes.FLOAT(); case DOUBLE: return DataTypes.DOUBLE(); case BOOLEAN: return DataTypes.BOOLEAN(); case STRING: return DataTypes.STRING(); case BYTES: return DataTypes.BYTES(); case ARRAY: return DataTypes.ARRAY(convert(avroSchema.getElementType())); case MAP: return 
DataTypes.MAP( DataTypes.STRING(), convert(avroSchema.getValueType()) ); case RECORD: // Convert to ROW type List fields = new ArrayList<>(); for (Schema.Field field : avroSchema.getFields()) { fields.add(new Column( field.name(), convert(field.schema()) )); } return DataTypes.ROW(fields); default: throw new UnsupportedTypeException("Unsupported Avro type: " + avroSchema.getType()); } } } ``` ## 7. Partitioned Tables ### 7.1 Partition Definition ```java CatalogTable catalogTable = CatalogTable.of( tableId, schema, options, Arrays.asList("year", "month", "day"), // Partition keys comment ); ``` ### 7.2 Partition-Aware Source ```java public class HiveSource { @Override public CatalogTable getProducedCatalogTable() { // Read Hive table metadata Table hiveTable = hiveMetastore.getTable(dbName, tableName); // Extract partition keys List partitionKeys = hiveTable.getPartitionKeys().stream() .map(FieldSchema::getName) .collect(Collectors.toList()); return CatalogTable.of( tableId, schema, options, partitionKeys, comment ); } } ``` ### 7.3 Partition-Aware Sink ```java public class IcebergSink { private void write(SeaTunnelRow row, CatalogTable table) { // Extract partition values from row Map partitionValues = new HashMap<>(); for (String partitionKey : table.getPartitionKeys()) { int index = table.getSchema().indexOf(partitionKey); partitionValues.put(partitionKey, row.getField(index)); } // Write to correct partition PartitionSpec spec = PartitionSpec.builderFor(schema) .identity("year") .identity("month") .identity("day") .build(); DataFile dataFile = writeToPartition(partitionValues, row); icebergTable.newAppend().appendFile(dataFile).commit(); } } ``` ## 8. 
Best Practices ### 8.1 Schema Definition **Prefer Explicit Schema**: ```java // ✅ GOOD: Explicit schema TableSchema schema = TableSchema.builder() .column("id", DataTypes.BIGINT()) .column("name", DataTypes.STRING()) .build(); // ❌ BAD: Implicit schema (relies on inference) // Schema inferred from first row - risky! ``` **Use Appropriate Types**: ```java // ✅ GOOD: Use specific types .column("price", DataTypes.DECIMAL(10, 2)) .column("created_at", DataTypes.TIMESTAMP()) // ❌ BAD: Overly generic types .column("price", DataTypes.STRING()) // Should be DECIMAL .column("created_at", DataTypes.STRING()) // Should be TIMESTAMP ``` ### 8.2 Schema Validation **Validate Early**: ```java // In Source @Override public void open() { CatalogTable catalogTable = getProducedCatalogTables().get(0); validateSchema(catalogTable); // Fail fast } // In Sink @Override public void open() { CatalogTable inputTable = getInputCatalogTable(); CatalogTable targetTable = getWriteCatalogTable().orElseThrow(IllegalStateException::new); validateCompatibility(inputTable, targetTable); // Fail fast } ``` ### 8.3 Type Compatibility **Type Widening (Safe)**: ```java // INT → BIGINT (safe) // FLOAT → DOUBLE (safe) // VARCHAR(10) → VARCHAR(20) (safe) ``` **Type Narrowing (Unsafe)**: ```java // BIGINT → INT (may overflow) // DOUBLE → FLOAT (precision loss) // VARCHAR(20) → VARCHAR(10) (truncation) ``` ## 9. Configuration ### 9.1 Schema Override ```hocon source { JDBC { url = "..." query = "SELECT * FROM users" # Override inferred schema schema { fields { id = "BIGINT" name = "STRING" age = "INT" } } } } ``` ### 9.2 Schema Evolution Control ```hocon sink { JDBC { url = "..." # Schema evolution options schema-evolution { enabled = true auto-create-table = true auto-add-column = true auto-drop-column = false # Dangerous! } } } ``` ## 10. 
Related Resources - [Source Architecture](source-architecture.md) - [Sink Architecture](sink-architecture.md) - [Schema Evolution](../../introduction/concepts/schema-evolution.md) - [Schema Feature](../../introduction/concepts/schema-feature.md) ## 11. References ### Key Source Files - [CatalogTable.java](../../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/CatalogTable.java) - [TableSchema.java](../../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/TableSchema.java) - [Column.java](../../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/Column.java) - [SeaTunnelDataType.java](../../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelDataType.java) - [SchemaChangeEvent.java](../../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/SchemaChangeEvent.java) ================================================ FILE: docs/en/architecture/api-design/sink-architecture.md ================================================ --- sidebar_position: 3 title: Sink Architecture --- # Sink Architecture ## 1. Overview ### 1.1 Problem Background Writing data to external systems in distributed environments presents critical challenges: - **Exactly-Once Guarantee**: How to ensure each record is written exactly once, not zero or multiple times? - **Transactional Consistency**: How to commit writes atomically across multiple parallel writers? - **Fault Tolerance**: How to recover from failures without data loss or duplication? - **Backpressure**: How to handle slow sinks without overwhelming the system? - **Idempotency**: How to make retries safe? ### 1.2 Design Goals SeaTunnel's Sink API aims to: 1. **Provide Verifiable Consistency Semantics**: With checkpoint boundaries plus two-phase commit (2PC), achieve exactly-once when the external sink supports transactional/idempotent commit 2. **Support Parallel Writes**: Scale throughput with multiple writer instances 3.
**Enable Global Coordination**: Coordinate commits across distributed writers 4. **Ensure Fault Tolerance**: Recover from failures without data inconsistency 5. **Provide Flexibility**: Support various commit strategies (per-writer, aggregated, none) ### 1.3 Applicable Scenarios - Transactional databases (JDBC with XA transactions) - Message queues (Kafka with transactions) - File systems (atomic file rename) - Data lakes (Iceberg, Hudi, Delta Lake with table transactions) - Search engines (Elasticsearch with versioning) ## 2. Architecture Design ### 2.1 Overall Architecture ``` ┌────────────────────────────────────────────────────────────────┐ │ TaskExecutionService (Worker Side) │ │ │ │ ┌──────────────────────────────────────────────────────┐ │ │ │ SinkWriter │ │ │ │ │ │ │ │ • Receive records from upstream │ │ │ │ • Buffer and write data │ │ │ │ • Produce commitInfo at checkpoint boundary │ │ │ │ • Snapshot writer state │ │ │ │ • Cleanup/rollback on failure (engine-dependent) │ │ │ └──────────────────────────────────────────────────────┘ │ │ │ │ └────────────────────────────┼─────────────────────────────────────┘ │ (CommitInfo) ▼ ┌────────────────────────────────────────────────────────────────┐ │ Coordinator Side (control plane, engine-dependent) │ │ │ │ ┌──────────────────────────────────────────────────────┐ │ │ │ SinkCommitter (Optional) │ │ │ │ │ │ │ │ • Receive commit infos from multiple writers │ │ │ │ • Commit each writer's changes independently │ │ │ │ • Retry failed commits │ │ │ │ • Must be idempotent │ │ │ └──────────────────────────────────────────────────────┘ │ │ │ │ │ │ (Optional: AggregatedCommitInfo) │ │ ▼ │ │ ┌──────────────────────────────────────────────────────┐ │ │ │ SinkAggregatedCommitter │ │ │ │ (Optional) │ │ │ │ │ │ │ │ • Aggregate commit infos from all writers │ │ │ │ • Perform single global commit operation │ │ │ │ • Single-threaded, global coordinator │ │ │ └──────────────────────────────────────────────────────┘ │ │ │ 
└──────────────────────────────────────────────────────────────────┘ │ ▼ External Data Sink (Database / File / Message Queue) ``` ### 2.2 Core Components #### SeaTunnelSink (Factory Interface) The top-level interface that serves as a factory for creating writers and committers. ```java public interface SeaTunnelSink<IN, StateT, CommitInfoT, AggregatedCommitInfoT> extends Serializable { /** * Create SinkWriter (called on worker) */ SinkWriter<IN, CommitInfoT, StateT> createWriter(SinkWriter.Context context) throws IOException; /** * Restore SinkWriter from checkpoint (called on worker) */ default SinkWriter<IN, CommitInfoT, StateT> restoreWriter( SinkWriter.Context context, List<StateT> states) throws IOException { return createWriter(context); } /** * Serializer for writer state (optional). */ default Optional<Serializer<StateT>> getWriterStateSerializer() { return Optional.empty(); } /** * Create SinkCommitter (optional, trigger location depends on execution engine) */ default Optional<SinkCommitter<CommitInfoT>> createCommitter() throws IOException { return Optional.empty(); } /** * Serializer for commit info (optional). */ default Optional<Serializer<CommitInfoT>> getCommitInfoSerializer() { return Optional.empty(); } /** * Create SinkAggregatedCommitter (optional). */ default Optional<SinkAggregatedCommitter<CommitInfoT, AggregatedCommitInfoT>> createAggregatedCommitter() throws IOException { return Optional.empty(); } /** * Serializer for aggregated commit info (optional). */ default Optional<Serializer<AggregatedCommitInfoT>> getAggregatedCommitInfoSerializer() { return Optional.empty(); } /** * Get the catalog table this sink writes to (optional).
*/ default Optional getWriteCatalogTable() { return Optional.empty(); } } ``` **Key Design Points**: - Three-tier commit architecture: Writer → Committer → AggregatedCommitter - Committer and AggregatedCommitter are optional (depends on sink requirements) - Writer is always required (performs actual data writing) ### 2.3 Interaction Flow #### Normal Write Flow (with Two-Phase Commit) ```mermaid sequenceDiagram participant CP as CheckpointCoordinator participant Writer1 as SinkWriter 1 participant Writer2 as SinkWriter 2 participant Committer as SinkCommitter participant Sink as External Sink Writer1->>Writer1: write(record) Writer2->>Writer2: write(record) CP->>Writer1: triggerBarrier(checkpointId) CP->>Writer2: triggerBarrier(checkpointId) Writer1->>Writer1: prepareCommit(checkpointId) Writer1->>CP: ack(commitInfo1) Writer2->>Writer2: prepareCommit(checkpointId) Writer2->>CP: ack(commitInfo2) CP->>CP: All writers acked CP->>CP: Persist checkpoint CP->>Committer: commit([commitInfo1, commitInfo2]) Committer->>Sink: Commit writer1 changes Committer->>Sink: Commit writer2 changes Committer->>CP: ack() Note over Writer1,Writer2: Framework may notify checkpoint completion for cleanup (engine-dependent) ``` #### Failure and Retry Flow ```mermaid sequenceDiagram participant CP as CheckpointCoordinator participant Writer as SinkWriter participant Committer as SinkCommitter participant Sink as External Sink Writer->>Writer: prepareCommit(checkpointId) Writer->>CP: ack(commitInfo) CP->>Writer: [Failure - writer crashes] CP->>CP: Checkpoint fails CP->>CP: Restore from previous checkpoint CP->>Writer: restoreWriter(previousState) Writer->>Writer: Replay records from checkpoint Writer->>Writer: prepareCommit(checkpointId) Writer->>CP: ack(commitInfo) CP->>Committer: commit([commitInfo]) Committer->>Sink: Commit (idempotent) Committer-->>Sink: [Commit fails due to network] Committer->>Committer: Retry Committer->>Sink: Commit (idempotent) Sink-->>Committer: Success Note over 
Writer,Committer: Framework may notify checkpoint completion for cleanup (engine-dependent) ``` ## 3. Key Implementations ### 3.1 SinkWriter Interface The writer runs on workers and performs actual data writing. ```java public interface SinkWriter { /** * Write single record */ void write(IN element) throws IOException; /** * Prepare commit info during checkpoint. * * Guideline: do not make data externally visible in this phase. */ Optional prepareCommit(long checkpointId) throws IOException; /** * Abort prepared commit if checkpoint fails */ void abortPrepare(); /** * Snapshot writer state for checkpoint */ List snapshotState(long checkpointId) throws IOException; /** * Close writer */ void close() throws IOException; /** * Context for interacting with framework */ interface Context { int getIndexOfSubtask(); MetricsContext getMetricsContext(); } } ``` **Critical Requirements**: - `prepareCommit(checkpointId)` should not make data externally visible (commit is done in `SinkCommitter` / `SinkAggregatedCommitter`) - `prepareCommit(checkpointId)` returns commit info that will be passed to committer - State returned by `snapshotState()` must capture all uncommitted writes - `abortPrepare()` is only used by Spark when `prepareCommit(...)` fails by throwing an exception **Implementation Example (JDBC with XA Transactions)**: ```java public class JdbcExactlyOnceSinkWriter implements SinkWriter { private final XAConnection xaConnection; private final XAResource xaResource; private final Connection connection; private final PreparedStatement statement; private final List pendingXids = new ArrayList<>(); @Override public void write(SeaTunnelRow element) throws IOException { try { // Start XA transaction if needed if (currentXid == null) { currentXid = generateXid(); xaResource.start(currentXid, XAResource.TMNOFLAGS); } // Execute INSERT (buffered in transaction) setParameters(statement, element); statement.executeUpdate(); } catch (SQLException e) { throw new 
IOException("Failed to write record", e); } } @Override public Optional prepareCommit(long checkpointId) throws IOException { if (currentXid == null) { return Optional.empty(); // No data written } try { // End XA transaction xaResource.end(currentXid, XAResource.TMSUCCESS); // Prepare XA transaction (FIRST PHASE - no side effects yet) xaResource.prepare(currentXid); // Return XID for committer XidInfo xidInfo = new XidInfo(currentXid); pendingXids.add(currentXid); currentXid = null; return Optional.of(xidInfo); } catch (XAException e) { throw new IOException("Failed to prepare XA transaction", e); } } @Override public void abortPrepare() { // Rollback prepared transaction if (currentXid != null) { try { xaResource.rollback(currentXid); } catch (XAException e) { LOG.error("Failed to rollback XA transaction", e); } } } @Override public List snapshotState(long checkpointId) { // For XA, state is managed by database return Collections.emptyList(); } } ``` **Implementation Example (File Sink with Atomic Rename)**: ```java public class FileSinkWriter implements SinkWriter { private final String tempFilePath; private final String finalFilePath; private final OutputStream outputStream; private long bytesWritten = 0; @Override public void write(SeaTunnelRow element) throws IOException { // Write to temporary file byte[] bytes = serialize(element); outputStream.write(bytes); bytesWritten += bytes.length; } @Override public Optional prepareCommit(long checkpointId) throws IOException { // Flush and close temp file (no rename yet!) 
outputStream.flush(); outputStream.close(); // Return commit info for committer to rename file return Optional.of(new FileCommitInfo(tempFilePath, finalFilePath)); } @Override public void abortPrepare() { // Delete temporary file new File(tempFilePath).delete(); } @Override public List snapshotState(long checkpointId) { // Save current write position return Collections.singletonList(new FileWriterState(bytesWritten)); } } ``` ### 3.2 SinkCommitter Interface The committer runs on the coordinator side (the exact location depends on the execution engine) and commits the changes reported by multiple writers. ```java public interface SinkCommitter extends Closeable { /** * Commit multiple commit infos (from multiple writers or retries) * MUST be idempotent - may be called multiple times with same commitInfo */ List commit(List commitInfos) throws IOException; /** * Abort commit infos (optional) */ default void abort(List commitInfos) throws IOException {} /** * Close committer */ void close() throws IOException; } ``` **Critical Requirements**: - `commit()` **MUST** be idempotent (calling twice with same commitInfo should be safe) - Returns list of **failed** commitInfos (will be retried) - Should handle partial failures gracefully **Implementation Example (JDBC XA Committer)**: ```java public class JdbcSinkCommitter implements SinkCommitter { private final XADataSource xaDataSource; @Override public List commit(List commitInfos) throws IOException { List failed = new ArrayList<>(); for (XidInfo xidInfo : commitInfos) { try { XAConnection xaConn = xaDataSource.getXAConnection(); XAResource xaResource = xaConn.getXAResource(); // SECOND PHASE: Commit prepared transaction xaResource.commit(xidInfo.getXid(), false); xaConn.close(); } catch (XAException e) { if (e.errorCode == XAException.XAER_NOTA) { // Transaction already committed (idempotent) LOG.info("XA transaction already committed: {}", xidInfo.getXid()); } else { // Commit failed, will retry LOG.error("Failed to commit XA transaction: {}", xidInfo.getXid(), e); failed.add(xidInfo); } } }
return failed; // Framework will retry failed commits } @Override public void abort(List commitInfos) { // Rollback prepared transactions for (XidInfo xidInfo : commitInfos) { try { XAConnection xaConn = xaDataSource.getXAConnection(); xaConn.getXAResource().rollback(xidInfo.getXid()); xaConn.close(); } catch (Exception e) { LOG.error("Failed to rollback XA transaction", e); } } } } ``` **Implementation Example (File Committer with Atomic Rename)**: ```java public class FileSinkCommitter implements SinkCommitter { private final FileSystem fileSystem; @Override public List commit(List commitInfos) { List failed = new ArrayList<>(); for (FileCommitInfo commitInfo : commitInfos) { try { Path tempPath = new Path(commitInfo.getTempFilePath()); Path finalPath = new Path(commitInfo.getFinalFilePath()); // Atomic rename (commit) if (fileSystem.exists(finalPath)) { // File already committed (idempotent) LOG.info("File already exists, skipping: {}", finalPath); fileSystem.delete(tempPath, false); // Clean up temp file } else { boolean success = fileSystem.rename(tempPath, finalPath); if (!success) { failed.add(commitInfo); } } } catch (IOException e) { LOG.error("Failed to commit file: {}", commitInfo, e); failed.add(commitInfo); } } return failed; } } ``` ### 3.3 SinkAggregatedCommitter Interface The aggregated committer performs single global commit for all writers. 
```java public interface SinkAggregatedCommitter extends Closeable { /** * Combine commit infos from multiple writers into single aggregated info */ AggregatedCommitInfoT combine(List commitInfos); /** * Commit aggregated info (single global operation) * MUST be idempotent */ List commit(List aggregatedCommitInfos) throws IOException; /** * Abort aggregated commit infos */ default void abort(List aggregatedCommitInfos) throws IOException {} /** * Restore committer state from checkpoint */ default void restoreCommit(List aggregatedCommitInfos) throws IOException {} /** * Close committer */ void close() throws IOException; } ``` **Use Cases**: - Hive table commit (single COMMIT TRANSACTION for all partitions) - Iceberg table commit (single table snapshot) - Global index updates (update index once for all writes) **Implementation Example (Hive Sink)**: ```java public class HiveAggregatedCommitter implements SinkAggregatedCommitter { @Override public HiveCommitInfo combine(List commitInfos) { // Collect all written files across all writers List allFiles = new ArrayList<>(); for (HiveWriteInfo writeInfo : commitInfos) { allFiles.addAll(writeInfo.getWrittenFiles()); } return new HiveCommitInfo(allFiles); } @Override public List commit(List aggregatedCommitInfos) { List failed = new ArrayList<>(); for (HiveCommitInfo commitInfo : aggregatedCommitInfos) { try { // Single global commit for entire table hiveMetastore.beginTransaction(); for (String file : commitInfo.getAllFiles()) { hiveMetastore.addPartitionFile(tableName, file); } hiveMetastore.commitTransaction(); // Global atomic commit } catch (Exception e) { LOG.error("Failed to commit to Hive", e); hiveMetastore.rollbackTransaction(); failed.add(commitInfo); } } return failed; } } ``` ### 3.4 Code References **API Interfaces**: - [SeaTunnelSink.java](../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SeaTunnelSink.java) - 
[SinkWriter.java](../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SinkWriter.java) - [SinkCommitter.java](../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SinkCommitter.java) - [SinkAggregatedCommitter.java](../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SinkAggregatedCommitter.java) **Example Implementations**: - JDBC Sink: `seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/sink/` - Kafka Sink: `seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/sink/` - File Sink: `seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/` ## 4. Design Considerations ### 4.1 Design Trade-offs #### Two-Phase Commit **Pros**: - Strong consistency guarantee (exactly-once) - Automatic failure recovery - Clear separation between prepare and commit **Cons**: - Increased latency (data visible only after commit) - Requires transactional support in sink - Additional state for commit info - More complex implementation **When to Use**: - Financial transactions, billing, audit logs - Any scenario requiring exactly-once guarantee **When Not to Use**: - At-least-once is acceptable (logging, metrics) - Sink doesn't support transactions - Ultra-low latency required #### Three-Tier vs Two-Tier Commit **Two-Tier (Writer → Committer)**: - Each writer's commit handled independently - Parallel commit operations - Suitable for most sinks **Three-Tier (Writer → Committer → AggregatedCommitter)**: - All writers' commits aggregated into single operation - Single global commit point - Required for table-level transactions (Hive, Iceberg) ### 4.2 Performance Considerations #### Batch Writing ```java public class BatchSinkWriter { private final List batch = new ArrayList<>(); private static final int BATCH_SIZE = 1000; @Override public void write(SeaTunnelRow element) { 
batch.add(element); if (batch.size() >= BATCH_SIZE) { flushBatch(); } } private void flushBatch() { // Write entire batch in single operation statement.executeBatch(); batch.clear(); } } ``` **Benefits**: - Amortize per-record overhead - Reduce network round-trips - Better throughput #### Async Writes ```java public class AsyncSinkWriter { private final BlockingQueue<CompletableFuture<Void>> pendingWrites = new LinkedBlockingQueue<>(); @Override public void write(SeaTunnelRow element) { CompletableFuture<Void> future = CompletableFuture.runAsync(() -> { // Async write operation actualWrite(element); }, executorService); pendingWrites.add(future); } @Override public Optional prepareCommit(long checkpointId) { // Wait for all pending writes to complete for (CompletableFuture<Void> future : pendingWrites) { future.join(); } pendingWrites.clear(); return Optional.of(createCommitInfo()); } } ``` #### Connection Pooling ```java public class JdbcSinkWriter { private final HikariDataSource dataSource; @Override public void write(SeaTunnelRow element) { try (Connection conn = dataSource.getConnection()) { // Reuse pooled connections PreparedStatement stmt = conn.prepareStatement(sql); stmt.executeUpdate(); } } } ``` ### 4.3 Idempotency Patterns #### 1. Natural Idempotency (Upsert) ```java // INSERT ON DUPLICATE KEY UPDATE (MySQL) String sql = "INSERT INTO table (id, name) VALUES (?, ?) " + "ON DUPLICATE KEY UPDATE name = VALUES(name)"; // MERGE INTO (Oracle syntax shown; SQL Server's MERGE is similar but does not use "FROM dual") String sql = "MERGE INTO table USING (SELECT ? as id, ? as name FROM dual) src " + "ON (table.id = src.id) " + "WHEN MATCHED THEN UPDATE SET table.name = src.name " + "WHEN NOT MATCHED THEN INSERT (id, name) VALUES (src.id, src.name)"; ``` #### 2.
Deduplication Key ```java public class KafkaSinkWriter { @Override public void write(SeaTunnelRow element) { ProducerRecord record = new ProducerRecord<>( topic, element.getField(0).toString(), // Key for deduplication element.toString() ); // With enable.idempotence=true the broker drops duplicate producer retries using (producer id, epoch, per-partition sequence number); the record key lets log compaction or downstream consumers de-duplicate replayed records producer.send(record); } } ``` #### 3. External Deduplication Table ```java public class JdbcCommitter { @Override public List commit(List commitInfos) { for (XidInfo xidInfo : commitInfos) { String xidString = xidInfo.getXid().toString(); // Check if already committed boolean exists = checkCommitTable(xidString); if (exists) { LOG.info("XID already committed: {}", xidString); continue; // Idempotent } // Commit transaction xaResource.commit(xidInfo.getXid(), false); // Record commit insertCommitTable(xidString, System.currentTimeMillis()); } } } ``` ## 5. Best Practices ### 5.1 Usage Recommendations **1. Choose Appropriate Commit Level** ```java // Simple sink: Writer only (at-least-once) public class SimpleSink implements SeaTunnelSink<...> { SinkWriter createWriter(...) { return new SimpleWriter(); } // No committer - data written directly } // Transactional sink: Writer + Committer (exactly-once) public class TransactionalSink implements SeaTunnelSink<...> { SinkWriter createWriter(...) { return new TransactionalWriter(); } Optional createCommitter() { return Optional.of(new Committer()); } } // Table sink: Writer + Committer + AggregatedCommitter public class TableSink implements SeaTunnelSink<...> { SinkWriter createWriter(...) { return new TableWriter(); } Optional createCommitter() { return Optional.of(new Committer()); } Optional createAggregatedCommitter() { return Optional.of(new AggregatedCommitter()); } } ``` **2.
Proper State Management** ```java public class StatefulSinkWriter { private long recordsWritten = 0; private long bytesWritten = 0; @Override public List snapshotState(long checkpointId) { return Collections.singletonList( new WriterState(recordsWritten, bytesWritten) ); } public StatefulSinkWriter restoreState(List states) { if (!states.isEmpty()) { WriterState state = states.get(0); this.recordsWritten = state.getRecordsWritten(); this.bytesWritten = state.getBytesWritten(); } return this; } } ``` **3. Resource Management** ```java @Override public void close() throws IOException { // Close in reverse order of creation if (statement != null) statement.close(); if (connection != null) connection.close(); if (dataSource != null) dataSource.close(); } ``` ### 5.2 Common Pitfalls **1. Side Effects in prepareCommit(checkpointId)** ```java // ❌ BAD: Actual commit in prepareCommit(checkpointId) public Optional prepareCommit(long checkpointId) { connection.commit(); // WRONG! This is a side effect! return Optional.of(new CommitInfo()); } // ✅ GOOD: Only prepare, no side effects public Optional prepareCommit(long checkpointId) { xaResource.end(xid, XAResource.TMSUCCESS); xaResource.prepare(xid); // Prepare only, no commit yet return Optional.of(new XidInfo(xid)); } ``` **2. Non-Idempotent Commit** ```java // ❌ BAD: Direct INSERT (not idempotent) public List commit(List commitInfos) { for (CommitInfo info : commitInfos) { executeInsert(info); // May fail if called twice! } } // ✅ GOOD: UPSERT (idempotent) public List commit(List commitInfos) { for (CommitInfo info : commitInfos) { executeUpsert(info); // Safe to call multiple times } } ``` **3. Large State** ```java // ❌ BAD: Buffer all records in state public class BadWriter { private List bufferedRows = new ArrayList<>(); // May be huge! 
public List snapshotState() { return Collections.singletonList(new State(bufferedRows)); } } // ✅ GOOD: Flush before checkpoint, track metadata only public class GoodWriter { private long lastCommittedOffset = 0; public Optional prepareCommit(long checkpointId) { flushBufferedRows(); // Write to external system return Optional.of(new CommitInfo(lastCommittedOffset)); } } ``` ### 5.3 Debugging Tips **1. Enable XA Transaction Logging** ```java // Log XA operations for debugging LOG.info("Starting XA transaction: {}", xid); xaResource.start(xid, XAResource.TMNOFLAGS); LOG.info("Preparing XA transaction: {}", xid); xaResource.prepare(xid); LOG.info("Committing XA transaction: {}", xid); xaResource.commit(xid, false); ``` **2. Track Commit Progress** ```java public class MonitoredCommitter { private final Counter commitAttempts = metricGroup.counter("commit_attempts"); private final Counter commitSuccesses = metricGroup.counter("commit_successes"); private final Counter commitFailures = metricGroup.counter("commit_failures"); public List commit(List commitInfos) { commitAttempts.inc(commitInfos.size()); List failed = new ArrayList<>(); for (CommitInfo info : commitInfos) { try { doCommit(info); commitSuccesses.inc(); } catch (Exception e) { commitFailures.inc(); failed.add(info); } } return failed; } } ``` **3. Test Failure Scenarios** ```java @Test public void testCheckpointFailureRecovery() { // Write data writer.write(row1); writer.write(row2); // Prepare commit Optional commitInfo = writer.prepareCommit(checkpointId); // Simulate checkpoint failure writer.abortPrepare(); // Verify no data committed assertFalse(dataExistsInSink()); // Restore and retry writer.write(row1); writer.write(row2); commitInfo = writer.prepareCommit(checkpointId); // Commit should succeed committer.commit(Collections.singletonList(commitInfo.get())); assertTrue(dataExistsInSink()); } ``` ## 6. 
Related Resources - [Architecture Overview](../overview.md) - [Design Philosophy](../design-philosophy.md) - [Source Architecture](source-architecture.md) - [Checkpoint Mechanism](../fault-tolerance/checkpoint-mechanism.md) - [Exactly-Once Semantics](../fault-tolerance/exactly-once.md) ## 7. References ### Example Connectors - **Simple Sink**: ConsoleSink (logs to stdout) - **File Sink**: FileSink (atomic file rename) - **Database Sink**: JdbcSink (XA transactions) - **Streaming Sink**: KafkaSink (Kafka transactions) - **Table Sink**: IcebergSink (table commits) ### Further Reading - [Two-Phase Commit Protocol](https://en.wikipedia.org/wiki/Two-phase_commit_protocol) - [XA Transactions](https://www.oracle.com/java/technologies/xa-transactions.html) - [Kafka Transactions](https://kafka.apache.org/documentation/#semantics) - [Iceberg Table Format](https://iceberg.apache.org/spec/) ================================================ FILE: docs/en/architecture/api-design/source-architecture.md ================================================ --- sidebar_position: 2 title: Source Architecture --- # Source Architecture ## 1. Overview ### 1.1 Problem Background Data sources in distributed systems present several challenges: - **Parallelism**: How to read data in parallel from a single source? - **Fault Tolerance**: How to resume from where we left off after failures? - **Dynamic Assignment**: How to handle worker failures and redistribute work? - **Bounded vs Unbounded**: How to unify batch and streaming sources? - **Backpressure**: How to handle slow downstream processing? ### 1.2 Design Goals SeaTunnel's Source API aims to: 1. **Enable Parallel Reading**: Support split-based parallelism for scalability 2. **Ensure Fault Tolerance**: Checkpoint split state for exactly-once processing 3. **Separate Coordination from Execution**: Enumerator (master) and Reader (worker) separation 4. **Support Dynamic Assignment**: Reassign splits on failures or imbalance 5. 
**Unify Batch and Streaming**: Single API for both bounded and unbounded sources ### 1.3 Applicable Scenarios - File-based sources (local files, HDFS, S3, OSS) - Database sources (MySQL, PostgreSQL, Oracle, JDBC-compatible) - Message queue sources (Kafka, Pulsar, RabbitMQ) - CDC sources (MySQL CDC, PostgreSQL CDC, Oracle CDC) - Stream sources (Socket, HTTP, custom protocols) ## 2. Architecture Design ### 2.1 Overall Architecture ``` ┌──────────────────────────────────────────────────────────────┐ │ Coordinator (master/coordinator side) │ │ │ │ ┌────────────────────────────────────────────────────┐ │ │ │ SourceSplitEnumerator │ │ │ │ │ │ │ │ • Discover/generate splits in run() (impl-defined) │ │ │ │ • Assign splits to readers │ │ │ │ • Handle reader registration │ │ │ │ • Handle split requests │ │ │ │ • Reclaim splits from failed readers │ │ │ │ • Snapshot enumerator state │ │ │ │ • Send/receive custom events │ │ │ └────────────────────────────────────────────────────┘ │ │ │ │ └────────────────────────────┼───────────────────────────────────┘ │ (Split Assignment) ▼ ┌──────────────────────────────────────────────────────────────┐ │ TaskExecutionService (Worker Side) │ │ │ │ ┌────────────────────────────────────────────────────┐ │ │ │ SourceReader │ │ │ │ │ │ │ │ • Receive assigned splits │ │ │ │ • Read data from splits │ │ │ │ • Emit records downstream │ │ │ │ • Snapshot reader state (split progress) │ │ │ │ • Handle split completion │ │ │ │ • Send/receive custom events │ │ │ └────────────────────────────────────────────────────┘ │ │ │ │ └────────────────────────────┼───────────────────────────────────┘ │ ▼ SeaTunnelRow (to Transform/Sink) ``` ### 2.2 Core Components #### SeaTunnelSource (Factory Interface) The top-level interface that serves as a factory for creating readers and enumerators. 
```java public interface SeaTunnelSource extends Serializable { /** * Get source boundedness (BOUNDED for batch, UNBOUNDED for streaming) */ Boundedness getBoundedness(); /** * Create SourceReader (called on worker) */ SourceReader createReader(SourceReader.Context readerContext) throws Exception; /** * Split serializer used for network transfer and checkpointing. */ Serializer getSplitSerializer(); /** * Create SourceSplitEnumerator (called on master) */ SourceSplitEnumerator createEnumerator( SourceSplitEnumerator.Context enumeratorContext) throws Exception; /** * Restore SourceSplitEnumerator from checkpoint (called on master) */ SourceSplitEnumerator restoreEnumerator( SourceSplitEnumerator.Context enumeratorContext, StateT checkpointState) throws Exception; /** * Enumerator-state serializer used for checkpointing. */ Serializer getEnumeratorStateSerializer(); /** * Get output schema (CatalogTable list, supports multi-table) */ List getProducedCatalogTables(); } ``` **Key Methods**: - `getBoundedness()`: Indicates if source is bounded (batch) or unbounded (stream) - `createReader()`: Factory for reader instances (one per worker task) - `createEnumerator()`: Factory for enumerator (single instance on master) - `restoreEnumerator()`: Restore enumerator from checkpoint state - `getProducedCatalogTables()`: Defines output schema (supports multi-table) - `getSplitSerializer()` / `getEnumeratorStateSerializer()`: Split/enumerator-state serializers for network transfer and checkpointing #### SourceSplit (Minimal Serializable Unit) Represents a partitionable unit of data. 
```java public interface SourceSplit extends Serializable { /** * Unique identifier for this split */ String splitId(); } ``` **Implementation Examples**: ```java // File-based split public class FileSplit implements SourceSplit { private final String splitId; private final String filePath; private final long startOffset; private final long length; } // JDBC-based split (query range) public class JdbcSourceSplit implements SourceSplit { private final String splitId; private final String query; private final Object[] queryParams; } // Kafka-based split (partition) public class KafkaSourceSplit implements SourceSplit { private final String splitId; private final String topic; private final int partition; private final long startOffset; } ``` **Design Notes**: - Splits must be serializable for network transfer - Split state (e.g., current offset) stored separately in reader state - Splits can be reassigned to different readers ### 2.3 Interaction Flow #### Initial Startup Flow ```mermaid sequenceDiagram participant Coord as Coordinator participant Enum as SourceSplitEnumerator participant Worker as TaskExecutionService participant Reader as SourceReader Coord->>Enum: createEnumerator(context) Enum->>Enum: open() Worker->>Reader: createReader(context) Reader->>Reader: open() Coord->>Enum: registerReader(subtaskId) Enum->>Enum: run() (discover/generate splits, impl-defined) Reader->>Enum: context.sendSplitRequest() Enum->>Enum: handleSplitRequest(subtaskId) Enum->>Reader: assignSplit(splits) Reader->>Reader: addSplits(splits) Reader->>Reader: pollNext(collector) Reader->>Worker: collect(record) ``` #### Checkpoint Flow ```mermaid sequenceDiagram participant CP as CheckpointCoordinator participant Enum as SourceSplitEnumerator participant Reader as SourceReader CP->>Reader: triggerBarrier(checkpointId) Reader->>Reader: snapshotState(checkpointId) Reader->>CP: ack(readerState) CP->>Enum: snapshotState(checkpointId) Enum->>Enum: snapshot enumerator state Enum->>CP: 
ack(enumeratorState) CP->>CP: All acks received CP->>CP: Persist checkpoint ``` #### Failure Recovery Flow ```mermaid sequenceDiagram participant Coord as Coordinator participant Enum as SourceSplitEnumerator participant OldReader as Failed Reader participant NewReader as New Reader OldReader->>OldReader: [Failure] Coord->>Enum: addSplitsBack(splits, subtaskId) Enum->>Enum: Mark splits as pending Coord->>NewReader: Deploy on new worker NewReader->>NewReader: Restore from checkpoint (reader state) Coord->>Enum: registerReader(subtaskId) Enum->>NewReader: assignSplit(recovered splits) NewReader->>NewReader: Resume from checkpointed offset ``` ## 3. Key Implementations ### 3.1 SourceSplitEnumerator Interface The enumerator runs on the master side and coordinates split assignment. ```java public interface SourceSplitEnumerator extends AutoCloseable, CheckpointListener { /** * Called when enumerator starts */ void open(); /** * Executes split discovery and background coordination logic. * * Note: run() and snapshotState() may be invoked concurrently by different threads. */ void run() throws Exception; /** * Add a split back to the enumerator for reassignment (typically after reader failure). */ void addSplitsBack(List splits, int subtaskId); /** * Current number of unassigned splits. */ int currentUnassignedSplitSize(); /** * Called when a reader requests more splits. */ void handleSplitRequest(int subtaskId); /** * Called when a reader registers. 
*/ void registerReader(int subtaskId); /** * Snapshot enumerator state for checkpoint */ StateT snapshotState(long checkpointId) throws Exception; /** * Handle custom event from reader */ default void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) {} /** * Close enumerator */ void close() throws IOException; /** * Context for interacting with framework */ interface Context { int currentParallelism(); Set registeredReaders(); void assignSplit(int subtaskId, List splits); void signalNoMoreSplits(int subtaskId); void sendEventToSourceReader(int subtaskId, SourceEvent event); } } ``` **Key Responsibilities**: - **Split Discovery**: Generate splits from data source (files, partitions, shards) - **Assignment Strategy**: Decide which splits go to which readers - **Dynamic Handling**: Handle reader registration, split requests, failures - **State Management**: Snapshot remaining splits and assignment state **Implementation Example**: ```java public class JdbcSourceSplitEnumerator implements SourceSplitEnumerator { private final Queue pendingSplits = new LinkedList<>(); private final Set assignedSplits = new HashSet<>(); private final Context context; @Override public void run() throws Exception { // Discover splits by querying database metadata List splits = generateSplitsByPartition(); pendingSplits.addAll(splits); } @Override public void handleSplitRequest(int subtaskId) { // Assign next available split JdbcSourceSplit split = pendingSplits.poll(); if (split != null) { context.assignSplit(subtaskId, Collections.singletonList(split)); assignedSplits.add(split.splitId()); } else { context.signalNoMoreSplits(subtaskId); } } @Override public void addSplitsBack(List splits, int subtaskId) { // Reclaim splits from failed reader pendingSplits.addAll(splits); splits.forEach(split -> assignedSplits.remove(split.splitId())); } @Override public JdbcSourceState snapshotState(long checkpointId) { // Save remaining splits and assignment info return new JdbcSourceState(new 
ArrayList<>(pendingSplits), assignedSplits); } } ``` ### 3.2 SourceReader Interface The reader runs on workers and performs actual data reading. ```java public interface SourceReader extends AutoCloseable, CheckpointListener { /** * Called when reader starts */ void open() throws Exception; /** * Poll next batch of records (non-blocking or timeout) */ void pollNext(Collector output) throws Exception; /** * Snapshot reader state for checkpoint (typically the current splits/positions). */ List snapshotState(long checkpointId) throws Exception; /** * Add newly assigned splits. */ void addSplits(List splits); /** * Signal no more splits will be assigned. */ void handleNoMoreSplits(); /** * Handle custom event from enumerator */ default void handleSourceEvent(SourceEvent sourceEvent) {} /** * Close reader */ void close() throws IOException; /** * Context for interacting with framework */ interface Context { int getIndexOfSubtask(); Boundedness getBoundedness(); void signalNoMoreElement(); void sendSplitRequest(); void sendSourceEventToEnumerator(SourceEvent sourceEvent); } } ``` **Key Responsibilities**: - **Data Reading**: Pull records from assigned splits - **Progress Tracking**: Track offset/position within each split - **State Management**: Snapshot split progress for recovery - **Split Management**: Handle split assignment, completion, and removal **Implementation Example**: ```java public class JdbcSourceReader implements SourceReader { private final Queue pendingSplits = new LinkedList<>(); private JdbcSourceSplit currentSplit; private ResultSet currentResultSet; @Override public void pollNext(Collector output) throws Exception { if (currentResultSet == null) { // Fetch next split currentSplit = pendingSplits.poll(); if (currentSplit == null) { context.sendSplitRequest(); // Request more splits return; } // Execute query for current split currentResultSet = executeQuery(currentSplit); } // Read batch of rows int count = 0; boolean exhausted = false; while (count < BATCH_SIZE) { 
if (!currentResultSet.next()) { exhausted = true; break; } SeaTunnelRow row = convertToRow(currentResultSet); output.collect(row); count++; } // Check if split completed if (exhausted) { currentResultSet.close(); currentResultSet = null; currentSplit = null; } } @Override public void addSplits(List splits) { pendingSplits.addAll(splits); } @Override public List snapshotState(long checkpointId) { // Save current split and offset List states = new ArrayList<>(); if (currentSplit != null) { states.add(new JdbcSourceState(currentSplit, currentRow)); } pendingSplits.forEach(split -> states.add(new JdbcSourceState(split, 0))); return states; } } ``` ### 3.3 SourceEvent (Custom Communication) Allows enumerator and reader to exchange custom messages. ```java public interface SourceEvent extends Serializable { } // Example: Reader notifies enumerator of discovered partitions public class PartitionDiscoveredEvent implements SourceEvent { private final List newPartitions; } // Example: Enumerator notifies reader of configuration change public class ConfigChangeEvent implements SourceEvent { private final Map newConfig; } ``` **Use Cases**: - Dynamic partition discovery (Kafka, HDFS) - Runtime configuration changes - Custom coordination logic ### 3.4 Code References **API Interfaces**: - [SeaTunnelSource.java](../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SeaTunnelSource.java) - [SourceSplitEnumerator.java](../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SourceSplitEnumerator.java) - [SourceReader.java](../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SourceReader.java) - [SourceSplit.java](../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SourceSplit.java) **Example Implementations**: - JDBC Source: `seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/source/` - Kafka Source: 
`seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/source/` - File Source: `seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/` ## 4. Design Considerations ### 4.1 Design Trade-offs #### Enumerator-Reader Separation **Pros**: - Clean separation of coordination (master) and execution (worker) - Enumerator can reassign splits without reader knowledge - Centralized coordination simplifies split assignment logic - Fault tolerance: enumerator and reader fail independently **Cons**: - Additional network communication (split assignment messages) - More complex API for connector developers - Potential bottleneck if enumerator is slow **Mitigation**: - Asynchronous split assignment - Batch split requests/assignments - Lazy split discovery #### Split Granularity **Coarse-grained splits** (few large splits): - **Pro**: Less coordination overhead - **Con**: Poor load balancing, longer recovery time **Fine-grained splits** (many small splits): - **Pro**: Better load balancing, faster recovery - **Con**: Higher coordination overhead **Guideline**: Choose split granularity based on source capabilities, expected parallelism, and checkpoint/recovery cost. 
### 4.2 Performance Considerations #### Batch Reading ```java @Override public void pollNext(Collector output) throws Exception { // Read batch instead of single record for (int i = 0; i < BATCH_SIZE && hasNext(); i++) { output.collect(readNextRow()); } } ``` **Benefits**: - Amortize per-record overhead - Better CPU cache utilization - Reduce lock contention #### Non-blocking Poll ```java @Override public void pollNext(Collector output) throws Exception { // Return immediately if no data available if (!hasNext()) { return; // Framework will call again later } output.collect(readNextRow()); } ``` **Benefits**: - Avoid blocking worker thread - Enable backpressure handling - Better resource utilization #### Connection Pooling ```java public class JdbcSourceReader { private final HikariDataSource dataSource; // Connection pool @Override public void pollNext(Collector output) { try (Connection conn = dataSource.getConnection()) { // Reuse pooled connections } } } ``` ### 4.3 Extensibility #### Custom Split Assignment Strategy ```java public class CustomEnumerator implements SourceSplitEnumerator<...> { @Override public void handleSplitRequest(int subtaskId) { // Custom logic: assign splits based on data locality JdbcSourceSplit split = findClosestSplit(subtaskId); if (split == null) { context.signalNoMoreSplits(subtaskId); return; } context.assignSplit(subtaskId, Collections.singletonList(split)); } private JdbcSourceSplit findClosestSplit(int subtaskId) { // Check worker location and assign split on same rack/region WorkerLocation location = getWorkerLocation(subtaskId); JdbcSourceSplit local = pendingSplits.stream() .filter(split -> split.location().equals(location)) .findFirst() .orElse(null); if (local == null) { return pendingSplits.poll(); } pendingSplits.remove(local); return local; } } ``` #### Dynamic Split Discovery ```java public class KafkaSourceSplitEnumerator { @Override public void run() throws Exception { // Discover initial partitions discoverPartitions(); // Periodically 
check for new partitions scheduledExecutor.scheduleAtFixedRate( this::discoverPartitions, 60, 60, TimeUnit.SECONDS ); } private void 
discoverPartitions() { List newPartitions = kafkaAdmin.listPartitions(); // Assign new partitions to readers assignNewPartitions(newPartitions); } } ``` ## 5. Best Practices ### 5.1 Usage Recommendations **1. Split Sizing** - Files: split by file/offset ranges according to file format and I/O characteristics - Databases: split by primary key / partition key ranges (or other stable predicates) - Message queues: use native partitions (e.g., Kafka partitions) **2. State Management** - Keep split/reader state small and stable across versions - Use offsets/positions instead of buffered data - Serialize efficiently (Kryo, Protobuf) **3. Error Handling** ```java @Override public void pollNext(Collector output) throws Exception { try { // Read data } catch (TransientException e) { // Retry transient errors Thread.sleep(1000); retry(); } catch (FatalException e) { // Fatal errors should propagate throw e; } } ``` **4. Resource Management** ```java @Override public void close() throws IOException { // Always close resources if (resultSet != null) resultSet.close(); if (connection != null) connection.close(); if (dataSource != null) dataSource.close(); } ``` ### 5.2 Common Pitfalls **1. Blocking pollNext()** ```java // ❌ BAD: Blocks indefinitely public void pollNext(Collector output) { while (true) { Record record = queue.take(); // Blocks until data available output.collect(record); } } // ✅ GOOD: Non-blocking or timeout public void pollNext(Collector output) { Record record = queue.poll(100, TimeUnit.MILLISECONDS); if (record != null) { output.collect(record); } } ``` **2. Large State** ```java // ❌ BAD: Buffer entire split in state public class BadReaderState { private List bufferedRows; // May be huge! } // ✅ GOOD: Only track offset public class GoodReaderState { private long currentOffset; // Small and efficient } ``` **3. 
Forgetting to Request Splits** ```java // ❌ BAD: Reader never gets splits public void pollNext(Collector output) { if (pendingSplits.isEmpty()) { return; // Oops, should request more splits! } } // ✅ GOOD: Explicitly request splits public void pollNext(Collector output) { if (pendingSplits.isEmpty()) { context.sendSplitRequest(); return; } } ``` ### 5.3 Debugging Tips **1. Enable Debug Logging** ```java private static final Logger LOG = LoggerFactory.getLogger(JdbcSourceReader.class); public void pollNext(Collector output) { LOG.debug("Polling split: {}, offset: {}", currentSplit.splitId(), currentOffset); // ... } ``` **2. Track Metrics** ```java public class JdbcSourceReader { private long recordsRead = 0; private long bytesRead = 0; public void pollNext(Collector output) { SeaTunnelRow row = readRow(); recordsRead++; bytesRead += row.getBytesSize(); output.collect(row); } } ``` **3. Test Split Reassignment** ```java // Simulate reader failure to test split recovery @Test public void testSplitReassignment() { // Assign splits to reader 0 enumerator.handleSplitRequest(0); // Simulate reader 0 failure enumerator.addSplitsBack(assignedSplits, 0); // New reader 1 should get those splits enumerator.registerReader(1); enumerator.handleSplitRequest(1); // Verify splits were reassigned assertThat(assignedSplits).isNotEmpty(); } ``` ## 6. Related Resources - [Architecture Overview](../overview.md) - [Design Philosophy](../design-philosophy.md) - [Sink Architecture](sink-architecture.md) - [Checkpoint Mechanism](../fault-tolerance/checkpoint-mechanism.md) - [How to Create Your Connector](../../developer/how-to-create-your-connector.md) ## 7. 
References ### Example Connectors - **Simple Source**: FakeSource (generates test data) - **File Source**: FileSource (local/HDFS/S3 files) - **Database Source**: JdbcSource (JDBC-compatible databases) - **Streaming Source**: KafkaSource (Apache Kafka) - **CDC Source**: MySQLCDCSource (MySQL binlog) ### Further Reading - Apache Flink FLIP-27: ["Refactored Source API"](https://cwiki.apache.org/confluence/display/FLINK/FLIP-27%3A+Refactor+Source+Interface) - Kafka Consumer: [Consumer Groups and Partition Assignment](https://kafka.apache.org/documentation/#consumerconfigs) ================================================ FILE: docs/en/architecture/api-design/translation-layer.md ================================================ --- sidebar_position: 1 title: Translation Layer --- # Translation Layer Architecture ## 1. Overview ### 1.1 Problem Background SeaTunnel provides a unified connector API, but jobs need to run on different execution engines: - **Engine Diversity**: Flink, Spark, SeaTunnel Engine (Zeta) have different APIs - **Code Duplication**: Without translation, each connector needs 3 implementations - **Maintenance Burden**: Bug fixes require changes in all implementations - **API Evolution**: Engine API changes break connectors - **User Experience**: Users want consistent behavior across engines ### 1.2 Design Goals SeaTunnel's translation layer aims to: 1. **Enable Portability**: Same connector runs on any engine 2. **Hide Complexity**: Connector developers only learn SeaTunnel API 3. **Maintain Fidelity**: Preserve semantic guarantees across engines 4. **Minimize Overhead**: Keep translation overhead low (depends on connectors and type conversions) 5. 
**Support Evolution**: Isolate connectors from engine API changes ### 1.3 Architecture Overview ``` ┌──────────────────────────────────────────────────────────────┐ │ SeaTunnel API Layer │ │ (Engine-Independent Connector Interface) │ │ │ │ SeaTunnelSource SeaTunnelSink SeaTunnelTransform │ └──────────────────────────────────────────────────────────────┘ │ │ Translation Layer ┌─────────────┼─────────────┐ ▼ ▼ ▼ ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ │ Flink Adapter │ │ Spark Adapter │ │ Zeta (Native) │ │ │ │ │ │ │ │ FlinkSource │ │ SparkSource │ │ Direct │ │ FlinkSink │ │ SparkSink │ │ Execution │ └──────────────────┘ └──────────────────┘ └──────────────────┘ │ │ │ ▼ ▼ ▼ ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ │ Apache Flink │ │ Apache Spark │ │ SeaTunnel Engine │ │ Runtime │ │ Runtime │ │ (Zeta) │ └──────────────────┘ └──────────────────┘ └──────────────────┘ ``` ## 2. Flink Translation Layer ### 2.1 FlinkSource Adapter Adapts `SeaTunnelSource` to Flink's `Source` interface. ```java public class FlinkSource implements Source, EnumeratorStateWrapper> { // Wrapped SeaTunnel source private final SeaTunnelSource seaTunnelSource; @Override public Boundedness getBoundedness() { // Delegate to SeaTunnel source return seaTunnelSource.getBoundedness() == Boundedness.BOUNDED ? 
Boundedness.BOUNDED : Boundedness.CONTINUOUS_UNBOUNDED; } @Override public SourceReader> createReader( SourceReaderContext readerContext ) { // Create SeaTunnel reader with adapted context org.apache.seatunnel.api.source.SourceReader seaTunnelReader = seaTunnelSource.createReader(new FlinkSourceReaderContext(readerContext)); // Wrap in Flink adapter return new FlinkSourceReader<>(seaTunnelReader, readerContext); } @Override public SplitEnumerator, EnumeratorStateWrapper> createEnumerator(SplitEnumeratorContext> context) { // Create SeaTunnel enumerator with adapted context SourceSplitEnumerator seaTunnelEnumerator = seaTunnelSource.createEnumerator( new FlinkSourceSplitEnumeratorContext<>(context) ); // Wrap in Flink adapter return new FlinkSourceEnumerator<>(seaTunnelEnumerator, context); } @Override public SimpleVersionedSerializer> getSplitSerializer() { // Adapt SeaTunnel serializer to Flink serializer return new FlinkSimpleVersionedSerializer<>( seaTunnelSource.getSplitSerializer() ); } } ``` ### 2.2 FlinkSourceReader Adapter ```java public class FlinkSourceReader implements SourceReader> { private final org.apache.seatunnel.api.source.SourceReader seaTunnelReader; private final SourceReaderContext flinkContext; @Override public void start() { // Delegate to SeaTunnel reader try { seaTunnelReader.open(); } catch (Exception e) { throw new FlinkRuntimeException("Failed to open SeaTunnel reader", e); } } @Override public InputStatus pollNext(ReaderOutput output) { try { // Adapt output collector CollectorAdapter collector = new CollectorAdapter<>(output); // Poll from SeaTunnel reader seaTunnelReader.pollNext(collector); if (collector.hasRecords()) { return InputStatus.MORE_AVAILABLE; } else { return InputStatus.NOTHING_AVAILABLE; } } catch (Exception e) { throw new FlinkRuntimeException("Failed to poll from SeaTunnel reader", e); } } @Override public void addSplits(List> splits) { // Unwrap and delegate List unwrappedSplits = splits.stream() 
.map(SplitWrapper::getSplit) .collect(Collectors.toList()); seaTunnelReader.addSplits(unwrappedSplits); } @Override public void notifyCheckpointComplete(long checkpointId) { try { seaTunnelReader.notifyCheckpointComplete(checkpointId); } catch (Exception e) { throw new FlinkRuntimeException("Failed to notify checkpoint complete", e); } } @Override public List> snapshotState(long checkpointId) { try { List state = seaTunnelReader.snapshotState(checkpointId); // Wrap splits for Flink return state.stream() .map(SplitWrapper::new) .collect(Collectors.toList()); } catch (Exception e) { throw new FlinkRuntimeException("Failed to snapshot state", e); } } } ``` ### 2.3 FlinkSourceEnumerator Adapter ```java public class FlinkSourceEnumerator implements SplitEnumerator, EnumeratorStateWrapper> { private final SourceSplitEnumerator seaTunnelEnumerator; private final SplitEnumeratorContext> flinkContext; @Override public void start() { try { seaTunnelEnumerator.open(); seaTunnelEnumerator.run(); } catch (Exception e) { throw new FlinkRuntimeException("Failed to start enumerator", e); } } @Override public void handleSplitRequest(int subtaskId, @Nullable String requesterHostname) { // Delegate to SeaTunnel enumerator seaTunnelEnumerator.handleSplitRequest(subtaskId); } @Override public void addSplitsBack(List> splits, int subtaskId) { // Unwrap and delegate List unwrappedSplits = splits.stream() .map(SplitWrapper::getSplit) .collect(Collectors.toList()); seaTunnelEnumerator.addSplitsBack(unwrappedSplits, subtaskId); } @Override public void addReader(int subtaskId) { seaTunnelEnumerator.registerReader(subtaskId); } @Override public EnumeratorStateWrapper snapshotState(long checkpointId) { try { StateT state = seaTunnelEnumerator.snapshotState(checkpointId); return new EnumeratorStateWrapper<>(state); } catch (Exception e) { throw new FlinkRuntimeException("Failed to snapshot enumerator state", e); } } } ``` ### 2.4 Context Adapters **FlinkSourceReaderContext**: ```java public class 
FlinkSourceReaderContext implements org.apache.seatunnel.api.source.SourceReader.Context { private final SourceReaderContext flinkContext; @Override public int getIndexOfSubtask() { return flinkContext.getIndexOfThisSubtask(); } @Override public void sendSplitRequest() { // Flink automatically handles split requests // No explicit API needed } @Override public void sendSourceEventToEnumerator(SourceEvent event) { flinkContext.sendSourceEventToCoordinator( new SourceEventWrapper(event) ); } } ``` **FlinkSourceSplitEnumeratorContext**: ```java public class FlinkSourceSplitEnumeratorContext implements SourceSplitEnumerator.Context { private final SplitEnumeratorContext> flinkContext; @Override public int currentParallelism() { return flinkContext.currentParallelism(); } @Override public Set registeredReaders() { return flinkContext.registeredReaders().keySet(); } @Override public void assignSplit(int subtaskId, List splits) { // Wrap and delegate List> wrappedSplits = splits.stream() .map(SplitWrapper::new) .collect(Collectors.toList()); flinkContext.assignSplits(new SplitsAssignment<>( Collections.singletonMap(subtaskId, wrappedSplits) )); } @Override public void signalNoMoreSplits(int subtaskId) { flinkContext.signalNoMoreSplits(subtaskId); } @Override public void sendEventToSourceReader(int subtaskId, SourceEvent event) { flinkContext.sendEventToSourceReader(subtaskId, new SourceEventWrapper(event)); } } ``` ### 2.5 FlinkSink Adapter ```java public class FlinkSink implements Sink { private final SeaTunnelSink seaTunnelSink; @Override public SinkWriter createWriter(InitContext context) { // Create SeaTunnel writer with adapted context org.apache.seatunnel.api.sink.SinkWriter seaTunnelWriter = seaTunnelSink.createWriter(new FlinkSinkWriterContext(context)); // Wrap in Flink adapter return new FlinkSinkWriter<>(seaTunnelWriter); } @Override public Optional> createCommitter() { return seaTunnelSink.createCommitter() .map(FlinkCommitter::new); } @Override public 
Optional> createGlobalCommitter() { return seaTunnelSink.createAggregatedCommitter() .map(FlinkGlobalCommitter::new); } @Override public Optional> getCommittableSerializer() { return seaTunnelSink.getCommitInfoSerializer() .map(FlinkSimpleVersionedSerializer::new); } @Override public Optional> getWriterStateSerializer() { return seaTunnelSink.getWriterStateSerializer() .map(FlinkSimpleVersionedSerializer::new); } } ``` ### 2.6 FlinkSinkWriter Adapter ```java public class FlinkSinkWriter implements SinkWriter { private final org.apache.seatunnel.api.sink.SinkWriter seaTunnelWriter; private long checkpointId; @Override public void write(IN element, Context context) throws IOException { // Delegate to SeaTunnel writer seaTunnelWriter.write(element); } @Override public List prepareCommit(boolean flush) throws IOException { Optional commitInfo = seaTunnelWriter.prepareCommit(checkpointId); return commitInfo.map(Collections::singletonList) .orElse(Collections.emptyList()); } @Override public List snapshotState(long checkpointId) throws IOException { return seaTunnelWriter.snapshotState(checkpointId); } @Override public void close() throws Exception { seaTunnelWriter.close(); } } ``` ## 3. Spark Translation Layer Note: Spark 2.4 and Spark 3.x use different datasource APIs. SeaTunnel maintains separate Spark translation modules/adapters per Spark major version, so the exact adapter types and lifecycle hooks may differ. ### 3.1 SparkSource Adapter Adapts `SeaTunnelSource` to Spark's `DataSourceReader` interface. 
```java public class SparkSource implements DataSourceReader { private final SeaTunnelSource seaTunnelSource; @Override public StructType readSchema() { // Convert SeaTunnel schema to Spark schema CatalogTable catalogTable = seaTunnelSource.getProducedCatalogTables().get(0); return SparkTypeConverter.convert(catalogTable.getTableSchema()); } @Override public List> planInputPartitions() { // Create enumerator and generate splits SourceSplitEnumerator enumerator = seaTunnelSource.createEnumerator(new SparkEnumeratorContext()); try { enumerator.open(); enumerator.run(); // Collect all splits List splits = collectAllSplits(enumerator); // Wrap each split as Spark InputPartition return splits.stream() .map(split -> new SparkInputPartition<>(seaTunnelSource, split)) .collect(Collectors.toList()); } catch (Exception e) { throw new RuntimeException("Failed to plan input partitions", e); } } } ``` ### 3.2 SparkInputPartition ```java public class SparkInputPartition implements InputPartition { private final SeaTunnelSource seaTunnelSource; private final SplitT split; @Override public InputPartitionReader createPartitionReader() { // Create SeaTunnel reader org.apache.seatunnel.api.source.SourceReader seaTunnelReader = seaTunnelSource.createReader(new SparkReaderContext()); // Wrap in Spark adapter return new SparkPartitionReader<>(seaTunnelReader, split); } } ``` ### 3.3 SparkPartitionReader ```java public class SparkPartitionReader implements InputPartitionReader { private final org.apache.seatunnel.api.source.SourceReader seaTunnelReader; private final Queue buffer = new LinkedList<>(); public SparkPartitionReader( org.apache.seatunnel.api.source.SourceReader reader, SplitT split ) { this.seaTunnelReader = reader; try { seaTunnelReader.open(); seaTunnelReader.addSplits(Collections.singletonList(split)); } catch (Exception e) { throw new RuntimeException("Failed to open reader", e); } } @Override public boolean next() throws IOException { if (!buffer.isEmpty()) { return 
true; } // Poll from SeaTunnel reader try { seaTunnelReader.pollNext(new Collector() { @Override public void collect(T record) { // Convert to Spark InternalRow InternalRow row = SparkTypeConverter.convert(record); buffer.offer(row); } }); return !buffer.isEmpty(); } catch (Exception e) { throw new IOException("Failed to poll next", e); } } @Override public InternalRow get() { return buffer.poll(); } @Override public void close() throws IOException { try { seaTunnelReader.close(); } catch (Exception e) { throw new IOException("Failed to close reader", e); } } } ``` ### 3.4 SparkSink Adapter ```java public class SparkSink implements DataSourceWriter { private final SeaTunnelSink seaTunnelSink; @Override public DataWriterFactory createWriterFactory() { return new SparkDataWriterFactory<>(seaTunnelSink); } @Override public boolean useCommitCoordinator() { // Use commit coordinator if sink has committer return seaTunnelSink.createCommitter().isPresent(); } @Override public void commit(WriterCommitMessage[] messages) { Optional> committerOpt = seaTunnelSink.createCommitter(); if (committerOpt.isPresent()) { SinkCommitter committer = committerOpt.get(); // Extract commit infos from messages List commitInfos = Arrays.stream(messages) .map(msg -> ((SparkCommitMessage) msg).getCommitInfo()) .collect(Collectors.toList()); // Commit try { List failed = committer.commit(commitInfos); if (!failed.isEmpty()) { throw new IOException("Some commits failed: " + failed); } } catch (IOException e) { throw new RuntimeException("Failed to commit", e); } } } @Override public void abort(WriterCommitMessage[] messages) { // Handle abort Optional> committerOpt = seaTunnelSink.createCommitter(); if (committerOpt.isPresent()) { SinkCommitter committer = committerOpt.get(); List commitInfos = Arrays.stream(messages) .map(msg -> ((SparkCommitMessage) msg).getCommitInfo()) .collect(Collectors.toList()); try { committer.abort(commitInfos); } catch (IOException e) { throw new 
RuntimeException("Failed to abort", e); } } } } ``` ## 4. Serialization Adapters ### 4.1 FlinkSimpleVersionedSerializer ```java public class FlinkSimpleVersionedSerializer implements SimpleVersionedSerializer { private final org.apache.seatunnel.api.serialization.Serializer seaTunnelSerializer; @Override public int getVersion() { // Delegate to SeaTunnel serializer return seaTunnelSerializer.getVersion(); } @Override public byte[] serialize(T obj) throws IOException { return seaTunnelSerializer.serialize(obj); } @Override public T deserialize(int version, byte[] serialized) throws IOException { return seaTunnelSerializer.deserialize(serialized); } } ``` ## 5. Type Conversion ### 5.1 Spark Type Conversion ```java public class SparkTypeConverter { public static StructType convert(TableSchema schema) { List fields = new ArrayList<>(); for (Column column : schema.getColumns()) { StructField field = new StructField( column.getName(), convertDataType(column.getDataType()), column.isNullable(), Metadata.empty() ); fields.add(field); } return new StructType(fields.toArray(new StructField[0])); } private static DataType convertDataType(SeaTunnelDataType seaTunnelType) { switch (seaTunnelType.getSqlType()) { case TINYINT: return DataTypes.ByteType; case SMALLINT: return DataTypes.ShortType; case INT: return DataTypes.IntegerType; case BIGINT: return DataTypes.LongType; case FLOAT: return DataTypes.FloatType; case DOUBLE: return DataTypes.DoubleType; case DECIMAL: DecimalType decimalType = (DecimalType) seaTunnelType; return DataTypes.createDecimalType( decimalType.getPrecision(), decimalType.getScale() ); case STRING: return DataTypes.StringType; case BOOLEAN: return DataTypes.BooleanType; case DATE: return DataTypes.DateType; case TIMESTAMP: return DataTypes.TimestampType; case BYTES: return DataTypes.BinaryType; case ARRAY: ArrayType arrayType = (ArrayType) seaTunnelType; return DataTypes.createArrayType( convertDataType(arrayType.getElementType()) ); case MAP: MapType 
mapType = (MapType) seaTunnelType; return DataTypes.createMapType( convertDataType(mapType.getKeyType()), convertDataType(mapType.getValueType()) ); default: throw new UnsupportedOperationException( "Unsupported type: " + seaTunnelType); } } } ``` ## 6. Performance Considerations ### 6.1 Translation Overhead Translation overhead depends on connector implementations, serialization, and type conversion complexity. Prefer measuring in your own workload rather than relying on fixed numbers. ### 6.2 Optimization Techniques **Batch Type Conversion**: ```java // ❌ BAD: Convert per record public void collect(SeaTunnelRow record) { InternalRow sparkRow = convertToSparkRow(record); output.collect(sparkRow); } // ✅ GOOD: Batch convert (amortize overhead) public void collect(List records) { InternalRow[] sparkRows = batchConvertToSparkRows(records); for (InternalRow row : sparkRows) { output.collect(row); } } ``` **Avoid Unnecessary Wrapping**: ```java // If Split already serializable, don't wrap public class SplitWrapper { private final T split; // Lazy wrapping: only wrap when needed for serialization public byte[] serialize() { if (split instanceof Serializable) { return directSerialize(split); // No wrapping overhead } else { return wrapAndSerialize(split); // Fallback } } } ``` ## 7. Limitations and Workarounds ### 7.1 Engine-Specific Features **Problem**: Some engine features have no SeaTunnel equivalent. **Example**: Flink's `WatermarkStrategy` ```java // Flink-specific watermark strategy cannot be expressed in SeaTunnel API WatermarkStrategy watermarkStrategy = WatermarkStrategy .forBoundedOutOfOrderness(Duration.ofSeconds(5)); ``` **Workaround**: Provide engine-specific configuration ```hocon source { Kafka { # SeaTunnel config topic = "my_topic" # Engine-specific config (for Flink only) flink.watermark.strategy = "bounded-out-of-orderness" flink.watermark.max-out-of-orderness = "5s" } } ``` ### 7.2 Type System Differences **Problem**: Type systems don't fully align. 
**Example**: Spark has `TimestampType`, Flink has `LocalZonedTimestampType` and `TimestampType`. **Workaround**: Use least common denominator ```java // SeaTunnel uses generic TIMESTAMP // Translation layer maps to appropriate engine type based on config ``` ## 8. Best Practices ### 8.1 Connector Development **DO**: - Implement SeaTunnel API only - Test with multiple engines - Use SeaTunnel types **DON'T**: - Reference engine-specific APIs in connector code - Assume specific engine behavior - Use engine-specific optimizations ### 8.2 Testing **Test on All Engines**: ```java @RunWith(Parameterized.class) public class ConnectorTest { @Parameters public static Collection engines() { return Arrays.asList(new Object[][]{ {"flink"}, {"spark"}, {"seatunnel"} }); } @Test public void testExactlyOnce(String engine) { // Run same test on different engines runJobOnEngine(engine, jobConfig); verifyResults(); } } ``` ## 9. Related Resources - [Source Architecture](../api-design/source-architecture.md) - [Sink Architecture](../api-design/sink-architecture.md) - [Design Philosophy](../design-philosophy.md) ## 10. References ### Key Source Files - Flink Translation: `seatunnel-translation/seatunnel-translation-flink/` - Spark Translation: `seatunnel-translation/seatunnel-translation-spark/` - Base Interfaces: `seatunnel-api/src/main/java/org/apache/seatunnel/api/` ### Further Reading - [Apache Flink Source API](https://nightlies.apache.org/flink/flink-docs-stable/docs/dev/datastream/sources/) - [Apache Spark Data Source V2](https://spark.apache.org/docs/latest/sql-data-sources.html) ================================================ FILE: docs/en/architecture/design-philosophy.md ================================================ --- sidebar_position: 2 title: Design Philosophy --- # SeaTunnel Design Philosophy ## 1. Overview This document explains the core design principles, philosophies, and trade-offs that shaped SeaTunnel's architecture. 
Understanding these principles helps contributors make consistent design decisions and users understand the system's strengths and limitations. ## 2. Core Design Principles ### 2.1 Engine Independence **Principle**: Decouple connector logic from execution engines. **Motivation**: - Users may have existing infrastructure investments (Flink, Spark clusters) - Different engines suit different scenarios (batch vs streaming, resource constraints) - Connector developers shouldn't need to understand multiple engine APIs **Implementation**: - Unified SeaTunnel API layer abstracts engine-specific details - Translation layer adapts SeaTunnel API to engine-specific APIs - Aim for maximum connector reuse across engines (some engine-specific adaptation may still be required via the translation layer) **Trade-offs**: - **Pro**: High reusability - write once, run across engines via adapters - **Pro**: Easier connector development - single API to learn - **Con**: Cannot leverage engine-specific optimizations - **Con**: Additional translation overhead - **Mitigation**: Translation layer is thin and optimized; most overhead is in I/O, not translation **Example**: Connectors only implement SeaTunnel API abstractions (Source/Sink/Transform), and different execution engines complete adaptation through the translation layer; thus connector logic is decoupled from engine API changes. ### 2.2 Separation of Coordination and Execution **Principle**: Separate control logic (coordination) from data processing (execution). 
**Motivation**: - Coordination logic is single-threaded and lightweight - Execution logic is parallel and resource-intensive - Fault tolerance requires independent state management for each **Implementation Principle**: **Coordination Layer (Master-side)**: - Location: Runs on master nodes with global view - Core Responsibilities: Resource discovery, work distribution, failure detection, state coordination - Characteristics: Single-threaded, lightweight, no actual data processing - Managed State: Assignment plan, pending work units, global progress tracking **Execution Layer (Worker-side)**: - Location: Runs on worker nodes with independent parallel execution - Core Responsibilities: Local data processing, progress reporting, checkpoint participation - Characteristics: Multi-threaded, resource-intensive, handles large data volumes - Managed State: Local processing progress, buffered data, execution context **Communication Mechanism**: - Coordination layer → Execution layer: Dispatches work via events (e.g., assign new data splits) - Execution layer → Coordination layer: Reports progress via messages (e.g., split completed, request new work) - During checkpoints: Each layer snapshots its own state independently **Trade-offs**: - **Pro**: Clear separation of concerns - **Pro**: Enumerator can reassign splits on failures - **Pro**: Committer enables global transaction coordination - **Con**: Additional communication overhead - **Con**: More complex API for connector developers - **Mitigation**: Reasonable defaults; simple connectors can use trivial enumerators/committers **Example**: - Master side: Responsible for "discovering/generating work units (splits) + assignment + reclamation + state snapshots" - Worker side: Responsible for "executing reads/writes + progress reporting + checkpoint participation" The key reason for this design: Fault tolerance requires distinguishing between "control state" (assigned/pending splits) and "execution progress" (offset/position 
per split) to enable precise recovery and fast reassignment after failures. ### 2.3 Split-based Parallelism **Principle**: Divide data sources into independently processable splits. **Motivation**: - Enable parallel processing without tight coordination - Support dynamic load balancing and fault recovery - Provide checkpoint granularity (per-split progress) **Implementation**: - Data sources divided into splits (file blocks, DB partitions, Kafka partitions, etc.) - Enumerator generates splits lazily or eagerly - Readers process splits independently - Unprocessed splits can be reassigned on failure **Trade-offs**: - **Pro**: Excellent scalability - add workers to process more splits - **Pro**: Fine-grained fault recovery - only failed splits need reprocessing - **Pro**: Dynamic load balancing - assign more splits to idle workers - **Con**: Split generation overhead for some sources - **Con**: Requires state tracking per split - **Mitigation**: Lazy split generation; split state is lightweight **Example**: ```java // JDBC Source: Split by partition or chunk class JdbcSourceSplit implements SourceSplit { private final String splitId; private final String query; // SELECT * FROM table WHERE id >= ? AND id < ? private final long startOffset; private final long endOffset; } // File Source: Split by file or byte range class FileSplit implements SourceSplit { private final String filePath; private final long startOffset; private final long length; } ``` ### 2.4 Exactly-Once Semantics through Two-Phase Commit **Principle**: Guarantee exactly-once end-to-end data delivery. **Motivation**: - Data integration must not lose or duplicate data - Failures can occur at any time (network, process crashes) - External systems require transactional guarantees **Implementation Principle**: Two-phase commit protocol separates data writing into two independent phases (prepare and commit), with a separate abort path for failed checkpoints: 1. 
**Prepare Phase**: - Timing: Triggered when checkpoint barrier arrives - Action: Writer generates "committable but not yet committed" credentials (e.g., transaction ID, temp file path) - Constraint: No externally visible side effects (data not visible to external systems) - State: Credential information persisted with checkpoint 2. **Commit Phase**: - Timing: After checkpoint completes successfully - Action: Coordinator atomically commits changes using credentials (e.g., commit transaction, move files) - Effect: Data becomes visible to external systems - Guarantee: Idempotent - repeated commits have no side effects 3. **Abort Handling**: - Timing: When checkpoint fails or times out - Action: Clean up temporary resources from prepare phase (e.g., rollback transaction, delete temp files) - Effect: Ensures no partial writes or inconsistent state **Trade-offs**: - **Pro**: Strong consistency guarantee - **Pro**: Automatic recovery from failures - **Con**: Requires transactional support in sinks (or idempotent operations) - **Con**: Increased latency (data visible only after commit) - **Con**: Additional state for commit info - **Mitigation**: Optional feature; at-least-once mode available for non-transactional sinks **Example**: A typical exactly-once implementation follows this pattern: "the writer first generates committable credentials (commit info), and after checkpoint succeeds, the coordinator performs the final commit". This approach delays side effects (visible changes to external systems) until after checkpoint success, avoiding duplicate visible writes during failure recovery. ### 2.5 Schema as First-Class Citizen **Principle**: Treat schema as explicit, typed metadata propagated through pipelines. 
**Motivation**: - Data integration requires schema transformation and validation - Schema evolution (DDL changes) must be handled explicitly - Type mismatches should be caught early **Implementation**: - `CatalogTable` encapsulates complete table metadata - `TableSchema` defines structure (columns, primary key, constraints) - Schema propagated through Source → Transform → Sink - `SchemaChangeEvent` represents DDL changes (ADD/DROP/MODIFY columns) **Trade-offs**: - **Pro**: Type safety - validate schema at job submission - **Pro**: Schema evolution - handle DDL changes at runtime - **Pro**: Better error messages - schema mismatches detected early - **Con**: Additional complexity for schema-less sources - **Con**: Schema discovery overhead for some sources - **Mitigation**: Schema inference helpers; optional schema override **Example**: ```java // Source produces typed schema CatalogTable catalogTable = CatalogTable.of( tableId, TableSchema.builder() .column("id", DataTypes.BIGINT()) .column("name", DataTypes.STRING()) .primaryKey("id") .build() ); // Transform validates and modifies schema public CatalogTable getProducedCatalogTable() { return inputCatalogTable.copy( TableSchema.builder() .column("id", DataTypes.BIGINT()) .column("name_upper", DataTypes.STRING()) // Transformed .build() ); } ``` ### 2.6 Plugin Architecture with Class Loader Isolation **Principle**: Connectors are plugins loaded dynamically with isolated dependencies. 
**Motivation**: - Avoid dependency conflicts (e.g., multiple JDBC driver versions) - Enable hot-pluggable connectors without core rebuild - Reduce core distribution size **Implementation**: - Java SPI for connector discovery - Each connector has isolated class loader - Shade plugin dependencies to avoid conflicts - Factory pattern for instantiation **Trade-offs**: - **Pro**: Dependency isolation - no version conflicts - **Pro**: Smaller core distribution - **Pro**: Easy to add third-party connectors - **Con**: Class loader complexity - **Con**: Some shared libraries (e.g., Guava) may have issues - **Mitigation**: Careful shading; shared common libraries in core **Example**: ``` seatunnel-engine/lib/ # Core libraries connector-jdbc/lib/ # JDBC driver (isolated) connector-kafka/lib/ # Kafka client (isolated) # Each connector loaded by separate ClassLoader ConnectorClassLoader(connector-jdbc) -> loads mysql-connector-java-8.0.26.jar ConnectorClassLoader(connector-kafka) -> loads kafka-clients-3.0.0.jar ``` ### 2.7 State Management with Checkpoint Storage Abstraction **Principle**: Decouple state management from storage implementation. **Motivation**: - Different deployments need different storage (HDFS, S3, local, OSS) - State size varies widely (KBs to TBs) - Storage durability and performance requirements differ **Implementation**: - `CheckpointStorage` abstraction (FileSystem, HDFS, S3, OSS) - Pluggable serialization for state - Incremental checkpoint support - Automatic state cleanup **Trade-offs**: - **Pro**: Flexibility - choose storage based on deployment - **Pro**: Incremental checkpoints reduce overhead - **Con**: Storage performance impacts checkpoint latency - **Con**: Requires distributed file system for production - **Mitigation**: Async checkpoint upload; configurable intervals ### 2.8 Multi-Table Synchronization **Principle**: Support synchronizing multiple tables in a single job. 
**Motivation**: - Database migration often involves hundreds of tables - Creating one job per table wastes resources - Schema evolution must apply to all tables **Implementation**: - `MultiTableSource` / `MultiTableSink` wrap individual table sources/sinks - `TablePath` routes records to correct table - Schema changes propagated per table - Replica support for throughput **Trade-offs**: - **Pro**: Resource efficiency - one job instead of hundreds - **Pro**: Consistent snapshot across tables - **Pro**: Centralized monitoring - **Con**: One table failure can affect others - **Con**: More complex error handling - **Mitigation**: Configurable error tolerance; per-table metrics ## 3. Architectural Trade-offs ### 3.1 Simplicity vs Performance **Choice**: Favor simplicity and correctness over extreme performance optimization. **Rationale**: - Data integration is I/O-bound, not CPU-bound - Correct semantics (exactly-once) more critical than raw speed - Simple code is maintainable and debuggable **Evidence**: - Network and disk I/O dominate processing time (> 90%) - Translation layer overhead is negligible (< 1%) - Code readability prioritized (e.g., clear state machine, no micro-optimizations) ### 3.2 Flexibility vs Ease of Use **Choice**: Provide reasonable defaults while allowing advanced customization. **Rationale**: - Most users want simple configuration - Power users need fine-grained control - Both needs can be met with layered API **Implementation**: - High-level config for common cases (e.g., `jdbc://host:port/db`) - Low-level options for experts (e.g., connection pool tuning) - Sensible defaults (parallelism, checkpoint interval, buffer size) ### 3.3 Generality vs Specialization **Choice**: General-purpose API with specialized implementations. 
**Rationale**: - Unified API simplifies learning and usage - Different sources have unique characteristics (bounded vs unbounded, splittability) - Specialization happens in connector implementations, not API **Example**: - `SourceSplitEnumerator` is general enough for files, databases, and message queues - File connector uses file-based splits - Kafka connector uses partition-based splits - JDBC connector uses query-based splits ### 3.4 Strong Consistency vs Latency **Choice**: Offer both exactly-once (high latency) and at-least-once (low latency) modes. **Rationale**: - Some applications require strong consistency (financial, billing) - Other applications tolerate duplicates for lower latency (logging, metrics) - Let users choose based on requirements **Configuration**: ```hocon env { checkpoint.mode = "EXACTLY_ONCE" # or "AT_LEAST_ONCE" checkpoint.interval = 60000 # ms } ``` ## 4. Evolution from V1 to V2 ### 4.1 V1 Limitations SeaTunnel V1 (pre-2.3.0) had significant architectural limitations: 1. **Engine-Specific Connectors**: Separate implementations for Spark and Flink 2. **No Unified API**: No abstraction layer, tight coupling to engines 3. **Limited Fault Tolerance**: Relied entirely on engine checkpointing 4. **No Schema Management**: Schema implicit, no evolution support 5. 
**Single-Table Only**: Multi-table synchronization not supported ### 4.2 V2 Improvements SeaTunnel V2 (2.3.0+) redesigned the architecture: | Aspect | V1 | V2 | |--------|----|----| | **API** | Engine-specific | Unified SeaTunnel API | | **Connectors** | Duplicated code | Single implementation | | **Fault Tolerance** | Engine-dependent | Explicit checkpoint protocol | | **Schema** | Implicit | Explicit CatalogTable | | **Multi-Table** | Not supported | Native support | | **Engine Support** | Spark, Flink | Spark, Flink, Zeta | | **Exactly-Once** | Partial | End-to-end with 2PC | ### 4.3 Migration Path V1 and V2 connectors coexist but use different APIs: - V1 connectors: `seatunnel-connectors/` (deprecated) - V2 connectors: `seatunnel-connectors-v2/` (recommended) V2 is the future; V1 is in maintenance mode. ## 5. Key Design Decisions ### 5.1 Why Separate Enumerator and Reader? **Alternative**: Single component handles both split generation and reading. **Decision**: Separate components. **Reasoning**: - Split generation is coordination logic (should run on master) - Data reading is execution logic (should run on workers) - Failure of one shouldn't affect the other - Allows split reassignment without reader restart ### 5.2 Why Three-Level Sink Commit (Writer → Committer → AggregatedCommitter)? **Alternative**: Two-level (Writer → Committer) or direct Writer commit. **Decision**: Optional three-level commit. **Reasoning**: - **Writer**: Parallel, stateful, per-task - **Committer**: Parallel, stateless, aggregates per-writer commits - **AggregatedCommitter**: Single-threaded, stateful, global coordinator Many sinks only need Writer + Committer; AggregatedCommitter is for complex cases (e.g., Hive table commit requiring single global operation). ### 5.3 Why LogicalDag → PhysicalPlan Separation? **Alternative**: Directly generate physical execution plan from config. **Decision**: Two-stage planning. 
**Reasoning**: - LogicalDag represents user intent (portable, engine-independent) - PhysicalPlan represents execution strategy (engine-specific, optimized) - Separation enables: - Cross-engine portability (same LogicalDag, different PhysicalPlans) - Optimization passes (fusion, split reassignment) - Testing (validate logical plan separately) ### 5.4 Why Pipeline-based Execution? **Alternative**: Single global task graph. **Decision**: Jobs divided into pipelines. **Reasoning**: - Independent checkpoint coordination per pipeline - Clearer failure boundaries - Easier to reason about data flow - Supports complex DAGs (multiple sources/sinks) ### 5.5 Why Not Use Engine-Native Checkpoint? **Alternative**: Rely entirely on Flink/Spark checkpoint mechanisms. **Decision**: Explicit SeaTunnel checkpoint protocol. **Reasoning**: - Engine independence - need consistent semantics across engines - Zeta engine wouldn't have checkpointing otherwise - More control over exactly-once semantics - Unified monitoring and observability However, for Flink translation, SeaTunnel checkpoints align with Flink checkpoints to avoid duplication. ## 6. Lessons Learned ### 6.1 What Worked Well 1. **Engine Independence**: Validated by successful Zeta engine addition without API changes 2. **Split-based Parallelism**: Scales well to 1000+ parallel tasks 3. **Explicit Schema**: Caught many bugs early, enabled schema evolution 4. **Two-Phase Commit**: Reliable exactly-once semantics ### 6.2 What Could Be Better 1. **API Complexity**: Enumerator/Committer adds learning curve for simple connectors 2. **Class Loader Issues**: Occasional conflicts with shaded dependencies 3. **Checkpoint Latency**: Large state causes checkpoint delays 4. **Documentation Gaps**: Architecture docs lagged behind code ### 6.3 If Starting Over 1. **Simplify API**: Provide higher-level abstractions for simple sources/sinks 2. **Async I/O Support**: First-class async API for non-blocking connectors 3. 
**Built-in Metrics**: Standardized metrics collection in API 4. **Schema Registry Integration**: Tighter integration with external schema registries ## 7. Conclusion SeaTunnel's architecture reflects careful trade-offs between competing concerns: - Engine independence vs engine-specific optimization - Simplicity vs flexibility - Consistency vs latency - Generality vs specialization The V2 redesign addressed major V1 limitations while establishing principles for long-term evolution. Understanding these design philosophies helps contributors make consistent decisions and users understand SeaTunnel's strengths and appropriate use cases. ## 8. References - [Architecture Overview](overview.md) - [Source Architecture](api-design/source-architecture.md) - [Sink Architecture](api-design/sink-architecture.md) - [Checkpoint Mechanism](fault-tolerance/checkpoint-mechanism.md) ### Academic Papers - Chandy-Lamport: ["Distributed Snapshots: Determining Global States of Distributed Systems"](https://lamport.azurewebsites.net/pubs/chandy.pdf) - Flink: ["Apache Flink: Stream and Batch Processing in a Single Engine"](https://asterios.katsifodimos.com/assets/publications/flink-deb.pdf) ================================================ FILE: docs/en/architecture/engine/dag-execution.md ================================================ --- sidebar_position: 2 title: DAG Execution Model --- # DAG Execution Model ## 1. Overview ### 1.1 Problem Background Distributed data processing requires transforming user intentions into executable distributed tasks: - **Abstraction Levels**: How to separate logical intent from physical execution? - **Optimization**: How to optimize task placement and data shuffling? - **Pipeline**: How to execute complex DAGs with multiple sources/sinks? - **Parallelism**: How to determine task parallelism and distribution? - **Fault Isolation**: How to limit failure impact to affected components? ### 1.2 Design Goals SeaTunnel's DAG execution model aims to: 1. 
**Separate Concerns**: Logical planning (user intent) vs physical execution (runtime details) 2. **Enable Optimization**: Task fusion, pipeline splitting, resource allocation 3. **Support Complex Topologies**: Multiple sources, sinks, branches, joins 4. **Facilitate Fault Tolerance**: Clear failure boundaries with independent checkpoints 5. **Maximize Parallelism**: Efficient parallel execution with minimal coordination ### 1.3 Execution Model Overview ``` User Config (HOCON) │ ▼ ┌─────────────────────┐ │ LogicalDag │ Logical Plan (What to do) │ • LogicalVertex │ - Source/Transform/Sink actions │ • LogicalEdge │ - Data dependencies │ • Parallelism │ - Logical parallelism └─────────────────────┘ │ (Plan Generation) ▼ ┌─────────────────────┐ │ PhysicalPlan │ Physical Plan (How to execute) │ • SubPlan[] │ - Multiple pipelines │ • Resources │ - Resource requirements │ • Scheduling │ - Deployment strategy └─────────────────────┘ │ (Pipeline Split) ▼ ┌─────────────────────┐ │ SubPlan (Pipeline) │ Independent Execution Unit │ • PhysicalVertex[] │ - Parallel task instances │ • CheckpointCoord │ - Independent checkpointing │ • PipelineLocation │ - Unique identifier └─────────────────────┘ │ (Task Deployment) ▼ ┌─────────────────────┐ │ PhysicalVertex │ Deployed Task Group │ • TaskGroup │ - Co-located tasks (fusion) │ • SlotProfile │ - Assigned resource slot │ • ExecutionState │ - Running state └─────────────────────┘ │ (Execution) ▼ ┌─────────────────────┐ │ SeaTunnelTask │ Actual Execution │ • Source/Transform │ - Data processing │ • /Sink Logic │ - State management └─────────────────────┘ ``` ## 2. LogicalDag: User Intent ### 2.1 Structure LogicalDag represents the user's job configuration in an engine-independent way. 
```java public class LogicalDag { // Vertices: Source, Transform, Sink actions private final Map<Long, LogicalVertex> logicalVertexMap; // Edges: Data flow dependencies private final Set<LogicalEdge> edges; // Job configuration private final JobConfig jobConfig; } ``` ### 2.2 LogicalVertex Represents a single action (Source/Transform/Sink) with parallelism. ```java public class LogicalVertex { private final long vertexId; private final Action action; // SourceAction, TransformChainAction, SinkAction private final int parallelism; // Number of parallel instances } ``` **Action Types**: - **SourceAction**: Wraps `SeaTunnelSource`, produces `CatalogTable` - **TransformChainAction**: Chain of `SeaTunnelTransform`, transforms schema - **SinkAction**: Wraps `SeaTunnelSink`, consumes `CatalogTable` **Example**: ```java // From config: // source { JDBC { ... parallelism = 4 } } // transform { Sql { ... parallelism = 8 } } // sink { Elasticsearch { ... parallelism = 2 } } LogicalVertex sourceVertex = new LogicalVertex( vertexId: 1, action: new SourceAction(jdbcSource), parallelism: 4 ); LogicalVertex transformVertex = new LogicalVertex( vertexId: 2, action: new TransformChainAction(sqlTransform), parallelism: 8 ); LogicalVertex sinkVertex = new LogicalVertex( vertexId: 3, action: new SinkAction(esSink), parallelism: 2 ); ``` ### 2.3 LogicalEdge Represents data flow between actions. ```java public class LogicalEdge { private final long inputVertexId; // Upstream vertex private final long targetVertexId; // Downstream vertex } ``` **Example**: ```java // Source → Transform edge LogicalEdge edge1 = new LogicalEdge( inputVertexId: 1, // JDBC Source targetVertexId: 2 // SQL Transform ); // Transform → Sink edge LogicalEdge edge2 = new LogicalEdge( inputVertexId: 2, // SQL Transform targetVertexId: 3 // Elasticsearch Sink ); ``` ### 2.4 LogicalDag Creation Built from user configuration: ```java // JobMaster creates LogicalDag LogicalDag logicalDag = LogicalDagGenerator.generate(jobConfig); ``` **Process**: 1. 
Parse HOCON config (source, transform, sink sections) 2. Create `Action` objects for each configured component 3. Infer data flow from config structure 4. Validate schema compatibility 5. Build `LogicalDag` object **Example Config → LogicalDag**: ```hocon env { parallelism = 4 } source { JDBC { url = "jdbc:mysql://..." query = "SELECT * FROM orders" } } transform { Sql { query = "SELECT order_id, SUM(amount) FROM this GROUP BY order_id" } } sink { Elasticsearch { hosts = ["es-host:9200"] index = "orders_summary" } } ``` Generated LogicalDag: ``` Vertex 1 (JDBC Source, parallelism=4) │ ▼ Vertex 2 (SQL Transform, parallelism=4) │ ▼ Vertex 3 (Elasticsearch Sink, parallelism=4) ``` ## 3. PhysicalPlan: Execution Strategy ### 3.1 Structure PhysicalPlan describes how to execute the LogicalDag on distributed workers. ```java public class PhysicalPlan { // Multiple pipelines (SubPlans) private final List pipelineList; // Immutable job information private final JobImmutableInformation jobImmutableInformation; // Distributed state (Hazelcast IMap) private final IMap runningJobStateIMap; private final IMap runningJobStateTimestampsIMap; // Job completion future private final CompletableFuture jobEndFuture; } ``` ### 3.2 Pipeline Splitting A LogicalDag is split into multiple **Pipelines** (SubPlans) by the current `PipelineGenerator` implementation: 1. **Unrelated Subgraphs**: Disconnected parts of the DAG become independent pipelines 2. **Multiple-Input Vertices**: If a connected subgraph contains a vertex with multiple upstream inputs, the generator splits the subgraph into multiple linear pipelines along each source→sink path and clones vertices where needed **Note**: Multiple sinks (branching) do not necessarily create multiple pipelines. When there is no multiple-input vertex, a branching graph is usually kept as a single pipeline. 
**Example 1: Simple Linear Pipeline**: ```hocon source { JDBC { } } transform { Sql { } } sink { Elasticsearch { } } ``` Generated: **1 Pipeline** ``` Pipeline 1: [JDBC Source] → [SQL Transform] → [Elasticsearch Sink] ``` **Example 2: Multiple Sources**: ```hocon source { JDBC { plugin_output = "orders" } Kafka { plugin_output = "events" } } transform { Sql { query = "SELECT * FROM orders UNION SELECT * FROM events" } } sink { Elasticsearch { } } ``` Generated: **2 Pipelines** ``` Pipeline 1: [JDBC Source] → [SQL Transform] → [Elasticsearch Sink] Pipeline 2: [Kafka Source] → [SQL Transform] → [Elasticsearch Sink] ``` **Example 3: Multiple Sinks**: ```hocon source { MySQL-CDC { } } sink { Elasticsearch { plugin_input = "MySQL-CDC" } JDBC { plugin_input = "MySQL-CDC" } } ``` Generated: **1 Pipeline** ``` Pipeline 1: [MySQL-CDC Source] → ([Elasticsearch Sink], [JDBC Sink]) ``` ### 3.3 PhysicalPlan Generation ```java // In JobMaster PhysicalPlan physicalPlan = new PhysicalPlanGenerator(logicalDag, resourceManager) .generate(); ``` **Steps**: 1. **Analyze LogicalDag**: Identify sources, sinks, and dependencies 2. **Split into Pipelines**: Create SubPlan for each pipeline 3. **Generate PhysicalVertices**: Create parallel instances for each action 4. **Allocate Resources**: Request slots from ResourceManager 5. **Assign Tasks**: Map PhysicalVertices to slots 6. **Create Coordinators**: Setup CheckpointCoordinator per pipeline ## 4. SubPlan (Pipeline) ### 4.1 Structure SubPlan represents an independently executing pipeline. 
```java public class SubPlan { private final int pipelineId; private final PipelineLocation pipelineLocation; // All task instances in this pipeline private final List physicalVertexList; // Coordinator tasks (Enumerator, Committer) private final List coordinatorVertexList; // Checkpoint coordinator for this pipeline private final CheckpointCoordinator checkpointCoordinator; // Execution state private PipelineStatus pipelineStatus; } ``` ### 4.2 PhysicalVertex List Each LogicalVertex with parallelism N generates N PhysicalVertices. **Example**: ``` LogicalVertex: JDBC Source (parallelism = 4) ↓ PhysicalVertices: - PhysicalVertex (subtask 0, slot 1) - PhysicalVertex (subtask 1, slot 2) - PhysicalVertex (subtask 2, slot 3) - PhysicalVertex (subtask 3, slot 4) ``` ### 4.3 Coordinator Vertices Special vertices for coordination tasks: - **SourceSplitEnumerator**: Runs on master, assigns splits to readers - **SinkCommitter**: Runs on master, coordinates commits - **SinkAggregatedCommitter**: Runs on master, global commit coordination **Example**: ``` SubPlan for JDBC → Transform → Elasticsearch: physicalVertexList: - JdbcSourceTask (4 instances) - TransformTask (4 instances) - ElasticsearchSinkTask (4 instances) coordinatorVertexList: - JdbcSourceSplitEnumerator (1 instance, master) - ElasticsearchSinkCommitter (1 instance, master) ``` ### 4.4 Independent Checkpointing Each pipeline has its own `CheckpointCoordinator`: **Benefits**: - Independent checkpoint intervals - Isolated failure domains - Reduced coordination overhead - Simpler barrier alignment **Example**: ``` Pipeline 1 (JDBC → ES): CheckpointCoordinator triggers every 60s Manages checkpoints for JDBC and ES tasks only Pipeline 2 (Kafka → JDBC): CheckpointCoordinator triggers every 30s (different interval) Manages checkpoints for Kafka and JDBC tasks only ``` ## 5. PhysicalVertex: Deployed Task ### 5.1 Structure PhysicalVertex represents a deployed task instance. 
```java public class PhysicalVertex { private final TaskGroupLocation taskGroupLocation; private final TaskGroupDefaultImpl taskGroup; // Assigned resource slot private final SlotProfile slotProfile; // Execution state (CREATED, RUNNING, FAILED, etc.) private ExecutionState currentExecutionState; // Plugin jars (for class loader isolation) private final List<Set<URL>> pluginJarsUrls; } ``` ### 5.2 TaskGroup: Task Fusion Multiple tasks can be fused into a single `TaskGroup` for efficiency. ```java public class TaskGroupDefaultImpl implements TaskGroup { private final TaskGroupLocation taskGroupLocation; // Multiple tasks in this group private final Set tasks; // Shared thread pool private final ExecutorService executorService; // Shared network buffers private final Map<Long, BlockingQueue<Record<?>>> internalChannels; } ``` **Fusion Conditions**: 1. Same parallelism 2. Sequential dependency (A → B) 3. No data shuffle required **Example (with fusion)**: ``` LogicalDag: Source (parallelism=4) → Transform (parallelism=4) → Sink (parallelism=4) Without Fusion: 12 separate tasks (4 + 4 + 4) Network overhead for Source → Transform and Transform → Sink With Fusion: 4 TaskGroups, each containing: [SourceTask → TransformTask → SinkTask] (single thread, shared memory) ``` **Benefits**: - Reduced network serialization/deserialization - Better CPU cache locality - Lower memory footprint - Simplified deployment ### 5.3 Slot Assignment Each PhysicalVertex is assigned a `SlotProfile`: ```java public class SlotProfile { private final int slotID; private final Address worker; private final ResourceProfile resourceProfile; // CPU, memory } ``` **Assignment Process**: 1. JobMaster requests slots from ResourceManager 2. ResourceManager selects workers based on strategy (random, slot ratio, load) 3. ResourceManager allocates slots and returns SlotProfiles 4. JobMaster assigns SlotProfiles to PhysicalVertices 5. JobMaster deploys tasks via `DeployTaskOperation` ## 6. 
Task Deployment and Execution ### 6.1 Deployment Flow ```mermaid sequenceDiagram participant JM as JobMaster participant RM as ResourceManager participant Worker as Worker Node participant Task as SeaTunnelTask JM->>JM: Generate PhysicalPlan JM->>RM: applyResources(resourceProfiles) RM->>RM: Allocate slots RM-->>JM: Return SlotProfiles JM->>JM: Assign slots to PhysicalVertices loop For each PhysicalVertex JM->>Worker: DeployTaskOperation(taskGroup) Worker->>Task: Create SeaTunnelTask Task->>Task: INIT → WAITING_RESTORE Task->>JM: Report ready end JM->>Worker: Start execution Worker->>Task: READY_START → STARTING → RUNNING ``` ### 6.2 Task Execution Each `SeaTunnelTask` executes its assigned action: **SourceSeaTunnelTask**: ```java while (isRunning()) { // Poll data from SourceReader sourceReader.pollNext(collector); // Handle checkpoint barriers if (checkpointTriggered) { triggerBarrier(checkpointId); } } ``` **TransformSeaTunnelTask**: ```java while (isRunning()) { // Read from input queue Record record = inputQueue.take(); // Apply transform Record transformed = transform.map(record); // Write to output queue outputQueue.put(transformed); } ``` **SinkSeaTunnelTask**: ```java while (isRunning()) { // Read from input queue Record record = inputQueue.take(); // Write to sink sinkWriter.write(record); // Handle checkpoint barriers if (barrierReceived) { commitInfo = sinkWriter.prepareCommit(checkpointId); snapshotState(checkpointId); } } ``` ## 7. Optimization Strategies ### 7.1 Task Fusion **When to Fuse**: - Same parallelism - Sequential operators (no branching) - No shuffle boundary **When NOT to Fuse**: - Different parallelism (e.g., source=4, sink=8) - Branching DAG (one source, multiple sinks) - Shuffle required (e.g., GROUP BY, JOIN) Task fusion behavior and controls are engine-implementation specific. Avoid relying on undocumented `env.job.mode` values in architecture examples. 
### 7.2 Parallelism Inference Parallelism resolution (SeaTunnel Engine / Zeta): - If an action/connector config specifies `parallelism`, it takes precedence - Otherwise a source uses `env.parallelism` (default is `1`), while a transform or sink inherits the parallelism of its upstream action **Example**: ```hocon env { parallelism = 1 } source { JDBC { parallelism = 4 } # Explicit } transform { Sql { } # Inferred: 4 (from source) } sink { Elasticsearch { } # Inferred: 4 (from transform) } ``` ### 7.3 Resource Allocation **Slot Calculation**: ``` Required Slots = Sum of all task parallelism Example: Source (parallelism=4) + Transform (parallelism=4) + Sink (parallelism=2) = 10 slots required With Fusion: TaskGroup (parallelism=4, fusion[Source+Transform]) + Sink (parallelism=2) = 6 slots required ``` **Resource Profile**: ```java ResourceProfile profile = new ResourceProfile( CPU.of(1), // 1 CPU core Memory.of(512 * 1024 * 1024L) // 512MB heap (bytes) ); ``` ## 8. Failure Handling ### 8.1 Task Failure **Detection**: - Task throws exception - Heartbeat timeout **Recovery**: 1. Mark task as FAILED 2. Fail entire pipeline (conservative) 3. Restore from latest checkpoint 4. Reallocate resources 5. Redeploy and restart pipeline ### 8.2 Pipeline Failure Isolation **Key Insight**: Pipeline failures are isolated. **Example**: ``` Job with 2 pipelines: Pipeline 1: JDBC → ES (RUNNING) Pipeline 2: Kafka → JDBC (FAILED) Result: Pipeline 2 restarts from checkpoint Pipeline 1 continues unaffected ``` **Benefits**: - Reduced blast radius - Faster recovery (only failed pipeline) - Better resource utilization ## 9. 
Monitoring and Observability ### 9.1 Key Metrics **Pipeline-Level**: - Pipeline status and lifecycle transitions (CREATED / RUNNING / FINISHED / FAILED) - Task counts and placement across workers/slots - Checkpoint progress (latest checkpoint id, duration, failures) **Task-Level**: - Task status and restart counters - Record/byte throughput (in/out) - Backpressure / queueing indicators (engine-dependent) ### 9.2 Visualization ``` Job: mysql-to-es │ ├── Pipeline 1 (mysql-cdc → elasticsearch) │ ├── PhysicalVertex 0 [RUNNING] @ worker-1:slot-1 │ ├── PhysicalVertex 1 [RUNNING] @ worker-2:slot-1 │ ├── PhysicalVertex 2 [RUNNING] @ worker-3:slot-1 │ └── PhysicalVertex 3 [RUNNING] @ worker-4:slot-1 │ └── Pipeline 2 (mysql-cdc → jdbc) ├── PhysicalVertex 0 [RUNNING] @ worker-1:slot-2 └── PhysicalVertex 1 [RUNNING] @ worker-2:slot-2 ``` ## 10. Best Practices ### 10.1 Parallelism Configuration **Rule of Thumb**: ``` Parallelism = min( data partitions, available slots, target throughput / single-task throughput ) ``` **Examples**: - **JDBC Source**: Set to number of DB partitions (e.g., 8 partitions → parallelism=8) - **Kafka Source**: Set to number of partitions (e.g., 32 partitions → parallelism=32) - **File Source**: Set to number of files or file splits - **CPU-Intensive Transform**: Set to number of CPU cores - **I/O-Intensive Sink**: Set based on target system capacity ### 10.2 Pipeline Design **Keep Pipelines Simple**: - Prefer linear pipelines (Source → Transform → Sink) - Avoid complex branching when possible - Use multiple jobs for completely independent workflows **Use Multiple Jobs When**: - Different checkpoint intervals needed - Different resource requirements - Independent failure domains desired ### 10.3 Troubleshooting **Problem**: Tasks not starting **Check**: 1. Enough available slots? (`required_slots <= available_slots`) 2. Resource profile reasonable? (not requesting 100 CPU cores) 3. Tag filters correct? 
(if using tag-based assignment) **Problem**: Low throughput **Check**: 1. Parallelism too low? (increase parallelism) 2. Task fusion disabled? (enable for better performance) 3. Checkpoint interval too short? (increase interval) ## 11. Related Resources - [Engine Architecture](engine-architecture.md) - [Resource Management](resource-management.md) - [Checkpoint Mechanism](../fault-tolerance/checkpoint-mechanism.md) - [Architecture Overview](../overview.md) ## 12. References ### Key Source Files - [LogicalDag.java](../../../seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/dag/logical/LogicalDag.java) - [PhysicalPlan.java](../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/PhysicalPlan.java) - [SubPlan.java](../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/SubPlan.java) - [PhysicalVertex.java](../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/PhysicalVertex.java) - [TaskGroupDefaultImpl.java](../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/task/group/TaskGroupDefaultImpl.java) ### Further Reading - [Google Borg Paper](https://research.google/pubs/pub43438/) - Task scheduling inspiration - [Apache Flink JobGraph](https://nightlies.apache.org/flink/flink-docs-stable/docs/internals/job_scheduling/) - [Spark DAG Scheduler](https://spark.apache.org/docs/latest/job-scheduling.html) ================================================ FILE: docs/en/architecture/engine/engine-architecture.md ================================================ --- sidebar_position: 1 title: Engine Architecture --- # SeaTunnel Engine (Zeta) Architecture ## 1. 
Overview ### 1.1 Problem Background Data integration engines must solve fundamental distributed systems challenges: - **Distributed Execution**: How to execute jobs across multiple machines? - **Resource Management**: How to allocate and schedule tasks efficiently? - **Fault Tolerance**: How to recover from worker/master failures? - **Coordination**: How to synchronize distributed tasks (checkpoints, commits)? - **Scalability**: How to handle increasing workloads? ### 1.2 Design Goals SeaTunnel Engine (Zeta) is designed as a native execution engine with: 1. **Lightweight**: Minimal dependencies, fast startup, low resource overhead 2. **High Performance**: Optimized for data synchronization workloads 3. **Fault Tolerance**: Checkpoint-based recovery with exactly-once semantics 4. **Resource Efficiency**: Slot-based resource management with fine-grained control 5. **Engine Independence**: Supports same connector API as Flink/Spark translations ### 1.3 Architecture Comparison | Feature | SeaTunnel Zeta | Apache Flink | Apache Spark | |---------|---------------|--------------|--------------| | **Primary Use Case** | Data sync, CDC | Stream processing | Batch + ML | | **Resource Model** | Slot-based | Slot-based | Executor-based | | **State Backend** | Pluggable (HDFS/S3/Local) | RocksDB/Heap | In-memory/Disk | | **Checkpoint** | Distributed snapshots | Chandy-Lamport | RDD lineage | | **Operational Complexity** | Lower (engine-native) | Higher | Higher | ## 2. 
Overall Architecture ### 2.1 Master-Worker Architecture ``` ┌─────────────────────────────────────────────────────────────────┐ │ Master Node │ │ │ │ ┌───────────────────────────────────────────────────────┐ │ │ │ CoordinatorService │ │ │ │ • Manages all running jobs │ │ │ │ • Job submission and lifecycle management │ │ │ │ • Maintains job state (IMap) │ │ │ │ • Resource manager factory │ │ │ └───────────────────────────────────────────────────────┘ │ │ │ │ ┌───────────────────────────────────────────────────────┐ │ │ │ JobMaster (one per job) │ │ │ │ • Generates physical execution plan │ │ │ │ • Requests resources from ResourceManager │ │ │ │ • Deploys tasks to workers │ │ │ │ • Coordinates checkpoints │ │ │ │ • Handles failover and recovery │ │ │ └───────────────────────────────────────────────────────┘ │ │ │ │ │ │ │ (Task Deploy) │ (Resource Request) │ │ ▼ ▼ │ │ ┌─────────────────┐ ┌────────────────────────────┐ │ │ │ CheckpointManager│ │ ResourceManager │ │ │ │ (per pipeline) │ │ • Slot allocation │ │ │ └─────────────────┘ │ • Worker registration │ │ │ │ • Load balancing │ │ │ └────────────────────────────┘ │ └─────────────────────────────────────────────────────────────────┘ │ │ (Hazelcast Cluster) ▼ ┌─────────────────────────────────────────────────────────────────┐ │ Worker Nodes │ │ │ │ ┌───────────────────────────────────────────────────────┐ │ │ │ TaskExecutionService │ │ │ │ • Deploys and executes tasks │ │ │ │ • Manages task lifecycle │ │ │ │ • Reports heartbeat │ │ │ │ • Slot resource management │ │ │ └───────────────────────────────────────────────────────┘ │ │ │ │ │ ▼ │ │ ┌───────────────────────────────────────────────────────┐ │ │ │ SeaTunnelTask (multiple per worker) │ │ │ │ │ │ │ │ ┌─────────────────────────────────────────────┐ │ │ │ │ │ SourceFlowLifeCycle │ │ │ │ │ │ • SourceReader │ │ │ │ │ │ • SeaTunnelSourceCollector │ │ │ │ │ └─────────────────────────────────────────────┘ │ │ │ │ │ │ │ │ │ ▼ │ │ │ │ 
┌─────────────────────────────────────────────┐ │ │ │ │ │ TransformFlowLifeCycle │ │ │ │ │ │ • Transform chain │ │ │ │ │ └─────────────────────────────────────────────┘ │ │ │ │ │ │ │ │ │ ▼ │ │ │ │ ┌─────────────────────────────────────────────┐ │ │ │ │ │ SinkFlowLifeCycle │ │ │ │ │ │ • SinkWriter │ │ │ │ │ └─────────────────────────────────────────────┘ │ │ │ └───────────────────────────────────────────────────────┘ │ └─────────────────────────────────────────────────────────────────┘ ``` ### 2.2 Core Components #### CoordinatorService Centralized service managing all jobs in the cluster. **Responsibilities**: - Accept job submissions - Create JobMaster for each job - Maintain job state in distributed IMap - Provide job query and management APIs - Handle job lifecycle events **Key Data Structures**: ```java // Running job state (distributed IMap backed by Hazelcast) IMap runningJobInfoIMap; IMap runningJobStateIMap; IMap runningJobStateTimestampsIMap; // Completed job history IMap completedJobInfoIMap; ``` **Code Reference**: - [CoordinatorService.java](../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/CoordinatorService.java) #### JobMaster Manages single job execution lifecycle. **Responsibilities**: - Parse configuration → generate LogicalDag - Generate PhysicalPlan from LogicalDag - Request resources (slots) from ResourceManager - Deploy tasks to workers - Coordinate pipeline checkpoints - Handle task failures and reschedule **Lifecycle**: ``` Created → Initialized → Scheduled → Running → Finished/Failed/Canceled ``` **Key Operations**: 1. `init()`: Generate physical plan, create checkpoint coordinators 2. `run()`: Request resources, deploy tasks, start execution 3. 
`handleFailure()`: Restart failed tasks, restore from checkpoint **Code Reference**: - [JobMaster.java](../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/master/JobMaster.java) #### ResourceManager Manages worker resources and slot allocation. **Responsibilities**: - Track worker registration and heartbeat - Maintain worker resource profiles (CPU, memory) - Allocate slots based on strategies (random, slot ratio, load-based) - Release slots after task completion - Handle worker failures **Slot Allocation Strategies**: ```java // 1. Random: Random selection among available workers // 2. SlotRatio: Prefer workers with more available slots // 3. SystemLoad: Prefer workers with lower CPU/memory usage ``` **Code Reference**: - [ResourceManager.java](../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/resourcemanager/ResourceManager.java) - [AbstractResourceManager.java](../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/resourcemanager/AbstractResourceManager.java) ## 3. 
DAG Execution Model ### 3.1 Execution Plan Transformation ``` User Config (HOCON) │ ▼ ┌───────────────┐ │ LogicalDag │ • Logical vertices (Source/Transform/Sink) │ │ • Logical edges (data flow) │ │ • Parallelism (per vertex) └───────────────┘ │ (JobMaster.generatePhysicalPlan()) ▼ ┌───────────────┐ │ PhysicalPlan │ • List of SubPlan (pipelines) │ │ • JobImmutableInformation │ │ • Resource requirements └───────────────┘ │ ▼ ┌───────────────┐ │ SubPlan │ • Pipeline (independent execution unit) │ (Pipeline) │ • List of PhysicalVertex │ │ • CheckpointCoordinator └───────────────┘ │ ▼ ┌───────────────┐ │PhysicalVertex │ • TaskGroup (co-located tasks) │ │ • Assigned SlotProfile │ │ • ExecutionState └───────────────┘ │ ▼ ┌───────────────┐ │ TaskGroup │ • Multiple SeaTunnelTask instances │ │ • Shared network buffer │ │ • Thread pool └───────────────┘ │ ▼ ┌───────────────┐ │ SeaTunnelTask │ • Single task execution │ │ • Source/Transform/Sink lifecycle │ │ • Task state machine └───────────────┘ ``` ### 3.2 LogicalDag Represents the user's intent in an engine-independent way. ```java public class LogicalDag { private final Map<Long, LogicalVertex> logicalVertexMap; private final Set<LogicalEdge> edges; private final JobConfig jobConfig; } public class LogicalVertex { private final long vertexId; private final Action action; // SourceAction / TransformChainAction / SinkAction private final int parallelism; } public class LogicalEdge { private final long inputVertexId; private final long targetVertexId; } ``` **Creation**: ```java // From user config LogicalDag logicalDag = LogicalDagGenerator.generate(jobConfig); ``` ### 3.3 PhysicalPlan Represents the actual execution plan with resource allocation. 
```java public class PhysicalPlan { private final List pipelineList; private final JobImmutableInformation jobImmutableInformation; private final CompletableFuture jobEndFuture; } public class SubPlan { private final int pipelineId; private final List physicalVertexList; private final List coordinatorVertexList; private final CheckpointCoordinator checkpointCoordinator; } public class PhysicalVertex { private final TaskGroupLocation taskGroupLocation; private final TaskGroupDefaultImpl taskGroup; private final SlotProfile slotProfile; // Assigned slot private final ExecutionState currentExecutionState; } ``` **Generation**: ```java PhysicalPlan physicalPlan = jobMaster.getPhysicalPlan(); // JobMaster internally: // 1. Split LogicalDag into pipelines // 2. Generate PhysicalVertex for each parallel instance // 3. Create CheckpointCoordinator per pipeline ``` ### 3.4 Pipeline Execution Jobs are divided into **Pipelines** (SubPlans) for independent execution: **Example**: ```hocon # Config with multiple sources/sinks env { ... } source { MySQL-CDC { table = "orders" } Kafka { topic = "events" } } transform { Sql { query = "SELECT * FROM orders JOIN events ON ..." } } sink { Elasticsearch { index = "orders" } JDBC { table = "events" } } ``` **Generated Pipelines**: ``` Pipeline 1: MySQL-CDC → Transform → Elasticsearch Pipeline 2: Kafka → Transform → JDBC ``` **Benefits**: - Independent checkpoint coordination - Isolated failure domains - Parallel pipeline execution ### 3.5 Task Fusion Multiple actions can be fused into single TaskGroup for efficiency: ``` Without Fusion: [Source Task] → Network → [Transform Task] → Network → [Sink Task] With Fusion: [TaskGroup: Source → Transform → Sink] (single thread, no network) ``` **Fusion Conditions**: - Same parallelism - Sequential dependency - No shuffle required ## 4. 
Task Lifecycle ### 4.1 Task State Machine ``` [Created] │ ▼ [INIT] ────────────────────────────────────┐ │ │ ▼ │ [WAITING_RESTORE] (if recovering) │ │ │ ▼ │ [READY_START] │ │ │ ▼ │ [STARTING] ──────────────┐ │ │ │ │ ▼ ▼ ▼ [RUNNING] ──────────> [FAILED] ─────> (Restart) │ ▼ [PREPARE_CLOSE] │ ▼ [CLOSED] │ ▼ [CANCELED] (if job canceled) ``` **State Transitions**: 1. **CREATED → INIT**: Task created, initializing resources 2. **INIT → WAITING_RESTORE**: Recovering from checkpoint 3. **WAITING_RESTORE → READY_START**: State restored 4. **READY_START → STARTING**: Opening Source/Transform/Sink 5. **STARTING → RUNNING**: Data processing started 6. **RUNNING → PREPARE_CLOSE**: Normal completion 7. **PREPARE_CLOSE → CLOSED**: Resources cleaned up 8. **RUNNING → FAILED**: Exception occurred ### 4.2 SeaTunnelTask Execution ```java public abstract class SeaTunnelTask implements Runnable { private final TaskLocation taskLocation; private final TaskExecutionContext executionContext; private ExecutionState executionState; @Override public void run() { try { init(); restoreState(); // If recovering open(); while (isRunning()) { processData(); // Source: read, Transform: process, Sink: write handleBarrier(); // Checkpoint barriers } close(); } catch (Exception e) { handleException(e); } } } ``` **Task Types**: - **SourceSeaTunnelTask**: Runs SourceReader, emits data - **SinkSeaTunnelTask**: Runs SinkWriter, consumes data - **TransformSeaTunnelTask**: Runs Transform chain ### 4.3 FlowLifeCycle Management Each task manages component lifecycle through FlowLifeCycle: ```java // Source task public class SourceFlowLifeCycle implements FlowLifeCycle { private final SourceReader sourceReader; private final SeaTunnelSourceCollector collector; @Override public void open() { sourceReader.open(); } @Override public void collect() { sourceReader.pollNext(collector); // Read data } @Override public void close() { sourceReader.close(); } } // Sink task public class SinkFlowLifeCycle implements 
FlowLifeCycle { private final SinkWriter sinkWriter; @Override public void collect() { T record = inputQueue.poll(); sinkWriter.write(record); // Write data } } ``` ## 5. Checkpoint Coordination ### 5.1 CheckpointCoordinator (per Pipeline) Each pipeline has independent checkpoint coordinator. **Responsibilities**: - Trigger checkpoint periodically - Inject checkpoint barriers into data flow - Collect task acknowledgements - Persist completed checkpoints - Clean up old checkpoints **Key Data Structures**: ```java public class CheckpointCoordinator { private final CheckpointIDCounter checkpointIdCounter; private final Map pendingCheckpoints; private final ArrayDeque completedCheckpointIds; private final CheckpointStorage checkpointStorage; } ``` **Checkpoint Flow**: 1. Coordinator triggers checkpoint (periodic or manual) 2. Send barriers to all source tasks in pipeline 3. Barriers propagate through data flow 4. Each task snapshots state upon receiving barrier 5. Tasks send ACK back to coordinator 6. Coordinator waits for all ACKs 7. Create CompletedCheckpoint, persist to storage **Code Reference**: - [CheckpointCoordinator.java](../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/CheckpointCoordinator.java) ### 5.2 Checkpoint Barrier Special control message that flows with data: ```java public class Barrier { private final long checkpointId; private final long timestamp; private final CheckpointType type; // CHECKPOINT or SAVEPOINT } ``` **Barrier Alignment**: - Tasks with multiple inputs wait for barrier from ALL inputs before snapshotting - Ensures consistent snapshot across distributed tasks ## 6. 
Resource Management ### 6.1 Slot Model **SlotProfile**: ```java public class SlotProfile { private final int slotID; private final Address worker; private final ResourceProfile resourceProfile; // CPU, memory } public class ResourceProfile { private final CPU cpu; private final Memory heapMemory; } ``` **WorkerProfile**: ```java public class WorkerProfile { private final Address address; private final ResourceProfile profile; private final ResourceProfile unassignedResource; private final SlotProfile[] assignedSlots; private final SlotProfile[] unassignedSlots; private final Map attributes; } ``` ### 6.2 Resource Allocation Flow ```mermaid sequenceDiagram participant JM as JobMaster participant RM as ResourceManager participant Worker as Worker Node JM->>RM: applyResources(jobId, resourceProfiles) RM->>RM: Select workers (strategy) RM->>RM: Allocate slots RM->>JM: Return slot profiles JM->>Worker: Deploy task (DeployTaskOperation) Worker->>Worker: Create SeaTunnelTask Worker->>JM: ACK JM->>JM: Task running ``` ### 6.3 Tag-Based Slot Filtering Assign tasks to specific worker groups: ```hocon env { # Job-level worker attribute filter (key/value full match) tag_filter = { zone = "db-zone" } } ``` **Usage**: - Data locality (assign to workers near data source) - Resource isolation (GPU workers for ML transforms) - Multi-tenancy (different teams use different worker pools) ## 7. Failure Handling ### 7.1 Task Failure **Detection**: - Task reports exception to JobMaster - JobMaster monitors task heartbeat - Timeout triggers failure detection **Recovery**: 1. Mark task as FAILED 2. Release task's slot 3. Retrieve latest successful checkpoint 4. Restart task with restored state 5. Reassign splits (for Source tasks) ### 7.2 Worker Failure **Detection**: - ResourceManager monitors worker heartbeat - Hazelcast cluster detects member removal **Recovery**: 1. Mark all tasks on failed worker as FAILED 2. Trigger job failover 3. Restore from latest checkpoint 4. 
Reallocate slots on healthy workers 5. Redeploy tasks ### 7.3 Master Failure **High Availability**: - Multiple master nodes (Hazelcast cluster) - Job state stored in distributed IMap (replicated) - New master takes over from IMap state **Recovery**: 1. Detect master failure (Hazelcast) 2. Elect new master 3. New master reads job state from IMap 4. Reconnect to workers 5. Resume checkpoint coordination ## 8. Design Considerations ### 8.1 Why Pipeline-based Execution? **Alternative**: Single global DAG execution **Decision**: Divide into pipelines **Benefits**: - Independent checkpoint coordination (less coordination overhead) - Clear failure boundaries (one pipeline fails, others continue) - Easier to reason about data flow - Support complex DAGs (multiple sources/sinks) **Drawbacks**: - Cannot fuse tasks across pipeline boundaries - Potential data serialization between pipelines ### 8.2 Why Hazelcast for Coordination? **Alternative**: Zookeeper, etcd, custom Raft implementation **Decision**: Hazelcast IMDG **Benefits**: - In-memory distributed data structures (low latency) - Built-in cluster management and failure detection - Easy to embed (no external dependencies) - Familiar API (Java Collections) **Drawbacks**: - Memory overhead for large state - Less battle-tested than Zookeeper for coordination ### 8.3 Performance Optimizations **1. Task Fusion**: - Reduce network overhead - Improve CPU cache locality - Lower serialization cost **2. Async Checkpoint**: - Checkpoint upload doesn't block data processing - Parallel checkpoint across tasks **3. Incremental Checkpoint**: - Only upload changed state (future enhancement) **4. Zero-Copy Data Transfer**: - Shared memory between co-located tasks - Avoid unnecessary serialization ## 9. 
Related Resources - [Architecture Overview](../overview.md) - [Design Philosophy](../design-philosophy.md) - [Checkpoint Mechanism](../fault-tolerance/checkpoint-mechanism.md) - [Resource Management](resource-management.md) - [DAG Execution](dag-execution.md) ## 10. References ### Key Source Files - Engine Core: `seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/` - DAG: `seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/dag/` - Checkpoint: `seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/` ### Further Reading - [Hazelcast IMDG](https://docs.hazelcast.com/imdg/latest/) - [Google Borg Paper](https://research.google/pubs/pub43438/) - Inspiration for resource management - [Apache Flink Architecture](https://flink.apache.org/flink-architecture.html) ================================================ FILE: docs/en/architecture/engine/resource-management.md ================================================ --- sidebar_position: 3 title: Resource Management --- # Resource Management ## 1. Overview ### 1.1 Problem Background Distributed execution engines must efficiently manage computing resources: - **Resource Allocation**: How to assign tasks to workers fairly and efficiently? - **Load Balancing**: How to distribute workload evenly across workers? - **Resource Isolation**: How to prevent resource contention between jobs? - **Dynamic Scaling**: How to add/remove workers without disrupting jobs? - **Heterogeneous Resources**: How to handle workers with different capabilities? ### 1.2 Design Goals SeaTunnel's resource management system aims to: 1. **Fine-Grained Control**: Slot-based allocation for precise resource management 2. **Flexible Strategies**: Multiple allocation strategies for different scenarios 3. **Tag-Based Filtering**: Assign tasks to specific worker groups 4. 
**High Availability**: Tolerate worker failures with automatic reassignment 5. **Observability**: Track resource usage and availability in real-time ### 1.3 Architecture Overview ``` ┌──────────────────────────────────────────────────────────────┐ │ JobMaster │ │ │ │ ┌────────────────────────────────────────────────────┐ │ │ │ Request Resources │ │ │ │ • Calculate required slots │ │ │ │ • Specify resource profiles (CPU, memory) │ │ │ │ • Apply tag filters (optional) │ │ │ └────────────────────────────────────────────────────┘ │ └──────────────────────────────┬───────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────┐ │ ResourceManager │ │ │ │ ┌────────────────────────────────────────────────────┐ │ │ │ Worker Registry │ │ │ │ • WorkerProfile (per worker) │ │ │ │ - Total resources │ │ │ │ - Available resources │ │ │ │ - Assigned slots │ │ │ │ - Unassigned slots │ │ │ └────────────────────────────────────────────────────┘ │ │ │ │ ┌────────────────────────────────────────────────────┐ │ │ │ Allocation Strategies │ │ │ │ • RandomStrategy / SlotRatioStrategy / SystemLoadStrategy │ │ └────────────────────────────────────────────────────┘ │ │ │ │ ┌────────────────────────────────────────────────────┐ │ │ │ Slot Management │ │ │ │ • Allocate slots │ │ │ │ • Release slots │ │ │ │ • Track slot usage │ │ │ └────────────────────────────────────────────────────┘ │ └──────────────────────────────┬───────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────┐ │ Worker Nodes │ │ │ │ Worker 1 Worker 2 Worker N │ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ │ Slot 1 │ │ Slot 1 │ │ Slot 1 │ │ │ │ Slot 2 │ │ Slot 2 │ │ Slot 2 │ │ │ │ ... │ │ ... │ │ ... │ │ │ └──────────┘ └──────────┘ └──────────┘ │ └──────────────────────────────────────────────────────────────┘ ``` ## 2. Core Concepts ### 2.1 Slot A **Slot** is the fundamental unit of resource allocation. 
```java public class SlotProfile { // Unique slot identifier private final int slotID; // Worker address where this slot resides private final Address worker; // Resource capacity of this slot private final ResourceProfile resourceProfile; } ``` **Key Properties**: - **Granular**: Each slot can host one or more tasks (task fusion) - **Typed**: Slots have resource profiles (CPU, memory) - **Stateful**: Slots track assignment status (assigned/unassigned) **Example**: ```java SlotProfile slot = new SlotProfile( new Address("worker-1", 5801), 1001, new ResourceProfile(CPU.of(1), Memory.of(512 * 1024 * 1024L)), "seq-1" ); ``` ### 2.2 ResourceProfile Describes resource requirements or capacity. ```java public class ResourceProfile { private final CPU cpu; private final Memory heapMemory; } public class CPU { private final int core; // Number of CPU cores } public class Memory { private final long bytes; // Heap memory in bytes } ``` **Usage**: - **Task Requirements**: JobMaster specifies required resources per task - **Slot Capacity**: Each slot advertises its available resources - **Matching**: ResourceManager matches task requirements to slot capacity ### 2.3 WorkerProfile Represents a worker node's resources and slot inventory. ```java public class WorkerProfile { // Worker address private final Address address; // Total resources (all slots combined) private final ResourceProfile profile; // Currently available resources private final ResourceProfile unassignedResource; // Slots assigned to jobs private final SlotProfile[] assignedSlots; // Slots available for assignment private final SlotProfile[] unassignedSlots; // Worker attributes (used by job-level tag_filter) private final Map attributes; // Optional system load info (for SystemLoadStrategy) private final SystemLoadInfo systemLoadInfo; } ``` **Lifecycle**: 1. **Registration**: Worker registers with ResourceManager on startup 2. **Heartbeat**: Worker sends periodic heartbeats with updated resource info 3. 
**Allocation**: ResourceManager assigns slots from unassigned pool 4. **Release**: Completed tasks free slots, moving them back to unassigned pool 5. **Deregistration**: Worker leaves cluster (graceful or failure) ## 3. Resource Manager ### 3.1 Interface ```java public interface ResourceManager { /** * Apply for resources (called by JobMaster) */ CompletableFuture> applyResources( long jobId, List resourceProfiles, Map tagFilter ) throws NoEnoughResourceException; /** * Release resources (called by JobMaster after task completion) */ CompletableFuture releaseResources(long jobId, List slots); /** * Worker heartbeat (called by TaskExecutionService) */ void heartbeat(WorkerProfile workerProfile); /** * Handle worker removal (failure or graceful shutdown) */ void memberRemoved(MembershipServiceEvent event); } ``` ### 3.2 Implementation: AbstractResourceManager ```java public abstract class AbstractResourceManager implements ResourceManager { // Registered workers protected final ConcurrentMap registerWorker; // Worker selection strategy (RandomStrategy / SlotRatioStrategy / SystemLoadStrategy) protected final SlotAllocationStrategy slotAllocationStrategy; @Override public CompletableFuture> applyResources( long jobId, List resourceProfiles, Map tagFilter ) throws NoEnoughResourceException { // 1. Filter workers by tagFilter (match worker attributes) Map candidates = filterWorkerByTag(tagFilter); // 2. For each requested profile, select a worker by strategy and pick an unassigned slot // (actual slot selection/marking is implementation-defined) return requestSlots(jobId, resourceProfiles, candidates, slotAllocationStrategy); } } ``` ## 4. Slot Allocation Strategies In SeaTunnel Engine / Zeta, allocation typically consists of: 1. Select a candidate worker (strategy) 2. Pick an unassigned slot from that worker ### 4.1 RandomStrategy Randomly selects a worker from the available candidates. 
```java public class RandomStrategy implements SlotAllocationStrategy { @Override public Optional selectWorker(List availableWorkers) { Collections.shuffle(availableWorkers); return availableWorkers.stream().findFirst(); } } ``` ### 4.2 SlotRatioStrategy Selects the worker with the lowest slot usage ratio (prefers workers with more available slots). ### 4.3 SystemLoadStrategy Selects the worker with the lowest system load (based on heartbeat-reported load information). ## 5. Tag-Based Slot Filtering ### 5.1 Use Cases **Data Locality**: ```hocon env { # Job-level worker attribute filter (full key/value match) tag_filter = { zone = "us-west-1" } } ``` **Resource Specialization**: ```hocon env { tag_filter = { resource = "gpu" } } ``` **Multi-Tenancy**: ```hocon env { job.name = "tenant-a-job" tag_filter = { tenant = "a" } } ``` ### 5.2 Matching Semantics The engine matches `env.tag_filter` against worker `attributes` (key/value full match). If no worker matches, resource allocation fails. ## 6. 
Resource Allocation Flow ### 6.1 Normal Allocation ```mermaid sequenceDiagram participant JM as JobMaster participant RM as ResourceManager participant Worker as Worker Node JM->>JM: Generate PhysicalPlan JM->>JM: Calculate required resources JM->>RM: applyResources(profiles, tags) RM->>RM: Filter workers by tags RM->>RM: Select workers by strategy RM->>RM: Allocate slots RM-->>JM: Return SlotProfiles JM->>JM: Assign slots to PhysicalVertices loop For each task JM->>Worker: DeployTaskOperation(task, slot) Worker->>Worker: Execute task in slot Worker-->>JM: ACK end ``` ### 6.2 Insufficient Resources ```mermaid sequenceDiagram participant JM as JobMaster participant RM as ResourceManager JM->>RM: applyResources(100 slots) RM->>RM: Check available slots Note over RM: Only 50 slots available RM-->>JM: NoEnoughResourceException JM->>JM: Retry with backoff Note over JM: Wait for resources to free up JM->>RM: applyResources(100 slots) RM-->>JM: Success (after resources freed) ``` ### 6.3 Resource Release ```mermaid sequenceDiagram participant Task as SeaTunnelTask participant JM as JobMaster participant RM as ResourceManager Task->>Task: Task completes/fails Task->>JM: Task finished JM->>RM: releaseResources(slots) RM->>RM: Mark slots as unassigned RM->>RM: Update WorkerProfile Note over RM: Slots available for
new allocations ``` ## 7. Failure Handling ### 7.1 Worker Failure **Detection**: - Heartbeat timeout (default: 60 seconds) - Hazelcast member removed event **Recovery**: ```java @Override public void memberRemoved(MembershipServiceEvent event) { Address failedWorker = event.getMember().getAddress(); // 1. Remove worker from registry WorkerProfile failed = registerWorker.remove(failedWorker); // 2. Notify JobMasters of slot losses List lostSlots = failed.getAssignedSlots(); for (SlotProfile slot : lostSlots) { long jobId = getJobIdForSlot(slot); JobMaster jobMaster = getJobMaster(jobId); // 3. Trigger job failover jobMaster.notifySlotLost(slot); } } ``` **JobMaster Response**: 1. Mark tasks on failed slots as FAILED 2. Restore from latest checkpoint 3. Request new slots from ResourceManager 4. Redeploy tasks ### 7.2 ResourceManager Failure **High Availability**: - ResourceManager is stateless (the worker registry is rebuilt from heartbeats) - New ResourceManager instance starts on master failover - Workers re-register via heartbeat mechanism **Recovery**: - Worker liveness is determined by heartbeat updates and cluster membership events (exact timeout/threshold is implementation/config-dependent) ## 8. Configuration ### 8.1 Slot Configuration Example (`config/seatunnel.yaml`, SeaTunnel Engine / Zeta): ```yaml seatunnel: engine: slot-service: dynamic-slot: true slot-num: 16 slot-allocate-strategy: RANDOM # RANDOM / SLOT_RATIO / SYSTEM_LOAD ``` ## 9.
Monitoring and Metrics ### 9.1 Key Metrics **Cluster-Level**: - Worker count and liveness (registered vs active) - Slot inventory and utilization (assigned vs unassigned) **Per-Worker**: - CPU/memory utilization (if reported) - Slots assigned/unassigned **Per-Job**: - Slots requested/allocated - Resource wait time (if available) ### 9.2 Observability **Resource Dashboard Example**: ``` Cluster Resources: Workers: 10 (all healthy) Total Slots: 20 Available Slots: 8 Utilization: 60% Top Resource Consumers: job-123: 6 slots (mysql-cdc → elasticsearch) job-456: 4 slots (kafka → jdbc) job-789: 2 slots (file → s3) Worker Distribution: worker-1: 2/2 slots (100%) worker-2: 1/2 slots (50%) worker-3: 2/2 slots (100%) ... ``` ## 10. Best Practices ### 10.1 Slot Sizing Slot sizing (slots per worker, heap per slot, etc.) depends on workload characteristics and deployment constraints. Avoid treating formulas in architecture docs as mandatory defaults. ### 10.2 Strategy Selection **Use RandomStrategy when**: - Homogeneous cluster (all workers identical) - Simple deployments - Fast allocation more important than perfect balance **Use SlotRatioStrategy when**: - Need good load balancing - Mixed job sizes - Moderate cluster size (< 100 workers) **Use SystemLoadStrategy when**: - Heterogeneous cluster - Workers have varying CPU/memory - Optimizing resource utilization is critical ### 10.3 Tag Usage **Data Locality**: ```hocon env { # Match worker attributes, e.g., zone=us-west-1a tag_filter = { zone = "us-west-1a" } } ``` **Resource Isolation**: ```hocon env { job.name = "critical-job" tag_filter = { priority = "high" } } ``` ## 11. Related Resources - [Engine Architecture](engine-architecture.md) - [DAG Execution](dag-execution.md) - [Architecture Overview](../overview.md) ## 12. 
References ### Key Source Files - [ResourceManager.java](../../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/resourcemanager/ResourceManager.java) - [AbstractResourceManager.java](../../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/resourcemanager/AbstractResourceManager.java) - [SlotProfile.java](../../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/resourcemanager/resource/SlotProfile.java) - [WorkerProfile.java](../../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/resourcemanager/worker/WorkerProfile.java) ### Further Reading - [Google Borg](https://research.google/pubs/pub43438/) - Large-scale cluster management - [Apache YARN](https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html) - Resource management in Hadoop - [Kubernetes](https://kubernetes.io/docs/concepts/scheduling-eviction/kube-scheduler/) - Container orchestration and scheduling ================================================ FILE: docs/en/architecture/fault-tolerance/checkpoint-mechanism.md ================================================ --- sidebar_position: 1 title: Checkpoint Mechanism --- # Checkpoint Mechanism ## 1. Overview ### 1.1 Problem Background Distributed data processing systems face critical challenges for fault tolerance: - **State Loss**: How to preserve processing state across failures? - **Exactly-Once**: How to ensure each record is processed exactly once? - **Distributed Consistency**: How to create consistent snapshots across distributed tasks? - **Performance**: How to checkpoint without blocking data processing? - **Recovery**: How to efficiently restore state after failures? ### 1.2 Design Goals SeaTunnel's checkpoint mechanism aims to: 1. **Guarantee Exactly-Once Semantics**: Consistent state snapshots + two-phase commit 2.
**Minimize Overhead**: Asynchronous checkpoint, no data processing blocking 3. **Fast Recovery**: Restore from latest checkpoint in seconds 4. **Distributed Coordination**: Coordinate checkpoints across hundreds of tasks 5. **Pluggable Storage**: Support multiple storage backends (HDFS, S3, Local, OSS) ### 1.3 Theoretical Foundation SeaTunnel's checkpoint is based on the **Chandy-Lamport distributed snapshot algorithm**: **Key Idea**: Insert special markers (barriers) into data streams. When a task receives barrier: 1. Snapshot its local state 2. Forward barrier downstream 3. Continue processing Result: Globally consistent snapshot without pausing entire system. **Reference**: ["Distributed Snapshots: Determining Global States of Distributed Systems"](https://lamport.azurewebsites.net/pubs/chandy.pdf) (Chandy & Lamport, 1985) ## 2. Architecture Design ### 2.1 Checkpoint Architecture ``` ┌─────────────────────────────────────────────────────────────────┐ │ JobMaster (per job) │ │ │ │ ┌───────────────────────────────────────────────────────┐ │ │ │ CheckpointCoordinator (per pipeline) │ │ │ │ │ │ │ │ • Trigger checkpoint (periodic/manual) │ │ │ │ • Generate checkpoint ID │ │ │ │ • Track pending checkpoints │ │ │ │ • Collect task acknowledgements │ │ │ │ • Persist completed checkpoints │ │ │ │ • Cleanup old checkpoints │ │ │ └───────────────────────────────────────────────────────┘ │ │ │ │ │ │ (Trigger Barrier) │ │ ▼ │ └─────────────────────────────────────────────────────────────────┘ │ │ (CheckpointBarrier) ▼ ┌─────────────────────────────────────────────────────────────────┐ │ Worker Nodes │ │ │ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ │ SourceTask 1 │ │ SourceTask 2 │ │ SourceTask N │ │ │ │ │ │ │ │ │ │ │ │ 1. Receive │ │ 1. Receive │ │ 1. Receive │ │ │ │ Barrier │ │ Barrier │ │ Barrier │ │ │ │ 2. Snapshot │ │ 2. Snapshot │ │ 2. Snapshot │ │ │ │ State │ │ State │ │ State │ │ │ │ 3. ACK │ │ 3. ACK │ │ 3. 
ACK │ │ │ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ │ │ │ │ │ │ │ (Barrier Propagation) │ │ │ ▼ ▼ ▼ │ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ │ Transform 1 │ │ Transform 2 │ │ Transform N │ │ │ │ │ │ │ │ │ │ │ │ 1. Receive │ │ 1. Receive │ │ 1. Receive │ │ │ │ Barrier │ │ Barrier │ │ Barrier │ │ │ │ 2. Snapshot │ │ 2. Snapshot │ │ 2. Snapshot │ │ │ │ State │ │ State │ │ State │ │ │ │ 3. ACK │ │ 3. ACK │ │ 3. ACK │ │ │ │ 4. Forward │ │ 4. Forward │ │ 4. Forward │ │ │ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ │ │ │ │ │ │ ▼ ▼ ▼ │ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ │ SinkTask 1 │ │ SinkTask 2 │ │ SinkTask N │ │ │ │ │ │ │ │ │ │ │ │ 1. Receive │ │ 1. Receive │ │ 1. Receive │ │ │ │ Barrier │ │ Barrier │ │ Barrier │ │ │ │ 2. Prepare │ │ 2. Prepare │ │ 2. Prepare │ │ │ │ Commit │ │ Commit │ │ Commit │ │ │ │ 3. Snapshot │ │ 3. Snapshot │ │ 3. Snapshot │ │ │ │ State │ │ State │ │ State │ │ │ │ 4. ACK │ │ 4. ACK │ │ 4. ACK │ │ │ └──────────────┘ └──────────────┘ └──────────────┘ │ └─────────────────────────────────────────────────────────────────┘ │ │ (All ACKs received) ▼ ┌─────────────────────────────────────────────────────────────────┐ │ CheckpointStorage │ │ (HDFS / S3 / Local / OSS) │ │ │ │ CompletedCheckpoint { │ │ checkpointId: 123 │ │ taskStates: { │ │ SourceTask-1: { splits: [...], offsets: [...] } │ │ SinkTask-1: { commitInfo: XidInfo(...) } │ │ ... 
│ │ } │ │ } │ └─────────────────────────────────────────────────────────────────┘ ``` ### 2.2 Key Data Structures #### CheckpointCoordinator ```java public class CheckpointCoordinator { // Checkpoint ID generator private final CheckpointIDCounter checkpointIdCounter; // Checkpoint execution plan private final CheckpointPlan checkpointPlan; // Pending checkpoints (in progress) private final Map pendingCheckpoints; // Completed checkpoints (success) private final ArrayDeque completedCheckpointIds; // Latest completed checkpoint private CompletedCheckpoint latestCompletedCheckpoint; // Checkpoint storage private final CheckpointStorage checkpointStorage; // Configuration private final long checkpointInterval; // Trigger interval (ms) private final long checkpointTimeout; // Timeout (ms) private final int minPauseBetweenCheckpoints; // Min pause (ms) } ``` #### PendingCheckpoint Represents in-progress checkpoint. ```java public class PendingCheckpoint { private final long checkpointId; private final CheckpointType checkpointType; // CHECKPOINT or SAVEPOINT private final long triggerTimestamp; // Tasks that haven't acknowledged yet private final Set notYetAcknowledgedTasks; // Collected action states (from task ACKs) private final Map actionStates; // Task statistics (records processed, bytes, etc.) 
private final Map taskStatistics; // Future completed when all tasks ACK private final CompletableFuture completableFuture; /** * Called when task acknowledges checkpoint */ public void acknowledgeTask(long taskId, List states, TaskStatistics statistics) { notYetAcknowledgedTasks.remove(taskId); // Collect states for (ActionSubtaskState state : states) { actionStates.computeIfAbsent(state.getKey(), k -> new ActionState()) .putSubtaskState(state); } // Collect statistics taskStatistics.put(taskId, statistics); // Check if all tasks acknowledged if (notYetAcknowledgedTasks.isEmpty()) { completeCheckpoint(); } } private void completeCheckpoint() { CompletedCheckpoint completed = new CompletedCheckpoint( checkpointId, actionStates, taskStatistics, System.currentTimeMillis() ); completableFuture.complete(completed); } } ``` #### CompletedCheckpoint Persisted checkpoint data. ```java public class CompletedCheckpoint implements Serializable { private final long checkpointId; private final Map taskStates; private final Map taskStatistics; private final long completedTimestamp; } public class ActionState implements Serializable { private final ActionStateKey key; // (pipelineId, actionId) private final Map subtaskStates; } public class ActionSubtaskState implements Serializable { private final int subtaskIndex; private final byte[] state; // Serialized state } ``` ### 2.3 CheckpointStorage Abstraction for checkpoint persistence. 
```java public interface CheckpointStorage { /** * Store completed checkpoint */ void storeCheckpoint(CompletedCheckpoint checkpoint) throws IOException; /** * Get latest checkpoint */ Optional getLatestCheckpoint() throws IOException; /** * Get specific checkpoint by ID */ Optional getCheckpoint(long checkpointId) throws IOException; /** * Delete old checkpoint */ void deleteCheckpoint(long checkpointId) throws IOException; } ``` **Implementations**: - `LocalFileStorage`: Local file system (testing) - `HdfsStorage`: Hadoop FileSystem-based backend; can work with HDFS/S3A/etc depending on Hadoop configuration Note: S3 and OSS support are provided through Hadoop FileSystem configuration (e.g., `fs.s3a.impl`) rather than separate CheckpointStorage implementations. ## 3. Checkpoint Flow ### 3.1 Trigger Checkpoint ```mermaid sequenceDiagram participant Timer as Periodic Timer participant Coord as CheckpointCoordinator participant Plan as CheckpointPlan Timer->>Coord: Trigger (every 60s) Coord->>Coord: Generate checkpointId (123) Coord->>Coord: Check conditions Note over Coord: • Min pause elapsed?
• Max concurrent not exceeded?
• Previous checkpoint complete? Coord->>Coord: Create PendingCheckpoint(123) Coord->>Plan: Get starting tasks loop For each starting task Coord->>Task: Send CheckpointBarrierTriggerOperation(123) end Coord->>Coord: Start timeout timer (10 minutes) ``` **Trigger Conditions**: 1. Checkpoint interval elapsed (e.g., 60 seconds) 2. Minimum pause between checkpoints elapsed (e.g., 10 seconds) 3. Number of concurrent checkpoints < max (e.g., 1) 4. No checkpoint in progress (for single concurrent) ### 3.2 Barrier Propagation ```mermaid sequenceDiagram participant Coord as Coordinator participant Source as SourceTask participant Transform as TransformTask participant Sink as SinkTask Coord->>Source: Trigger barrier(123) Source->>Source: Receive barrier Source->>Source: snapshotState() → splits, offsets Source->>Coord: ACK(state) Source->>Transform: Forward barrier(123) Transform->>Transform: Receive barrier Transform->>Transform: snapshotState() → transform state Transform->>Coord: ACK(state) Transform->>Sink: Forward barrier(123) Sink->>Sink: Receive barrier Sink->>Sink: prepareCommit(checkpointId) → commitInfo Sink->>Sink: snapshotState() → writer state Sink->>Coord: ACK(commitInfo + state) Coord->>Coord: All ACKs received Coord->>Coord: Create CompletedCheckpoint ``` **Barrier Flow Rules**: 1. **Source Tasks**: Start of pipeline, receive barrier from coordinator 2. **Transform Tasks**: Receive from upstream, snapshot, forward downstream 3. **Sink Tasks**: End of pipeline, receive from upstream, snapshot, no forward **Barrier Alignment** (for tasks with multiple inputs): ```java // Task with 2 inputs Input 1: ──data──data──[barrier-123]──data──data── │ Wait! Input 2: ──data──data──data──data──[barrier-123]── │ ▼ Both barriers received, snapshot state ``` ### 3.3 State Snapshot Each task type snapshots different state: **SourceTask**: ```java @Override public void triggerBarrier(long checkpointId) { // 1. 
Snapshot SourceReader state (splits + offsets) List states = sourceFlowLifeCycle.snapshotState(checkpointId); // 2. Create ActionSubtaskState ActionSubtaskState state = new ActionSubtaskState(subtaskIndex, states); // 3. Send ACK to coordinator sendAcknowledgement(checkpointId, Collections.singletonList(state)); // 4. Forward barrier downstream forwardBarrierToDownstream(checkpointId); } ``` **TransformTask**: ```java @Override public void triggerBarrier(long checkpointId) { // 1. Snapshot Transform state (usually stateless, empty state) List states = transformFlowLifeCycle.snapshotState(checkpointId); // 2. Create ActionSubtaskState ActionSubtaskState state = new ActionSubtaskState(subtaskIndex, states); // 3. Send ACK sendAcknowledgement(checkpointId, Collections.singletonList(state)); // 4. Forward barrier forwardBarrierToDownstream(checkpointId); } ``` **SinkTask**: ```java @Override public void triggerBarrier(long checkpointId) { // 1. Prepare commit (TWO-PHASE COMMIT) Optional commitInfo = sinkWriter.prepareCommit(checkpointId); // 2. Snapshot writer state List writerStates = sinkWriter.snapshotState(checkpointId); // 3. Create ActionSubtaskState (includes both commit info and state) ActionSubtaskState state = new ActionSubtaskState( subtaskIndex, serialize(writerStates), commitInfo.orElse(null) ); // 4. 
Send ACK (NO forwarding - end of pipeline) sendAcknowledgement(checkpointId, Collections.singletonList(state)); } ``` ### 3.4 Checkpoint Completion ```mermaid sequenceDiagram participant Coord as CheckpointCoordinator participant Pending as PendingCheckpoint participant Storage as CheckpointStorage participant Committer as SinkCommitter participant Tasks as All Tasks Pending->>Pending: All tasks ACKed Pending->>Coord: notifyCheckpointComplete() Coord->>Coord: Create CompletedCheckpoint Coord->>Storage: Persist checkpoint Storage-->>Coord: Success Coord->>Committer: commit(commitInfos) Committer-->>Coord: Success Coord->>Tasks: notifyCheckpointComplete(123) Tasks->>Tasks: Cleanup resources Coord->>Storage: Delete old checkpoints ``` **Completion Steps**: 1. All tasks acknowledged 2. Create `CompletedCheckpoint` from `PendingCheckpoint` 3. Persist checkpoint to storage 4. Trigger sink commit (two-phase commit) 5. Notify all tasks of completion 6. Cleanup old checkpoints (retain last N) ### 3.5 Checkpoint Timeout ```java // CheckpointCoordinator private void startCheckpointTimeout(long checkpointId, long timeoutMs) { scheduledExecutor.schedule(() -> { PendingCheckpoint pending = pendingCheckpoints.get(checkpointId); if (pending != null && !pending.isCompleted()) { LOG.warn("Checkpoint {} timeout after {}ms, {} tasks not yet acknowledged", checkpointId, timeoutMs, pending.getNotYetAcknowledgedTasks()); // Fail checkpoint pending.abort(); pendingCheckpoints.remove(checkpointId); // Trigger job failover if needed handleCheckpointFailure(checkpointId); } }, timeoutMs, TimeUnit.MILLISECONDS); } ``` **Timeout Handling**: - Default timeout: 10 minutes - If timeout, checkpoint fails - Job continues with previous checkpoint - Next checkpoint will be triggered per schedule ## 4. 
Recovery Process ### 4.1 Restore from Checkpoint ```mermaid sequenceDiagram participant JM as JobMaster participant Storage as CheckpointStorage participant Source as SourceTask participant Sink as SinkTask JM->>Storage: getLatestCheckpoint() Storage-->>JM: CompletedCheckpoint(123) JM->>JM: Extract states per task JM->>Source: Deploy with NotifyTaskRestoreOperation activate Source Source->>Source: restoreState(splits, offsets) Source->>Source: Seek to checkpointed offset Source-->>JM: Ready deactivate Source JM->>Sink: Deploy with NotifyTaskRestoreOperation activate Sink Sink->>Sink: restoreWriter(writerState) Sink->>Sink: Restore uncommitted transactions Sink-->>JM: Ready deactivate Sink JM->>Source: Start execution JM->>Sink: Start execution ``` **Restore Steps**: 1. JobMaster retrieves latest `CompletedCheckpoint` from storage 2. Extract state for each task (by ActionStateKey and subtaskIndex) 3. Deploy tasks with `NotifyTaskRestoreOperation` containing state 4. Tasks restore state: - **SourceReader**: Restore splits and offsets, seek to position - **Transform**: Restore transform state (usually none) - **SinkWriter**: Restore writer state, may have uncommitted transactions 5. Tasks transition to READY_START state 6. Job resumes execution **Example: JDBC Source Recovery**: ```java public class JdbcSourceReader { @Override public void restoreState(List states) { for (JdbcSourceState state : states) { JdbcSourceSplit split = state.getSplit(); long offset = state.getCurrentOffset(); // Restore split with offset pendingSplits.add(split); // When processing split, start from offset String query = split.getQuery() + " OFFSET " + offset; } } } ``` ### 4.2 Exactly-Once Recovery Combination of checkpoint restore + sink two-phase commit ensures exactly-once: ``` Checkpoint N (completed): Source offsets: [100, 200, 300] Sink prepared commits: [XID-1, XID-2, XID-3] Sink committer commits XID-1, XID-2, XID-3 ↓ [Failure] Recovery from Checkpoint N: 1. 
Restore source offsets: [100, 200, 300] 2. Sources start reading from offset 100, 200, 300 3. Sink writers restore state (may have uncommitted XIDs) 4. Sink committer retries committing XIDs (idempotent) Result: Records 0-99, 100-199, 200-299 committed exactly once Records from 100+ reprocessed but not duplicated (idempotent commit) ``` ## 5. Configuration and Tuning ### 5.1 Checkpoint Configuration ```hocon env { # Enable checkpoint checkpoint.interval = 60000 # Trigger every 60 seconds # Checkpoint timeout checkpoint.timeout = 600000 # 10 minutes # Min pause between checkpoints min-pause = 10000 # 10 seconds } ``` Checkpoint storage is configured on the engine side (e.g., `config/seatunnel.yaml` under `seatunnel.engine.checkpoint.storage`), rather than as job-level `env` options. ### 5.2 Tuning Guidelines **Checkpoint Interval**: - **Shorter interval**: Faster recovery, higher overhead - **Longer interval**: Lower overhead, slower recovery **Trade-offs**: - Shorter interval → More frequent I/O → Higher storage cost - Longer interval → Less overhead → Longer recovery time **Rule of Thumb**: Set interval to tolerable recovery time (data loss window). **Checkpoint Timeout**: - Should be >> checkpoint interval - Depends on state size and storage speed - Choose based on end-to-end latency, state size, and checkpoint storage throughput **Storage Selection (SeaTunnel Engine)**: - `localfile` (LocalFileStorage): local filesystem, non-HA - `hdfs` (HdfsStorage): Hadoop FileSystem-based backend; can work with HDFS/S3A/etc depending on Hadoop configuration ## 6. Performance Optimization ### 6.1 Async Checkpoint State snapshot doesn't block data processing: ```java public class AsyncSnapshotSupport { @Override public void snapshotState(long checkpointId) { // 1. Create snapshot of current state (fast, in-memory copy) StateSnapshot snapshot = createSnapshot(); // 2. Continue data processing (doesn't wait for serialization/upload) // ... // 3. 
Async serialize and upload CompletableFuture.runAsync(() -> { byte[] serialized = serialize(snapshot); checkpointStorage.upload(checkpointId, serialized); }, executorService); } } ``` ### 6.2 Incremental Checkpoint (Future) Only checkpoint changed state: ```java // Full checkpoint (first) Checkpoint 1: State = 1GB → Upload 1GB // Incremental checkpoints (subsequent) Checkpoint 2: State = 1.1GB → Upload 100MB (delta) Checkpoint 3: State = 1.05GB → Upload 0MB (deletion doesn't upload) ``` **Benefits**: - Reduce checkpoint time - Lower storage I/O - Faster checkpoint completion **Challenges**: - More complex state management - Need to track state changes - Restore requires chain of deltas ### 6.3 Local State Backend (Future) Store hot state locally, checkpoint only summary: ```java // RocksDB local state backend class RocksDBStateBackend { private final RocksDB rocksDB; // Fast local SSD @Override public void put(String key, byte[] value) { rocksDB.put(key.getBytes(), value); // Local write (fast) } @Override public byte[] snapshotState() { // Only checkpoint RocksDB snapshot reference return rocksDB.createCheckpoint().getBytes(); } } ``` ## 7. Best Practices ### 7.1 State Size Optimization **1. Keep State Small**: ```java // ❌ BAD: Buffer entire dataset class BadSourceReader { private List bufferedRows = new ArrayList<>(); // May be huge! List snapshotState() { return serialize(bufferedRows); // Huge state } } // ✅ GOOD: Track offset only class GoodSourceReader { private long currentOffset = 0; List snapshotState() { return serialize(currentOffset); // Small state } } ``` **2. 
Use Efficient Serialization**: - Prefer Protobuf or Kryo over Java serialization - Compress large state (e.g., gzip or snappy) ### 7.2 Monitoring **Key Metrics**: - `checkpoint_duration`: Time from trigger to completion - `checkpoint_size`: Size of persisted checkpoint - `checkpoint_failure_rate`: Percentage of failed checkpoints - `checkpoint_alignment_duration`: Time spent aligning barriers **Alerting**: - Alert if `checkpoint_duration` > threshold (e.g., 5 minutes) - Alert if `checkpoint_failure_rate` > 10% - Alert if no checkpoint has completed within 2x the checkpoint interval ### 7.3 Troubleshooting **Problem**: Checkpoint timeout **Possible Causes**: 1. Task stuck (slow data processing) 2. Large state (slow serialization/upload) 3. Slow storage (network/disk I/O) 4. Slow barrier alignment (skewed data) **Solutions**: - Increase checkpoint timeout - Optimize state size - Use faster storage - Tune parallelism **Problem**: High checkpoint overhead **Possible Causes**: 1. Checkpoint interval too short 2. Large state size 3. Slow storage **Solutions**: - Increase checkpoint interval - Optimize state size - Enable incremental checkpoint (when available) ## 8. Related Resources - [Architecture Overview](../overview.md) - [Design Philosophy](../design-philosophy.md) - [Engine Architecture](../engine/engine-architecture.md) - [Sink Architecture](../api-design/sink-architecture.md) - [Exactly-Once Semantics](exactly-once.md) ## 9.
References ### Key Source Files - [CheckpointCoordinator.java](../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/CheckpointCoordinator.java) - [PendingCheckpoint.java](../../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/PendingCheckpoint.java) - [CheckpointStorage.java](../../../seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-api/src/main/java/org/apache/seatunnel/engine/checkpoint/storage/api/CheckpointStorage.java) ### Academic Papers - Chandy, K. M., & Lamport, L. (1985). ["Distributed Snapshots: Determining Global States of Distributed Systems"](https://lamport.azurewebsites.net/pubs/chandy.pdf) - Carbone, P., et al. (2017). ["State Management in Apache Flink"](http://www.vldb.org/pvldb/vol10/p1718-carbone.pdf) ### Further Reading - [Apache Flink Checkpointing](https://nightlies.apache.org/flink/flink-docs-stable/docs/dev/datastream/fault-tolerance/checkpointing/) - [Spark Structured Streaming Checkpointing](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovering-from-failures-with-checkpointing) ================================================ FILE: docs/en/architecture/fault-tolerance/exactly-once.md ================================================ --- sidebar_position: 2 title: Exactly-Once Semantics --- # Exactly-Once Semantics ## 1. 
Overview ### 1.1 Problem Background Distributed data processing faces fundamental challenges around delivery guarantees: - **At-Most-Once**: Records may be lost (unacceptable for critical data) - **At-Least-Once**: Records may be duplicated (causes counting errors, double charges) - **Exactly-Once**: Each record is processed exactly once (ideal but complex) **Real-World Impact**: ``` Scenario: Financial transaction processing At-Least-Once: Transaction $100 processed twice → User charged $200 ❌ Exactly-Once: Transaction $100 processed once → User charged $100 ✅ ``` ### 1.2 Design Goals SeaTunnel's exactly-once semantics aim to: 1. **Verifiable End-to-End Consistency**: With checkpoint boundaries + sink transactional/idempotent commits, avoid data loss/duplication under the documented failure model 2. **Transparent Implementation**: Framework handles complexity, users configure minimally 3. **Performance Efficiency**: Minimize overhead while maintaining the guarantee 4. **Failure Resilience**: Maintain the guarantee across task/worker/master failures 5. **Broad Applicability**: Support transactional sinks and also provide practical semantics for non-transactional sinks (e.g., idempotent writes / at-least-once) ### 1.3 Consistency Levels | Level | Guarantee | Use Cases | Implementation | |-------|-----------|-----------|----------------| | **At-Most-Once** | No duplicates, may lose records | Non-critical logs | No retry | | **At-Least-Once** | No loss, may duplicate records | Idempotent processing | Retry without transaction | | **Exactly-Once** | No loss, no duplicates | Financial, billing, audit | Checkpoint + 2PC | ## 2. Theoretical Foundation ### 2.1 Chandy-Lamport Algorithm **Concept**: Distributed snapshot without stopping the entire system. **Mechanism**: 1. Coordinator injects **barriers** (markers) into data streams 2. Upon receiving a barrier, each operator: - Snapshots its local state - Forwards the barrier downstream 3.
When all operators snapshot, we have a **consistent global snapshot** **Key Property**: Snapshot represents a consistent cut across distributed system state. ### 2.2 Two-Phase Commit Protocol **Concept**: Atomic commitment across distributed participants. **Phases**: 1. **Prepare Phase**: All participants prepare (avoid making changes externally visible) 2. **Commit Phase**: Coordinator decides commit/abort, all participants execute **In SeaTunnel**: - **Prepare**: `SinkWriter.prepareCommit(checkpointId)` during checkpoint - **Commit**: `SinkCommitter.commit()` after checkpoint completes ## 3. Architecture for Exactly-Once ### 3.1 End-to-End Pipeline ``` ┌──────────────────────────────────────────────────────────────┐ │ Source │ │ • Read from external system │ │ • Track offsets/positions │ │ • Snapshot offsets in checkpoint │ └──────────────────────────┬───────────────────────────────────┘ │ ▼ Checkpoint Barrier ┌──────────────────────────────────────────────────────────────┐ │ Transform │ │ • Process records │ │ • Snapshot transform state (if any) │ └──────────────────────────┬───────────────────────────────────┘ │ ▼ Checkpoint Barrier ┌──────────────────────────────────────────────────────────────┐ │ Sink Writer │ │ • Buffer writes │ │ • prepareCommit(checkpointId) → Generate CommitInfo (PHASE 1)│ │ • Snapshot writer state │ └──────────────────────────┬───────────────────────────────────┘ │ │ CommitInfo ▼ ┌──────────────────────────────────────────────────────────────┐ │ CheckpointCoordinator │ │ • Collect all CommitInfos │ │ • Persist CompletedCheckpoint │ │ • Trigger commit phase │ └──────────────────────────┬───────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────┐ │ Sink Committer │ │ • commit(CommitInfos) → Apply changes (PHASE 2) │ │ • Must be idempotent │ └──────────────────────────┬───────────────────────────────────┘ │ ▼ External Sink (Changes visible) ``` ### 3.2 Key Components **Source Offset 
Management**: ```java public class KafkaSourceReader { private Map currentOffsets; @Override public void pollNext(Collector output) { ConsumerRecords records = consumer.poll(timeout); for (ConsumerRecord record : records) { // Process record output.collect(convert(record)); // Track offset currentOffsets.put( new TopicPartition(record.topic(), record.partition()), record.offset() ); } } @Override public List snapshotState(long checkpointId) { // Snapshot offsets (will be committed after checkpoint completes) return Collections.singletonList(new KafkaSourceState(currentOffsets)); } @Override public void notifyCheckpointComplete(long checkpointId) { // Commit offsets to Kafka (idempotent) consumer.commitSync(currentOffsets); } } ``` **Sink Two-Phase Commit**: ```java public class JdbcExactlyOnceSinkWriter { private XAConnection xaConnection; private Xid currentXid; @Override public void write(SeaTunnelRow element) { if (currentXid == null) { // Start XA transaction currentXid = generateXid(); xaConnection.getXAResource().start(currentXid, XAResource.TMNOFLAGS); } // Execute INSERT (buffered in XA transaction) statement.executeUpdate(toSQL(element)); } @Override public Optional prepareCommit(long checkpointId) { if (currentXid == null) { return Optional.empty(); } // PHASE 1: Prepare (no side effects) xaConnection.getXAResource().end(currentXid, XAResource.TMSUCCESS); xaConnection.getXAResource().prepare(currentXid); // Return XID for committer XidInfo xidInfo = new XidInfo(currentXid); currentXid = null; return Optional.of(xidInfo); } } public class JdbcSinkCommitter { @Override public List commit(List commitInfos) { List failed = new ArrayList<>(); for (XidInfo xidInfo : commitInfos) { try { // PHASE 2: Commit (side effects now visible) xaConnection.getXAResource().commit(xidInfo.getXid(), false); } catch (XAException e) { if (e.errorCode == XAException.XAER_NOTA) { // Already committed (idempotent) LOG.info("XID already committed: {}", xidInfo); } else { 
failed.add(xidInfo); } } } return failed; } } ``` ## 4. Implementation Patterns ### 4.1 Transactional Sinks (XA) **Supported Systems**: MySQL, PostgreSQL, Oracle, SQL Server **Implementation**: ```java public class JdbcExactlyOnceSink implements SeaTunnelSink<...> { @Override public SinkWriter<...> createWriter(Context context) { // Enable XA transactions XADataSource xaDataSource = createXADataSource(); return new JdbcExactlyOnceSinkWriter(xaDataSource); } @Override public Optional> createCommitter() { return Optional.of(new JdbcSinkCommitter(xaDataSource)); } } ``` **Pros**: - Strong consistency guarantee - Automatic rollback on failure **Cons**: - Requires database XA support - Higher latency (2PC overhead) - Lock contention during prepare phase ### 4.2 Idempotent Sinks (Upsert) **Supported Systems**: Key-value stores, Elasticsearch (with doc ID) **Implementation**: ```java public class ElasticsearchSinkWriter { @Override public void write(SeaTunnelRow element) { // Use deterministic document ID String docId = extractPrimaryKey(element); IndexRequest request = new IndexRequest("my_index") .id(docId) // Idempotent key .source(toJson(element)); bulkProcessor.add(request); } @Override public Optional prepareCommit(long checkpointId) { // Flush bulk processor bulkProcessor.flush(); // No explicit commit needed (operations are idempotent) return Optional.empty(); } } ``` **Key**: Same primary key → same document → idempotent updates **Pros**: - No transaction overhead - Lower latency **Cons**: - Requires unique key - Cannot handle complex transactions ### 4.3 Log-Based Sinks (Kafka) **Implementation**: ```java public class KafkaSinkWriter { private KafkaProducer producer; private String transactionId; public KafkaSinkWriter() { // Enable Kafka transactions Properties props = new Properties(); props.put("transactional.id", generateTransactionalId()); props.put("enable.idempotence", "true"); producer = new KafkaProducer<>(props); producer.initTransactions(); } 
@Override public void write(SeaTunnelRow element) { if (!transactionStarted) { producer.beginTransaction(); transactionStarted = true; } ProducerRecord record = convert(element); producer.send(record); } @Override public Optional prepareCommit(long checkpointId) { // PHASE 1: Prepare (flush, but don't commit) producer.flush(); // Return transaction info return Optional.of(new KafkaCommitInfo(transactionId)); } } public class KafkaSinkCommitter { @Override public List commit(List commitInfos) { for (KafkaCommitInfo info : commitInfos) { // PHASE 2: Commit transaction producer.commitTransaction(); // Start new transaction for next checkpoint producer.beginTransaction(); } return Collections.emptyList(); } } ``` ### 4.4 File Sinks (Atomic Rename) **Implementation**: ```java public class FileSinkWriter { private String tempFilePath; private String finalFilePath; private OutputStream outputStream; @Override public void write(SeaTunnelRow element) { // Write to temporary file byte[] bytes = serialize(element); outputStream.write(bytes); } @Override public Optional prepareCommit(long checkpointId) { // PHASE 1: Close temp file (no rename yet) outputStream.close(); return Optional.of(new FileCommitInfo(tempFilePath, finalFilePath)); } } public class FileSinkCommitter { @Override public List commit(List commitInfos) { List failed = new ArrayList<>(); for (FileCommitInfo info : commitInfos) { // PHASE 2: Atomic rename (file becomes visible) boolean success = fileSystem.rename( new Path(info.getTempFilePath()), new Path(info.getFinalFilePath()) ); if (!success) { failed.add(info); } } return failed; } } ``` **Key**: Atomic rename ensures file is either fully visible or not visible. ## 5. 
Failure Scenarios and Recovery ### 5.1 Task Failure Before Checkpoint ``` Timeline: t0: Checkpoint N completed t1: Process records [1000-2000] t2: Task fails ❌ t3: Restore from Checkpoint N t4: Reprocess records [1000-2000] Result: ✅ No data loss (records reprocessed) ✅ No duplication (nothing committed before failure) ``` ### 5.2 Task Failure After prepareCommit ``` Timeline: t0: Checkpoint N in progress t1: SinkWriter.prepareCommit(checkpointId) → XID-123 prepared t2: Task fails ❌ (before commit) t3: Restore from Checkpoint N-1 t4: Reprocess records t5: New prepareCommit(checkpointId) → XID-124 prepared t6: Committer commits XID-124 Result: ✅ XID-123 never committed (automatically rolled back after timeout) ✅ XID-124 committed (correct data) ``` ### 5.3 Committer Failure During Commit ``` Timeline: t0: Checkpoint N completed t1: Committer starts committing [XID-100, XID-101, XID-102] t2: Commits XID-100 ✅ t3: Committer fails ❌ (XID-101, XID-102 not committed) t4: New committer retries [XID-100, XID-101, XID-102] t5: Commits XID-100 (already committed, idempotent) ✅ t6: Commits XID-101 ✅ t7: Commits XID-102 ✅ Result: ✅ All XIDs eventually committed ✅ No duplication (idempotent commit) ``` ### 5.4 Network Partition ``` Timeline: t0: SinkWriter prepares XID-200 t1: Checkpoint completes t2: Committer sends commit(XID-200) t3: Network partition ⚠️ (commit success, but ACK lost) t4: Committer retries commit(XID-200) t5: XID-200 already committed (idempotent) Result: ✅ Data committed exactly once ✅ Idempotency prevents duplication ``` ## 6. Idempotency Requirements ### 6.1 Why Idempotency Matters **Problem**: Network failures, retries, and failover can cause duplicate commit attempts. **Solution**: Committer operations must be idempotent. 
```java // ❌ BAD: Non-idempotent (calling twice inserts twice) void commit(CommitInfo info) { statement.execute("INSERT INTO table VALUES (1, 'data')"); } // ✅ GOOD: Idempotent (calling twice has same effect as once) void commit(CommitInfo info) { statement.execute( "INSERT INTO table VALUES (1, 'data') " + "ON DUPLICATE KEY UPDATE data = VALUES(data)" ); } ``` ### 6.2 Implementing Idempotency **Strategy 1: Check-then-Execute** ```java public List commit(List commitInfos) { for (XidInfo xid : commitInfos) { // Check if already committed if (isCommitted(xid)) { LOG.info("XID already committed: {}", xid); continue; // Idempotent } // Commit and record xaResource.commit(xid, false); recordCommit(xid); } } ``` **Strategy 2: Database-Level Idempotency** ```sql -- Unique constraint ensures idempotency CREATE TABLE commits ( xid VARCHAR(255) PRIMARY KEY, committed_at TIMESTAMP ); -- Idempotent insert INSERT IGNORE INTO commits (xid, committed_at) VALUES ('XID-123', NOW()); ``` **Strategy 3: Natural Idempotency (XA)** ```java try { xaResource.commit(xid, false); } catch (XAException e) { if (e.errorCode == XAException.XAER_NOTA) { // Transaction not found = already committed return; // Idempotent } throw e; } ``` ## 7. 
Performance Considerations ### 7.1 Checkpoint Interval Trade-offs ``` Short Interval (10-30s): ✅ Fast recovery (less reprocessing) ❌ Higher overhead (frequent snapshots) ❌ More commit operations Long Interval (5-10min): ✅ Lower overhead (less frequent snapshots) ❌ Slower recovery (more reprocessing) ✅ Fewer commit operations ``` **Recommendation**: 60-120 seconds for most workloads ### 7.2 Batch Size Optimization ```java public class OptimizedSinkWriter { private static final int BATCH_SIZE = 1000; private List buffer = new ArrayList<>(); @Override public void write(SeaTunnelRow element) { buffer.add(element); if (buffer.size() >= BATCH_SIZE) { // Batch insert (amortize overhead) statement.executeBatch(); buffer.clear(); } } } ``` **Impact**: Batches of 1000 records → ~10x throughput improvement ### 7.3 Async Checkpoint ```java public List snapshotState(long checkpointId) { // Quick: Copy state snapshot (in-memory) StateSnapshot snapshot = state.copy(); // Async: Serialize and upload CompletableFuture.runAsync(() -> { byte[] serialized = serialize(snapshot); checkpointStorage.upload(checkpointId, serialized); }); return snapshot; } ``` **Impact**: Data processing continues while the snapshot is uploaded ## 8. Configuration ### 8.1 Enable Exactly-Once ```hocon env { # Checkpoint configuration checkpoint.interval = 60000 # 60 seconds checkpoint.timeout = 600000 # 10 minutes # Exactly-once mode (vs at-least-once) # This is implicit when using transactional sinks } ``` ### 8.2 Source Configuration **Kafka**: ```hocon source { Kafka { bootstrap.servers = "localhost:9092" topic = "my_topic" # Kafka consumer offset commit commit_on_checkpoint = true # Commit offsets after checkpoint } } ``` **JDBC**: ```hocon source { JDBC { url = "jdbc:mysql://..." # Query-based source (idempotent reprocessing) query = "SELECT * FROM table WHERE id >= ? AND id < ?" } } ``` ### 8.3 Sink Configuration **JDBC (XA)**: ```hocon sink { JDBC { url = "jdbc:mysql://..."
# Enable XA transactions xa_data_source_class_name = "com.mysql.cj.jdbc.MysqlXADataSource" is_exactly_once = true } } ``` **Kafka (Transactions)**: ```hocon sink { Kafka { bootstrap.servers = "localhost:9092" topic = "output_topic" # Kafka transactions transaction.id = "seatunnel-kafka-sink" enable.idempotence = true } } ``` ## 9. Testing Exactly-Once ### 9.1 Functional Test ```java @Test public void testExactlyOnce() { // 1. Insert 1000 records insertRecords(1000); // 2. Trigger checkpoint coordinator.triggerCheckpoint(); // 3. Simulate failure task.fail(); // 4. Restore and continue task.restore(checkpointId); insertRecords(1000); // Same records reprocessed // 5. Verify: Should have exactly 1000 records (no duplicates) assertEquals(1000, countRecordsInSink()); } ``` ### 9.2 Chaos Testing ```java @Test public void testExactlyOnceUnderChaos() { ChaosMonkey chaos = new ChaosMonkey() .killTaskRandomly(probability = 0.1) .injectNetworkDelay(maxDelayMs = 5000) .pauseCheckpointRandomly(probability = 0.05); // Run for 10 minutes with chaos runJobWithChaos(duration = 10 * 60 * 1000, chaos); // Verify: Input count == Output count assertEquals(countSource(), countSink()); } ``` ### 9.3 Monitoring Verification ``` Metrics to Track: source.records_read = 1,000,000 sink.records_written = 1,000,000 sink.records_committed = 1,000,000 ✅ All counts match → Exactly-once verified ``` ## 10. 
Best Practices ### 10.1 Choose Appropriate Sink **Use Transactional Sinks (XA) for**: - Financial transactions - Billing systems - Audit logs - Critical data **Use Idempotent Sinks for**: - High-throughput scenarios - Eventual consistency acceptable - No transaction support ### 10.2 Handle Poisoned Records ```java @Override public void write(SeaTunnelRow element) { try { statement.executeUpdate(toSQL(element)); } catch (SQLException e) { // Log poisoned record LOG.error("Failed to write record: {}", element, e); // Send to dead letter queue deadLetterQueue.send(element); // Don't fail entire checkpoint } } ``` ### 10.3 Monitor Checkpoint Health **Key Metrics**: - `checkpoint.duration`: Should be < 10% of interval - `checkpoint.failure_rate`: Should be < 1% - `checkpoint.size`: Monitor growth over time **Alerts**: ``` Alert if checkpoint.duration > 300s Alert if checkpoint.failure_rate > 5% Alert if no checkpoint in 2x interval ``` ## 11. Related Resources - [Checkpoint Mechanism](checkpoint-mechanism.md) - [Sink Architecture](../api-design/sink-architecture.md) - [Source Architecture](../api-design/source-architecture.md) - [Engine Architecture](../engine/engine-architecture.md) ## 12. References ### Academic Papers - Chandy & Lamport (1985): ["Distributed Snapshots"](https://lamport.azurewebsites.net/pubs/chandy.pdf) - Gray & Lamport (2006): ["Consensus on Transaction Commit"](https://lamport.azurewebsites.net/pubs/paxos-commit.pdf) - Carbone et al. 
(2017): ["State Management in Apache Flink"](http://www.vldb.org/pvldb/vol10/p1718-carbone.pdf) ### Further Reading - [Two-Phase Commit Protocol](https://en.wikipedia.org/wiki/Two-phase_commit_protocol) - [XA Transactions](https://pubs.opengroup.org/onlinepubs/009680699/toc.pdf) - [Kafka Exactly-Once](https://www.confluent.io/blog/exactly-once-semantics-are-possible-heres-how-apache-kafka-does-it/) ================================================ FILE: docs/en/architecture/features/multi-table.md ================================================ --- sidebar_position: 3 title: Multi-Table Synchronization --- # Multi-Table Synchronization Architecture ## 1. Overview ### 1.1 Problem Background Database migration and CDC scenarios often require synchronizing hundreds of tables: - **Resource Efficiency**: How to avoid creating one job per table? - **Consistent Snapshot**: How to ensure all tables start from the same point in time? - **Schema Routing**: How to route data to the correct target tables? - **Independent Schemas**: How to handle different schemas per table? - **Parallel Writing**: How to maximize throughput for multiple tables? ### 1.2 Design Goals SeaTunnel's multi-table synchronization aims to: 1. **Single Job, Multiple Tables**: Synchronize hundreds of tables in one job 2. **Resource Efficiency**: Share resources across tables 3. **Schema Independence**: Each table maintains its own schema 4. **Dynamic Routing**: Route records to the correct sink based on table identity 5. **Horizontal Scalability**: Support replica writers for high throughput ### 1.3 Use Cases **Database Migration**: ```hocon source { MySQL-CDC { # Capture all tables in database database-name = "my_db" table-name = ".*" # Regex: all tables } } sink { JDBC { # Write to PostgreSQL url = "jdbc:postgresql://..."
} } ``` **Multi-Table CDC**: ```hocon source { MySQL-CDC { table-name = "order_.*|user_.*|product_.*" # Multiple table patterns } } sink { Elasticsearch { # Different indices per table } } ``` ## 2. Core Abstractions ### 2.1 TablePath Unique identifier for routing records to tables. ```java public class TablePath implements Serializable { private final String databaseName; private final String schemaName; private final String tableName; // Unique string representation public String getFullName() { return String.join(".", databaseName, schemaName, tableName); } } ``` **Example**: ```java TablePath orderTable = TablePath.of("my_db", "public", "orders"); TablePath userTable = TablePath.of("my_db", "public", "users"); ``` ### 2.2 SeaTunnelRow with TableId Records carry table identity for routing. ```java public class SeaTunnelRow { private final String tableId; // TablePath serialized private final SeaTunnelRowKind rowKind; // INSERT, UPDATE, DELETE private final Object[] fields; public TablePath getTablePath() { return TablePath.deserialize(tableId); } } ``` ### 2.3 SinkIdentifier Unique identifier for sink writers (table + replica index). ```java public class SinkIdentifier implements Serializable { private final TableIdentifier tableIdentifier; private final int index; // Replica index // For multi-table: one identifier per table per replica // Example: (orders, 0), (orders, 1), (users, 0), (users, 1) } ``` ## 3. 
MultiTableSource Architecture ### 3.1 Structure ```java public class MultiTableSource implements SeaTunnelSource { // Underlying sources (one per table) private final Map> sources; // Produced catalog tables private final List catalogTables; } ``` ### 3.2 Creation ```java // From configuration MultiTableSource multiSource = MultiTableSource.builder() .addSource(orderTablePath, orderSource) .addSource(userTablePath, userSource) .addSource(productTablePath, productSource) .build(); ``` ### 3.3 Enumerator: Unified Split Assignment ```java public class MultiTableSourceSplitEnumerator { private final Map enumerators; @Override public void handleSplitRequest(int subtaskId) { // Round-robin across table enumerators for (Map.Entry entry : enumerators.entrySet()) { TablePath tablePath = entry.getKey(); SourceSplitEnumerator enumerator = entry.getValue(); // Request split from table enumerator enumerator.handleSplitRequest(subtaskId); } } @Override public void addReader(int subtaskId) { // Register reader with all table enumerators for (SourceSplitEnumerator enumerator : enumerators.values()) { enumerator.addReader(subtaskId); } } } ``` ### 3.4 Reader: Multi-Table Data Reading ```java public class MultiTableSourceReader { private final Map readers; private final Queue readOrder; // Round-robin queue @Override public void pollNext(Collector output) { if (readOrder.isEmpty()) { return; } // Round-robin read from tables TablePath currentTable = readOrder.poll(); SourceReader reader = readers.get(currentTable); // Read from current table reader.pollNext(new Collector() { @Override public void collect(SeaTunnelRow row) { // Tag row with table path row.setTableId(currentTable.serialize()); output.collect(row); } }); // Re-add to queue for next round readOrder.offer(currentTable); } @Override public void addSplits(List splits) { // Route splits to correct table readers for (SplitT split : splits) { TablePath tablePath = extractTablePath(split); SourceReader reader = 
readers.get(tablePath); reader.addSplits(Collections.singletonList(split)); // Add table to read order if not present if (!readOrder.contains(tablePath)) { readOrder.offer(tablePath); } } } } ``` ## 4. MultiTableSink Architecture ### 4.1 Structure ```java public class MultiTableSink implements SeaTunnelSink { // Underlying sinks (one per table) private final Map sinks; // Number of writer replicas per table private final int replicaNum; // Input catalog tables private final List catalogTables; } ``` ### 4.2 Writer: Multi-Table Writing with Replicas ```java public class MultiTableSinkWriter implements SinkWriter { // Writers per table (multiple replicas per table) private final Map> writers; // Replica count per table private final int replicaNum; // Context private final int writerIndex; // This writer's global index @Override public void write(IN element) throws IOException { SeaTunnelRow row = (SeaTunnelRow) element; // 1. Determine target table TablePath tablePath = row.getTablePath(); // 2. Select replica for this table (load balancing) int replicaIndex = selectReplica(tablePath, row); // 3. Get writer for (table, replica) SinkIdentifier identifier = new SinkIdentifier( new TableIdentifier(tablePath), replicaIndex ); SinkWriter writer = writers.get(identifier); // 4. Write to selected writer writer.write(element); } private int selectReplica(TablePath tablePath, SeaTunnelRow row) { // If primary key is available, route stably by primary key hash. Optional primaryKey = extractPrimaryKeyIfPresent(row); if (primaryKey.isPresent()) { return Math.abs(primaryKey.get().hashCode()) % replicaNum; } // Otherwise, distribute across replicas (no stable routing guarantee). 
return (int) (System.nanoTime() % replicaNum); } @Override public Optional prepareCommit(long checkpointId) throws IOException { // Collect commit info from all writers List allCommitInfos = new ArrayList<>(); for (SinkWriter writer : writers.values()) { Optional commitInfo = writer.prepareCommit(checkpointId); commitInfo.ifPresent(allCommitInfos::add); } // Wrap in multi-table commit info return Optional.of((CommitInfoT) new MultiTableCommitInfo(allCommitInfos)); } @Override public List snapshotState(long checkpointId) throws IOException { // Snapshot all writers List allStates = new ArrayList<>(); for (Map.Entry entry : writers.entrySet()) { List states = entry.getValue().snapshotState(checkpointId); // Tag states with sink identifier for recovery for (StateT state : states) { allStates.add(wrapWithIdentifier(entry.getKey(), state)); } } return allStates; } } ``` ### 4.3 Committer: Multi-Table Commit Coordination ```java public class MultiTableSinkCommitter implements SinkCommitter { // Committers per table private final Map> committers; @Override public List commit(List commitInfos) throws IOException { List failed = new ArrayList<>(); // Group commit infos by table Map> groupedInfos = groupByTable(commitInfos); // Commit per table for (Map.Entry> entry : groupedInfos.entrySet()) { TablePath tablePath = entry.getKey(); List tableCommitInfos = entry.getValue(); SinkCommitter committer = committers.get(tablePath); // Commit for this table List tableFailed = committer.commit(tableCommitInfos); failed.addAll(tableFailed); } return failed; } private Map> groupByTable(List commitInfos) { Map> grouped = new HashMap<>(); for (CommitInfoT commitInfo : commitInfos) { TablePath tablePath = extractTablePath(commitInfo); grouped.computeIfAbsent(tablePath, k -> new ArrayList<>()).add(commitInfo); } return grouped; } } ``` ## 5. Replica Mechanism ### 5.1 Why Replicas? **Problem**: Single writer per table becomes bottleneck for high-throughput tables. 
**Solution**: Multiple replica writers per table for parallel writing. ``` Without Replicas: orders table (1000 writes/sec) → [Single Writer] → Bottleneck With Replicas (replicaNum=4): orders table (1000 writes/sec) → [Writer 0] (250 writes/sec) → [Writer 1] (250 writes/sec) → [Writer 2] (250 writes/sec) → [Writer 3] (250 writes/sec) ``` ### 5.2 Replica Configuration ```hocon sink { JDBC { url = "..." # Multi-table configuration multi_table_sink_replica = 4 # replicas per table (applies to all tables) } } ``` ### 5.3 Replica Selection Strategies **Hash-Based (when primary key is available)**: ```java // Ensures same primary key always goes to same replica (order preservation) int replica = Math.abs(primaryKey.hashCode()) % replicaNum; ``` **Random (when primary key is not available)**: ```java // Distributes load across replicas (no stable routing guarantee) int replica = (int) (System.nanoTime() % replicaNum); ``` ## 6. Schema Management in Multi-Table ### 6.1 Independent Schemas Each table maintains its own schema: ```java public class MultiTableSink { // Schema per table private final Map catalogTables; public CatalogTable getCatalogTable(TablePath tablePath) { return catalogTables.get(tablePath); } } ``` ### 6.2 Schema Evolution Routing ```java public class MultiTableSinkWriter { public void handleSchemaChange(SchemaChangeEvent event) { // Route schema change to correct table writer TablePath tablePath = event.getTableId().toTablePath(); // Apply to all replicas of this table for (int i = 0; i < replicaNum; i++) { SinkIdentifier identifier = new SinkIdentifier( new TableIdentifier(tablePath), i ); SinkWriter writer = writers.get(identifier); writer.applySchemaChange(event); } } } ``` ## 7. 
Data Flow Example ### 7.1 Full Pipeline ``` ┌──────────────────────────────────────────────────────────────┐ │ MySQL CDC Source │ │ • Captures changes from 100 tables │ │ • Tags each row with TablePath │ └──────────────────────────┬───────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────┐ │ SeaTunnelRow (with TablePath) │ │ tableId: "my_db.public.orders" │ │ fields: [1, "order-001", 99.99] │ └─────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────┐ │ MultiTableSinkWriter │ │ • Extracts TablePath from row │ │ • Selects replica (hash or random) │ │ • Routes to correct writer │ └──────────────────────────┬───────────────────────────────────┘ │ ┌──────────────────┼──────────────────┐ ▼ ▼ ▼ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ orders │ │ users │ │ products │ │ Writer 0 │ │ Writer 0 │ │ Writer 0 │ │ Writer 1 │ │ Writer 1 │ │ Writer 1 │ │ Writer 2 │ │ │ │ │ │ Writer 3 │ │ │ │ │ └──────────────┘ └──────────────┘ └──────────────┘ │ │ │ ▼ ▼ ▼ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ PostgreSQL │ │ PostgreSQL │ │ PostgreSQL │ │ orders │ │ users │ │ products │ └──────────────┘ └──────────────┘ └──────────────┘ ``` ### 7.2 Write Flow ```mermaid sequenceDiagram participant Source as MySQL CDC participant Writer as MultiTableSinkWriter participant OrderWriter as Order Writer (Replica 0) participant UserWriter as User Writer (Replica 0) participant PG as PostgreSQL Source->>Writer: Row(tableId="orders", data=[...]) Writer->>Writer: Extract TablePath("orders") Writer->>Writer: Select replica (hash) → 0 Writer->>OrderWriter: write(row) OrderWriter->>PG: INSERT INTO orders ... Source->>Writer: Row(tableId="users", data=[...]) Writer->>Writer: Extract TablePath("users") Writer->>Writer: Select replica (hash) → 0 Writer->>UserWriter: write(row) UserWriter->>PG: INSERT INTO users ... 
``` ### 7.3 Checkpoint Flow ```mermaid sequenceDiagram participant CP as CheckpointCoordinator participant Writer as MultiTableSinkWriter participant W1 as Order Writer 0 participant W2 as Order Writer 1 participant W3 as User Writer 0 CP->>Writer: triggerBarrier(checkpointId) Writer->>W1: prepareCommit(checkpointId) W1-->>Writer: CommitInfo(orders, replica=0) Writer->>W2: prepareCommit(checkpointId) W2-->>Writer: CommitInfo(orders, replica=1) Writer->>W3: prepareCommit(checkpointId) W3-->>Writer: CommitInfo(users, replica=0) Writer->>CP: ACK([CommitInfo1, CommitInfo2, CommitInfo3]) ``` ## 8. Performance Optimization ### 8.1 Replica Sizing **Rule of Thumb**: ``` replicaNum = ceil(Table Write Rate / Single Writer Throughput) Example: orders: 10,000 writes/sec Single writer: 2,500 writes/sec replicaNum = ceil(10,000 / 2,500) = 4 ``` ### 8.2 Table-Specific Replicas ```java // Future enhancement: different replicas per table Map replicaConfig = Map.of( TablePath.of("orders"), 4, // High-throughput table TablePath.of("users"), 2, // Medium-throughput TablePath.of("config"), 1 // Low-throughput ); ``` ### 8.3 Batch Writing ```java public class MultiTableSinkWriter { private final Map<SinkIdentifier, List<SeaTunnelRow>> buffers; private static final int BATCH_SIZE = 1000; @Override public void write(SeaTunnelRow row) { SinkIdentifier identifier = selectWriter(row); List<SeaTunnelRow> buffer = buffers.computeIfAbsent( identifier, k -> new ArrayList<>() ); buffer.add(row); if (buffer.size() >= BATCH_SIZE) { flushBuffer(identifier, buffer); } } } ``` ## 9.
Monitoring and Observability ### 9.1 Key Metrics **Per-Table Metrics**: - `table.{tableName}.records_written`: Records written per table - `table.{tableName}.bytes_written`: Bytes written per table - `table.{tableName}.write_latency`: Write latency per table **Per-Replica Metrics**: - `table.{tableName}.replica.{index}.records`: Records per replica - `table.{tableName}.replica.{index}.utilization`: Replica utilization **Global Metrics**: - `multitable.tables.total`: Total number of tables - `multitable.writers.total`: Total number of writers (tables × replicas) - `multitable.throughput`: Aggregate throughput ### 9.2 Monitoring Dashboard ``` Multi-Table Job: mysql-to-postgres Tables: 100 Writers: 400 (4 replicas per table) Throughput: 50,000 records/sec Top Tables by Throughput: 1. orders: 15,000 rec/sec (4 replicas) 2. events: 10,000 rec/sec (4 replicas) 3. users: 5,000 rec/sec (4 replicas) ... Replica Distribution: orders: Replica 0: 3,750 rec/sec (25%) Replica 1: 3,800 rec/sec (25.3%) Replica 2: 3,700 rec/sec (24.7%) Replica 3: 3,750 rec/sec (25%) ``` ## 10. Best Practices ### 10.1 Table Selection Table include/exclude patterns are connector-specific. Please refer to the specific Source connector documentation for the supported option keys and formats.
### 10.2 Replica Configuration **Start Conservative**: ```hocon sink { JDBC { # Start with 1 replica; increase only if the writer becomes a bottleneck multi_table_sink_replica = 1 } } ``` **Monitor and Tune**: ```bash # Check whether a single replica is the bottleneck # If write latency is high → increase replicas multi_table_sink_replica = 2 # Double capacity ``` ### 10.3 Schema Management **Pre-create Target Tables**: ```sql -- Better: pre-create all target tables CREATE TABLE orders (...); CREATE TABLE users (...); CREATE TABLE products (...); ``` **Enable Auto-Create (Carefully)**: ```hocon sink { JDBC { # Auto-create missing tables schema-evolution { enabled = true auto-create-table = true } } } ``` ### 10.4 Error Handling Error tolerance and retry policies are typically connector-specific. Avoid relying on undocumented `multi-table.*` option keys unless they are defined by the connector you use. ## 11. Limitations and Considerations ### 11.1 Current Limitations **Shared Parallelism**: - All tables share the same parallelism - Cannot set different parallelism per table **Fixed Replicas**: - Same replica count for all tables - High-throughput and low-throughput tables are treated equally **Memory Overhead**: - Each writer maintains a separate buffer - 100 tables × 4 replicas = 400 writers in memory ### 11.2 Workarounds **High-Throughput Tables**: ```hocon # Option 1: Separate job for hot tables job-1 { source { table-name = "orders" } } # Dedicated job job-2 { source { table-name = "user_.*|product_.*" } } # Rest ``` **Memory Optimization**: ```hocon # Reduce buffer size per writer sink { JDBC { batch-size = 500 # Smaller batches } } ``` ## 12. Future Enhancements ### 12.1 Dynamic Replicas Per-table replica overrides are not supported by the current `multi_table_sink_replica` option (it applies to all tables). Supporting per-table replicas would require additional connector/framework capabilities.
### 12.2 Adaptive Replicas ```java // Auto-adjust replicas based on throughput if (table.getWriteRate() > threshold) { increaseReplicas(table); } else if (table.getWriteRate() < lowThreshold) { decreaseReplicas(table); } ``` ## 13. Related Resources - [CatalogTable and Metadata](../api-design/catalog-table.md) - [Sink Architecture](../api-design/sink-architecture.md) - [DAG Execution](../engine/dag-execution.md) - [Schema Evolution](../../introduction/concepts/schema-evolution.md) ## 14. References ### Key Source Files - [MultiTableSink.java](../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/MultiTableSink.java) - [SinkIdentifier.java](../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SinkIdentifier.java) - [TablePath.java](../../../seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/TablePath.java) ### Example Implementations - MySQL CDC Source: `seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/` - JDBC Sink: `seatunnel-connectors-v2/connector-jdbc/` ================================================ FILE: docs/en/architecture/overview.md ================================================ --- sidebar_position: 1 title: Architecture Overview --- # SeaTunnel Architecture Overview ## 1. 
Introduction ### 1.1 Design Goals SeaTunnel is designed as a distributed multimodal data integration tool with the following core objectives: - **Engine Independence**: Decouple connector logic from execution engines, enabling the same connectors to run on SeaTunnel Engine (Zeta), Apache Flink, or Apache Spark - **High Performance**: Support large-scale data synchronization with ultra-high-performance throughput and low latency - **Fault Tolerance**: Provide exactly-once semantics through distributed snapshots and two-phase commit - **Ease of Use**: Offer simple configuration and a rich connector ecosystem - **Extensibility**: Plugin-based architecture allowing easy addition of new connectors and transforms ### 1.2 Target Use Cases - **Batch Data Synchronization**: Large-scale batch data migration between heterogeneous data sources - **Real-time Data Integration**: Stream data capture and synchronization with CDC support - **Data Lake/Warehouse Ingestion**: Efficient data loading to data lakes (Iceberg, Hudi, Delta Lake) and warehouses - **Multi-table Synchronization**: Synchronizing multiple tables in a single job with schema evolution support ## 2. Overall Architecture SeaTunnel adopts a layered architecture that separates concerns and enables flexibility: ``` ┌─────────────────────────────────────────────────────────────────┐ │ User Configuration Layer │ │ (HOCON Config / SQL / Web UI) │ └─────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────┐ │ SeaTunnel API Layer │ │ (Source API / Sink API / Transform API / Table API) │ │ │ │ • SeaTunnelSource • CatalogTable │ │ • SeaTunnelSink • TableSchema │ │ • SeaTunnelTransform • SchemaChangeEvent │ └─────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────┐ │ Connector Ecosystem │ │ │ │ [Jdbc] [Kafka] [MySQL-CDC] [Elasticsearch] [Iceberg] ... 
│ │ (Connector Ecosystem) │ └─────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────┐ │ Translation Layer │ │ (Adapts SeaTunnel API to Engine-Specific API) │ │ │ │ • FlinkSource/FlinkSink • SparkSource/SparkSink │ │ • Context Adapters • Serialization Adapters │ └─────────────────────────────────────────────────────────────────┘ │ ┌─────────────────────┼─────────────────────┐ ▼ ▼ ▼ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ SeaTunnel │ │ Apache │ │ Apache │ │ Engine (Zeta)│ │ Flink │ │ Spark │ │ │ │ │ │ │ │ • Master │ │ • JobManager │ │ • Driver │ │ • Worker │ │ • TaskManager│ │ • Executor │ │ • Checkpoint │ │ • State │ │ • RDD/DS │ └──────────────┘ └──────────────┘ └──────────────┘ ``` ### 2.1 Layer Responsibilities | Layer | Responsibility | Key Components | |-------|---------------|----------------| | **Configuration Layer** | Job definition, parameter configuration | HOCON parser, SQL parser, config validation | | **API Layer** | Unified abstraction for connectors | Source/Sink/Transform interfaces, CatalogTable | | **Connector Layer** | Data source/sink implementations | Various connectors (JDBC, Kafka, CDC, etc.) | | **Translation Layer** | Engine-specific adaptation | Flink/Spark adapters, context wrappers | | **Engine Layer** | Job execution and resource management | Scheduling, fault tolerance, state management | ## 3. Core Components ### 3.1 SeaTunnel API The API layer provides engine-independent abstractions: #### Source API - **SeaTunnelSource**: Factory interface for creating readers and enumerators - **SourceSplitEnumerator**: Master-side component for split generation and assignment - **SourceReader**: Worker-side component for reading data from splits - **SourceSplit**: Minimal serializable unit representing a data partition **Key Design**: Separation of coordination (Enumerator) and execution (Reader) enables efficient parallel processing and fault tolerance. 
**Code Reference**: - [seatunnel-api/.../SeaTunnelSource.java](../../seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SeaTunnelSource.java) - [seatunnel-api/.../SourceSplitEnumerator.java](../../seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SourceSplitEnumerator.java) #### Sink API - **SeaTunnelSink**: Factory interface for creating writers and committers - **SinkWriter**: Worker-side component for writing data - **SinkCommitter**: Coordinator for commit operations from multiple writers - **SinkAggregatedCommitter**: Global coordinator for aggregated commits **Key Design**: Two-phase commit protocol (prepareCommit → commit) ensures exactly-once semantics. **Code Reference**: - [seatunnel-api/.../SeaTunnelSink.java](../../seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SeaTunnelSink.java) - [seatunnel-api/.../SinkWriter.java](../../seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SinkWriter.java) #### Transform API - **SeaTunnelTransform**: Data transformation interface - **SeaTunnelMapTransform**: 1:1 transformation - **SeaTunnelFlatMapTransform**: 1:N transformation **Code Reference**: - [seatunnel-api/.../SeaTunnelTransform.java](../../seatunnel-api/src/main/java/org/apache/seatunnel/api/transform/SeaTunnelTransform.java) #### Table API - **CatalogTable**: Complete table metadata (schema, partition keys, options) - **TableSchema**: Schema definition (columns, primary key, constraints) - **SchemaChangeEvent**: Represents DDL changes for schema evolution **Code Reference**: - [seatunnel-api/.../CatalogTable.java](../../seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/CatalogTable.java) ### 3.2 SeaTunnel Engine (Zeta) The native execution engine provides: #### Master Components - **CoordinatorService**: Manages all running JobMasters - **JobMaster**: Manages single job lifecycle, generates physical plans, coordinates checkpoints - **CheckpointCoordinator**: Coordinates distributed snapshots per pipeline - 
**ResourceManager**: Manages worker resources and slot allocation #### Worker Components - **TaskExecutionService**: Deploys and executes tasks - **SeaTunnelTask**: Executes Source/Transform/Sink logic - **FlowLifeCycle**: Manages lifecycle of Source/Transform/Sink components #### Execution Model ``` LogicalDag → PhysicalPlan → SubPlan (Pipeline) → PhysicalVertex → TaskGroup → SeaTunnelTask ``` **Code Reference**: - [seatunnel-engine/.../server/CoordinatorService.java](../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/CoordinatorService.java) - [seatunnel-engine/.../server/master/JobMaster.java](../../seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/master/JobMaster.java) ### 3.3 Translation Layer Enables engine portability through adapter pattern: - **FlinkSource/FlinkSink**: Adapts SeaTunnel API to Flink's Source/Sink interfaces - **SparkSource/SparkSink**: Adapts SeaTunnel API to Spark's RDD/Dataset interfaces - **Context Adapters**: Wraps engine-specific contexts (SourceReaderContext, SinkWriterContext) - **Serialization Adapters**: Bridges SeaTunnel and engine serialization mechanisms **Code Reference**: - [seatunnel-translation/.../flink/source/FlinkSource.java](../../seatunnel-translation/seatunnel-translation-flink/seatunnel-translation-flink-common/src/main/java/org/apache/seatunnel/translation/flink/source/FlinkSource.java) ### 3.4 Connector Ecosystem All connectors follow a standardized structure: ``` connector-[name]/ ├── src/main/java/.../ │ ├── [Name]Source.java # Implements SeaTunnelSource │ ├── [Name]SourceReader.java # Implements SourceReader │ ├── [Name]SourceSplitEnumerator.java │ ├── [Name]SourceSplit.java │ ├── [Name]Sink.java # Implements SeaTunnelSink │ ├── [Name]SinkWriter.java # Implements SinkWriter │ └── config/[Name]Config.java └── src/main/resources/META-INF/services/ ├── org.apache.seatunnel.api.table.factory.TableSourceFactory └── 
org.apache.seatunnel.api.table.factory.TableSinkFactory ``` **Discovery Mechanism**: Java SPI (Service Provider Interface) for dynamic connector loading. ## 4. Data Flow Model ### 4.1 Source Data Flow ``` Data Source │ ▼ ┌─────────────────────┐ │ SourceSplitEnumerator│ (Master Side) │ • Generate Splits │ │ • Assign to Readers │ └─────────────────────┘ │ (Split Assignment) ▼ ┌─────────────────────┐ │ SourceReader │ (Worker Side) │ • Read from Split │ │ • Emit Records │ └─────────────────────┘ │ ▼ SeaTunnelRow │ ▼ Transform Chain (Optional) │ ▼ SeaTunnelRow │ ▼ ┌─────────────────────┐ │ SinkWriter │ (Worker Side) │ • Buffer Records │ │ • Prepare Commit │ └─────────────────────┘ │ (CommitInfo) ▼ ┌─────────────────────┐ │ SinkCommitter │ (Coordinator) │ • Commit Changes │ └─────────────────────┘ │ ▼ Data Sink ``` ### 4.2 Split-based Parallelism - Data sources are divided into **Splits** (e.g., file blocks, database partitions, Kafka partitions) - Each **SourceReader** processes one or more splits independently - Dynamic split assignment enables load balancing and fault recovery - Split state is checkpointed for exactly-once processing ### 4.3 Pipeline Execution Jobs are divided into **Pipelines** (SubPlans): ``` Pipeline 1: [Source A] → [Transform 1] → [Sink A] ↓ Pipeline 2: [Source B] ───────→ [Transform 2] → [Sink B] ``` Each pipeline: - Has independent parallelism configuration - Maintains its own checkpoint coordinator - Can execute concurrently or sequentially ## 5. 
Job Execution Flow ### 5.1 Submission Phase ```mermaid sequenceDiagram participant Client participant CoordinatorService participant JobMaster participant ResourceManager Client->>CoordinatorService: Submit Job Config CoordinatorService->>CoordinatorService: Parse Config → LogicalDag CoordinatorService->>JobMaster: Create JobMaster JobMaster->>JobMaster: Generate PhysicalPlan JobMaster->>ResourceManager: Request Resources ResourceManager->>JobMaster: Allocate Slots JobMaster->>TaskExecutionService: Deploy Tasks ``` ### 5.2 Execution Phase 1. **Task Initialization** - Deploy tasks to allocated slots - Initialize Source/Transform/Sink components - Restore state from checkpoint (if recovering) 2. **Data Processing** - SourceReader pulls data from splits - Data flows through transform chain - SinkWriter buffers and writes data 3. **Checkpoint Coordination** - CheckpointCoordinator triggers checkpoint - Checkpoint barriers flow through data pipeline - Tasks snapshot their state - Coordinator collects acknowledgements 4. **Commit Phase** - SinkWriter prepares commit information - SinkCommitter coordinates commits - State persisted to checkpoint storage ### 5.3 State Machine **Task State Transitions**: ``` CREATED → INIT → WAITING_RESTORE → READY_START → STARTING → RUNNING ↓ FAILED ← ─────────────────────── → PREPARE_CLOSE → CLOSED ↓ CANCELED ``` **Job State Transitions**: ``` CREATED → SCHEDULED → RUNNING → FINISHED ↓ ↓ FAILED CANCELING → CANCELED ``` ## 6. 
Key Features ### 6.1 Fault Tolerance **Checkpoint Mechanism**: - Distributed snapshots inspired by Chandy-Lamport algorithm - Checkpoint barriers propagate through data streams - State stored in pluggable checkpoint storage (HDFS, S3, local) - Automatic recovery from latest successful checkpoint **Failover Strategy**: - Task-level failover: Restart failed task and related pipeline - Region-based failover: Minimize impact on unaffected tasks - Split reassignment: Failed splits redistributed to healthy workers ### 6.2 Exactly-Once Semantics **Two-Phase Commit Protocol**: 1. **Prepare Phase**: SinkWriter prepares commit info during checkpoint 2. **Commit Phase**: SinkCommitter commits after checkpoint completes 3. **Abort Handling**: Roll back on failure before commit **Idempotency**: SinkCommitter operations must be idempotent to handle retries ### 6.3 Dynamic Resource Management - **Slot-based Allocation**: Fine-grained resource management - **Tag-based Filtering**: Assign tasks to specific worker groups - **Load Balancing**: Multiple strategies (random, slot ratio, system load) - **Dynamic Scaling**: Add/remove workers without job restart (future) ### 6.4 Schema Evolution - **DDL Propagation**: Capture schema changes from source (ADD/DROP/MODIFY columns) - **Schema Mapping**: Transform schema changes through pipeline - **Dynamic Application**: Apply schema changes to sink tables - **Compatibility Checks**: Validate schema changes before application ### 6.5 Multi-Table Support - **Single Job, Multiple Tables**: Synchronize hundreds of tables in one job - **Table Routing**: Route records to correct sink based on TablePath - **Independent Schemas**: Each table maintains its own schema - **Replica Support**: Multiple writer replicas per table for higher throughput ## 7. 
Module Structure ``` seatunnel/ ├── seatunnel-api/ # Core API definitions │ ├── source/ # Source API │ ├── sink/ # Sink API │ ├── transform/ # Transform API │ └── table/ # Table and Schema API │ ├── seatunnel-connectors-v2/ # Connector implementations │ ├── connector-jdbc/ # JDBC connector │ ├── connector-kafka/ # Kafka connector │ ├── connector-cdc-mysql/ # MySQL CDC connector │ └── ... # connectors │ ├── seatunnel-transforms-v2/ # Transform implementations │ ├── transform-sql/ # SQL transform │ ├── transform-filter/ # Filter transform │ └── ... │ ├── seatunnel-engine/ # SeaTunnel Engine (Zeta) │ ├── seatunnel-engine-core/ # Core execution logic │ ├── seatunnel-engine-server/ # Server components (Master/Worker) │ └── seatunnel-engine-storage/ # Checkpoint storage │ ├── seatunnel-translation/ # Engine translation layers │ ├── seatunnel-translation-flink/ │ └── seatunnel-translation-spark/ │ ├── seatunnel-formats/ # Data format handlers │ ├── seatunnel-format-json/ │ ├── seatunnel-format-avro/ │ └── ... │ ├── seatunnel-core/ # Job submission and CLI └── seatunnel-e2e/ # End-to-end tests ``` ## 8. 
Design Principles ### 8.1 Separation of Concerns - **API vs Implementation**: Clean API boundaries enable multiple implementations - **Coordination vs Execution**: Enumerator/Committer (master) separate from Reader/Writer (worker) - **Logical vs Physical**: LogicalDag (user intent) separate from PhysicalPlan (execution details) ### 8.2 Plugin Architecture - **SPI-based Discovery**: Connectors loaded dynamically via Java SPI - **Class Loader Isolation**: Each connector uses isolated class loader - **Hot Pluggable**: Add connectors without rebuilding core ### 8.3 Engine Independence - **Unified API**: Same connector code runs on any engine - **Translation Layer**: Adapts API to engine specifics - **No Engine Leakage**: Connector developers don't need engine knowledge ### 8.4 Scalability - **Horizontal Scaling**: Add workers to increase throughput - **Split-based Parallelism**: Fine-grained parallel processing - **Stateless Workers**: Workers can be added/removed dynamically ### 8.5 Reliability - **Distributed Checkpoints**: Consistent snapshots across distributed tasks - **Incremental State**: Optimize checkpoint size for large state - **Exactly-Once Guarantee**: End-to-end consistency ## 9. Next Steps To dive deeper into specific architectural components: - [Design Philosophy](design-philosophy.md) - Core design principles and trade-offs - [Source Architecture](api-design/source-architecture.md) - Deep dive into Source API design - [Sink Architecture](api-design/sink-architecture.md) - Deep dive into Sink API design - [Engine Architecture](engine/engine-architecture.md) - SeaTunnel Engine internals - [Checkpoint Mechanism](fault-tolerance/checkpoint-mechanism.md) - Fault tolerance implementation For practical guides: - [How to Create Your Connector](../developer/how-to-create-your-connector.md) - [Quick Start](../getting-started/locally/quick-start-seatunnel-engine.md) ## 10. 
References ### 10.1 Related Concepts - [Apache Flink](https://flink.apache.org/) - Inspiration for checkpoint and state management - [Apache Kafka](https://kafka.apache.org/) - Consumer group model influenced split assignment - [Chandy-Lamport Algorithm](https://en.wikipedia.org/wiki/Chandy-Lamport_algorithm) - Distributed snapshot algorithm ================================================ FILE: docs/en/connectors/changelog/connector-activemq.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][connector][activemq] Remove duplicate dependencies (#8753)|https://github.com/apache/seatunnel/commit/da6241aa1c|2.3.10| |[improve] update activemq connector config option (#8580)|https://github.com/apache/seatunnel/commit/629f85b23a|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |Bump org.apache.activemq:activemq-client (#7323)|https://github.com/apache/seatunnel/commit/e23e3ac4ed|2.3.7| |[Feature] [Activemq] Added activemq sink (#7251)|https://github.com/apache/seatunnel/commit/f0cefbeb4a|2.3.7|
================================================ FILE: docs/en/connectors/changelog/connector-aerospike.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Chore] fix typos filed -> field (#9757)|https://github.com/apache/seatunnel/commit/e3e1c67d29|2.3.12| |[Feature][Connector-V2] Add aerospike sink connector (#8821)|https://github.com/apache/seatunnel/commit/68ebf15cf6|2.3.11|
================================================ FILE: docs/en/connectors/changelog/connector-amazondynamodb.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][Core] Unify the aws-sdk-v2 version to 2.31.30 (#9698)|https://github.com/apache/seatunnel/commit/41c251cc8a|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix] Fix error log name for SourceSplitEnumerator implements class (#8817)|https://github.com/apache/seatunnel/commit/55ed90ecaf|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[improve] update amazondynamodb connector (#8601)|https://github.com/apache/seatunnel/commit/a69efca0fd|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Feature][Core] Upgrade flink source translation (#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Code clean for AmazonDynamoDB connector (#5791)|https://github.com/apache/seatunnel/commit/a17dd7afc1|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[connector-v2] add amazondynamicdb source split (#5275)|https://github.com/apache/seatunnel/commit/740c14422d|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve]Remove scheduler in Dynamodb sink (#5248)|https://github.com/apache/seatunnel/commit/9e033a824e|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Bugfix][AmazonDynamoDB] Fix the problem that all table data cannot be obtained (#5146)|https://github.com/apache/seatunnel/commit/09995159a0|2.3.3| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][API] env required option can not set default value (#3584)|https://github.com/apache/seatunnel/commit/c5a23024f6|2.3.0| |[Feature][Connector-V2][AmazonDynamoDB] Add Factory for AmazonDynamoDB (#3348)|https://github.com/apache/seatunnel/commit/a0068efdbf|2.3.0| |[Improve][Connector-V2][AmazonDynamoDB] Unified exception for AmazonDynamoDB source & sink connector (#3333)|https://github.com/apache/seatunnel/commit/17bc5adcef|2.3.0| |[Connector-V2] [Chore] Canonical name for AmazonDynamodb (#3321)|https://github.com/apache/seatunnel/commit/e216eb9a6b|2.3.0| |[Feature][Connector-V2] [Amazondynamodb Connector]add amazondynamodb source & sink connnector (#3166)|https://github.com/apache/seatunnel/commit/183bac02f0|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-amazonsqs.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][Core] Unify the aws-sdk-v2 version to 2.31.30 (#9698)|https://github.com/apache/seatunnel/commit/41c251cc8a|2.3.12| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[improve] amazon sqs connector update (#8602)|https://github.com/apache/seatunnel/commit/c747e02a98|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve] Remove all useless `prepare`, `getProducedType` method (#5741)|https://github.com/apache/seatunnel/commit/ed94fffbb9|2.3.4| |[Improve][Connector-V2] Change `amazonsqs` to `AmazonSqs` as connector identifier (#5742)|https://github.com/apache/seatunnel/commit/245705d0f7|2.3.4| |[Feature] [Connector-V2] Add connector amazonsqs (#5367)|https://github.com/apache/seatunnel/commit/7f75a8eafd|2.3.4|
================================================ FILE: docs/en/connectors/changelog/connector-assert.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][API] Add metadata schema into catalog table (#9586)|https://github.com/apache/seatunnel/commit/385814e7f1|2.3.12| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[improve] add assert options (#8620)|https://github.com/apache/seatunnel/commit/b159cc0c75|2.3.10| |[Feature][API] Support timestamp with timezone offset (#8367)|https://github.com/apache/seatunnel/commit/e18bfeabd2|2.3.9| |[fix][connector-v2][connector-assert] Optimize Assert Sink verification method (#8356)|https://github.com/apache/seatunnel/commit/5c9159d7cd|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Feature][Transform-V2] Support transform with multi-table (#7628)|https://github.com/apache/seatunnel/commit/72c9c4576d|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Fix][API] Fix column length can not be long (#8039)|https://github.com/apache/seatunnel/commit/16cf632d3e|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Connector-V2] Assert support multi-table check (#7687)|https://github.com/apache/seatunnel/commit/c4778a2497|2.3.8| |[Feature][Transform] Add embedding transform (#7534)|https://github.com/apache/seatunnel/commit/3310cfcd34|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Hotfix] fix http source can not 
read yyyy-MM-dd HH:mm:ss format bug & Improve DateTime Utils (#6601)|https://github.com/apache/seatunnel/commit/19888e7969|2.3.5| |[Feature][Connector-V2][Assert] Support field type assert and field value equality assert for full data types (#6275)|https://github.com/apache/seatunnel/commit/576919bfab|2.3.4| |[Feature][Connector-V2][Assert] Support check the precision and scale of Decimal type. (#6110)|https://github.com/apache/seatunnel/commit/dd64ed52d4|2.3.4| |[Hotfix][SQL Transform] Fix cast to timestamp, date, time bug (#5812)|https://github.com/apache/seatunnel/commit/de181de02a|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve] Add default implement for `SeaTunnelSink::setTypeInfo` (#5682)|https://github.com/apache/seatunnel/commit/86cba87450|2.3.4| |[Fix] Fix log error when multi-table sink close (#5683)|https://github.com/apache/seatunnel/commit/fea4b6f268|2.3.4| |Support config tableIdentifier for schema (#5628)|https://github.com/apache/seatunnel/commit/652921fb75|2.3.4| |[Feature] Add `table-names` from FakeSource/Assert to produce/assert multi-table (#5604)|https://github.com/apache/seatunnel/commit/2c67cd8f3e|2.3.4| |[Improve] Remove useless ReadonlyConfig flatten feature (#5612)|https://github.com/apache/seatunnel/commit/243edfef3d|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve][connector-assert]support 'DECIMAL' type and fix 'Number' type precision issue (#5479)|https://github.com/apache/seatunnel/commit/d308e27733|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Feature][Transform] Add SimpleSQL transform plugin (#4148)|https://github.com/apache/seatunnel/commit/b914d49abf|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Assert] Unified exception for assert connector (#3331)|https://github.com/apache/seatunnel/commit/e74c9bc6fd|2.3.0| |[improve][connector] The Factory#factoryIdentifier must be consistent with PluginIdentifierInterface#getPluginName (#3328)|https://github.com/apache/seatunnel/commit/d9519d696a|2.3.0| |[Improve][Connector-V2] Add Clickhouse and Assert Source/Sink Factory (#3306)|https://github.com/apache/seatunnel/commit/9e4a128381|2.3.0| |[Feature][Connector-v2] improve assert sink connector (#2844)|https://github.com/apache/seatunnel/commit/967fec0e93|2.3.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[improve][UT] Upgrade junit to 5.+ (#2305)|https://github.com/apache/seatunnel/commit/362319ff3e|2.2.0-beta| |[checkstyle] Improved validation scope of MagicNumber (#2194)|https://github.com/apache/seatunnel/commit/6d08b5f369|2.2.0-beta| |[API-DRAFT] [MERGE] update license and pom.xml|https://github.com/apache/seatunnel/commit/5ae8865b7c|2.2.0-beta| |add assert sink to Api draft (#2071)|https://github.com/apache/seatunnel/commit/fc640b52bd|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-cassandra.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] cassandra connector options (#8608)|https://github.com/apache/seatunnel/commit/d9201108cf|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Improve some connectors prepare check error message (#7465)|https://github.com/apache/seatunnel/commit/6930a25edd|2.3.8| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Feature][Connector V2] expose configurable options in Cassandra (#3681)|https://github.com/apache/seatunnel/commit/73f63a5044|2.3.2| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Improve][Connector-V2] The log outputs detailed exception stack information (#3805)|https://github.com/apache/seatunnel/commit/d0c6217f27|2.3.1| |[Improve][Connector-V2][Cassandra] Unified exception for cassandra source & sink connector (#3435)|https://github.com/apache/seatunnel/commit/28868797b7|2.3.0| |[Feature][Connector-V2][Cassandra] Add Cassandra Source And Sink Connector (#3229)|https://github.com/apache/seatunnel/commit/12268a6f4b|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-cdc-base.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][MySQL CDC] MySQL cdc support start by time (#9735)|https://github.com/apache/seatunnel/commit/b6c5d941b0|2.3.12| |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[Improve][API] Add metadata schema into catalog table (#9586)|https://github.com/apache/seatunnel/commit/385814e7f1|2.3.12| |[Feature][Connectors-v2] Optimize the size of CDC JAR Files (#9546)|https://github.com/apache/seatunnel/commit/1dd19c6823|2.3.12| |[Fix][Connector-V2] Update catalog table schema of debezium json (#9525)|https://github.com/apache/seatunnel/commit/10cb84435b|2.3.12| |[Improve][Oracle-CDC] Fix oracle rename ddl event missing column type (#9314)|https://github.com/apache/seatunnel/commit/11a23af64c|2.3.11| |[Fix][JDBC] fix jdbc default connection parameter invalid (#8185)|https://github.com/apache/seatunnel/commit/f85eb78b37|2.3.11| |[Improve][CDC] Extract duplicate code (#8906)|https://github.com/apache/seatunnel/commit/b922bb90e6|2.3.10| |[Improve][CDC] Filter heartbeat event (#8569)|https://github.com/apache/seatunnel/commit/1870653393|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Fix][MySQL-CDC]fix recovery task failure caused by binlog deletion (#8587)|https://github.com/apache/seatunnel/commit/087087e592|2.3.10| |[Feature] [Postgre CDC]support array type (#8560)|https://github.com/apache/seatunnel/commit/021af147cc|2.3.10| |[Feature][MySQL-CDC] Support database/table wildcards scan read (#8323)|https://github.com/apache/seatunnel/commit/2116843ce8|2.3.9| |[Feature][CDC] Add 'schema-changes.enabled' options (#8285)|https://github.com/apache/seatunnel/commit/8e29ecf54f|2.3.9| |Revert "[Feature][Redis] Flush data when the time reaches checkpoint interval" and "[Feature][CDC] Add 'schema-changes.enabled' options" 
(#8278)|https://github.com/apache/seatunnel/commit/fcb2938286|2.3.9| |[Feature][CDC] Add 'schema-changes.enabled' options (#8252)|https://github.com/apache/seatunnel/commit/d783f9447c|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][Connector-V2] Add pre-check for table enable cdc (#8152)|https://github.com/apache/seatunnel/commit/9a5da78176|2.3.9| |[Feature][Connector-V2]Jdbc chunk split add snapshotSplitColumn config #7794 (#7840)|https://github.com/apache/seatunnel/commit/b6c6dc0438|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Feature][Transform-v2] Add metadata transform (#7899)|https://github.com/apache/seatunnel/commit/699d16552a|2.3.9| |[Feature][Connector-v2] Support schema evolution for Oracle connector (#7908)|https://github.com/apache/seatunnel/commit/79406bcc2f|2.3.9| |[Fix][Connector-V2] Fix cdc use default value when value is null (#7950)|https://github.com/apache/seatunnel/commit/3b432125ae|2.3.9| |[Hotfix][CDC] Fix occasional database connection leak when read snapshot split (#7918)|https://github.com/apache/seatunnel/commit/a8d0d4ce77|2.3.9| |[Fix][Connector-V2] Fix some throwable error not be caught (#7657)|https://github.com/apache/seatunnel/commit/e19d73282e|2.3.8| |[Improve][Connector-V2] Close all ResultSet after used (#7389)|https://github.com/apache/seatunnel/commit/853e973212|2.3.8| |[Feature][Connector-V2] Support jdbc hana catalog and type convertor (#6950)|https://github.com/apache/seatunnel/commit/d663398739|2.3.6| |[Fix][Connector-V2][CDC] SeaTunnelRowDebeziumDeserializationConverters NPE (#7119)|https://github.com/apache/seatunnel/commit/ae81879213|2.3.6| |[Improve][Connector-V2] Support schema evolution for mysql-cdc and mysql-jdbc (#6929)|https://github.com/apache/seatunnel/commit/cf91e51fc7|2.3.6| |[Hotfix][CDC] Fix split schema change stream 
(#7003)|https://github.com/apache/seatunnel/commit/0c3044e3f6|2.3.6| |[Improve][CDC] Bump the version of debezium to 1.9.8.Final (#6740)|https://github.com/apache/seatunnel/commit/c3ac953524|2.3.6| |[Improve][CDC] Close idle subtasks gorup(reader/writer) in increment phase (#6526)|https://github.com/apache/seatunnel/commit/454c339b9c|2.3.6| |[Improve][JDBC Source] Fix Split can not be cancel (#6825)|https://github.com/apache/seatunnel/commit/ee3b7c3723|2.3.6| |[Hotfix][Postgres-CDC/OpenGauss-CDC] Fix read data missing when restore (#6785)|https://github.com/apache/seatunnel/commit/67c32607e7|2.3.6| |[Hotfix][Jdbc/CDC] Fix postgresql uuid type in jdbc read (#6684)|https://github.com/apache/seatunnel/commit/868ba4d7c7|2.3.6| |[Chore] remove useless interface (#6746)|https://github.com/apache/seatunnel/commit/3c1aeb3785|2.3.6| |[Feature] Support listening for message delayed events in cdc source (#6634)|https://github.com/apache/seatunnel/commit/01159ec923|2.3.5| |[Improve][CDC] Optimize split state memory allocation in increment phase (#6554)|https://github.com/apache/seatunnel/commit/fe33422161|2.3.5| |[Improve][CDC] Improve read performance when record not contains schema field (#6571)|https://github.com/apache/seatunnel/commit/e60beb28ec|2.3.5| |[Feature][Core] Support event listener for job (#6419)|https://github.com/apache/seatunnel/commit/831d0022eb|2.3.5| |[Improve][CDC] Optimize memory allocation for snapshot split reading (#6281)|https://github.com/apache/seatunnel/commit/4856645837|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Feature] Supports iceberg sink #6198 (#6265)|https://github.com/apache/seatunnel/commit/18d3e86194|2.3.5| |[Bugfix][cdc base] Fix negative values in CDCRecordEmitDelay metric (#6259)|https://github.com/apache/seatunnel/commit/68978dbb4e|2.3.4| |[BugFix][CDC Base] Fix added columns cannot be parsed after job restore 
(#6118)|https://github.com/apache/seatunnel/commit/0c593a39e3|2.3.4| |[Feature][JDBC、CDC] Support Short and Byte Type in spliter (#6027)|https://github.com/apache/seatunnel/commit/6f8d0a5040|2.3.4| |[Improve][CDC] Disable exactly_once by default to improve stability (#6244)|https://github.com/apache/seatunnel/commit/f47495554b|2.3.4| |[Bugfix][JDBC、CDC] Fix Spliter Error in Case of Extensive Duplicate Data (#6026)|https://github.com/apache/seatunnel/commit/635c24e8b2|2.3.4| | [Feature][Connector-V2][Postgres-cdc]Support for Postgres cdc (#5986)|https://github.com/apache/seatunnel/commit/97438b9402|2.3.4| |[Bugfix][CDC Base] Fix NPE caused by adding a table for restore job (#6145)|https://github.com/apache/seatunnel/commit/8d3f8e4627|2.3.4| |[Feature][CDC] Support custom table primary key (#6106)|https://github.com/apache/seatunnel/commit/1312a1dd27|2.3.4| |[Bugfix][CDC base] Fix CDC job cannot consume incremental data After restore run (#625) (#6094)|https://github.com/apache/seatunnel/commit/37567ebb7e|2.3.4| |[Feature][CDC] Support read no primary key table (#6098)|https://github.com/apache/seatunnel/commit/b42d78de3f|2.3.4| |[Improve][CDC] Disable memory buffering when `exactly_once` is turned off (#6017)|https://github.com/apache/seatunnel/commit/300a624c5b|2.3.4| |[Improve][Zeta] Remove assert key words (#5947)|https://github.com/apache/seatunnel/commit/dcb4549109|2.3.4| |[Bug][CDC] Fix state recovery error when switching a single table to multiple tables (#5784)|https://github.com/apache/seatunnel/commit/37fcff347e|2.3.4| |[Fix] Fix MultiTableSink restore failed when add new table (#5746)|https://github.com/apache/seatunnel/commit/21503bd771|2.3.4| |[improve][mysql-cdc] Optimize the default value range of mysql server-id to reduce conflicts. 
(#5550)|https://github.com/apache/seatunnel/commit/5174639463|2.3.4| |[Improve] Add default implement for `SeaTunnelSource::getProducedType` (#5670)|https://github.com/apache/seatunnel/commit/a04add6991|2.3.4| |[Improve][Pom] Add junit4 to the root pom (#5611)|https://github.com/apache/seatunnel/commit/7b4f7db2a2|2.3.4| |[Hotfix][CDC] Fix thread-unsafe collection container in cdc enumerator (#5614)|https://github.com/apache/seatunnel/commit/b2f70fd40b|2.3.4| |[Improve][CDC] Use Source to output the CatalogTable (#5626)|https://github.com/apache/seatunnel/commit/3e6a20acfa|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Fix]: fix the cdc bug about NPE when the original table deletes a field (#5579)|https://github.com/apache/seatunnel/commit/f5ed47795d|2.3.4| |[Improve] Refactor CatalogTable and add `SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[Feature][CDC] Support for preferring numeric fields as split keys (#5384)|https://github.com/apache/seatunnel/commit/c687050d88|2.3.4| |[Feature][Connector-V2][CDC] Support flink running cdc job (#4918)|https://github.com/apache/seatunnel/commit/5e378831ee|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Imporve] [CDC Base] Add a fast sampling method that supports character types (#5179)|https://github.com/apache/seatunnel/commit/c0422dbfeb|2.3.3| |[Bugfix][cdc] Fix mysql bit column to java byte (#4817)|https://github.com/apache/seatunnel/commit/aae3e913d0|2.3.3| |[Feature][CDC][Zeta] Support schema evolution framework(DDL) (#5125)|https://github.com/apache/seatunnel/commit/4f89c1d272|2.3.3| |[Improve][CDC] support exactly-once of cdc and fix the BinlogOffset comparing bug (#5057)|https://github.com/apache/seatunnel/commit/0e4190ab2e|2.3.3| |[Hotfix][MongodbCDC]Refine data format to adapt to universal logic (#5162)|https://github.com/apache/seatunnel/commit/4b4b5f9640|2.3.3| |[Feature][Connector-V2][CDC] Support string type shard fields. (#5147)|https://github.com/apache/seatunnel/commit/e1be9d7f8a|2.3.3| |[Feature][CDC] Support tables without primary keys (with unique keys) (#163) (#5150)|https://github.com/apache/seatunnel/commit/32b7f2b690|2.3.3| |[Feature][connector-v2][mongodbcdc]Support source mongodb cdc (#4923)|https://github.com/apache/seatunnel/commit/d729fcba4c|2.3.3| |[Chore] Modify repeat des (#5088)|https://github.com/apache/seatunnel/commit/936afc2a9e|2.3.3| |[Feature][Connector-V2][cdc] Change the time zone to the default time zone (#5030)|https://github.com/apache/seatunnel/commit/3cff923a79|2.3.3| |[Bugfix][zeta] Fix cdc connection does not close (#4922)|https://github.com/apache/seatunnel/commit/a2d2f2dda8|2.3.3| |[Feature][CDC] Support disable/enable exactly once for INITIAL (#4921)|https://github.com/apache/seatunnel/commit/6d9a3e5957|2.3.3| |[Improve] Documentation and partial word optimization. 
(#4936)|https://github.com/apache/seatunnel/commit/6e8de0e2a6|2.3.3| |[Bugfix][zeta] Fix the deadlock issue with JDBC driver loading (#4878)|https://github.com/apache/seatunnel/commit/c30a2a1b1c|2.3.2| |[improve][CDC base] Implement Sample-based Sharding Strategy with Configurable Sampling Rate (#4856)|https://github.com/apache/seatunnel/commit/d827c700f0|2.3.2| |[Bugfix][CDC Base] Solving the ConcurrentModificationException caused by snapshotState being modified concurrently. (#4877)|https://github.com/apache/seatunnel/commit/9a2efa51c7|2.3.2| |[Hotfix][CDC] Fix chunk start/end parameter type error (#4777)|https://github.com/apache/seatunnel/commit/c13c031995|2.3.2| |[Bug][CDC] Fix TemporalConversions (#4542)|https://github.com/apache/seatunnel/commit/d2094bf2e1|2.3.2| |[Feature][CDC][SqlServer] Support multi-table read (#4377)|https://github.com/apache/seatunnel/commit/c4e3f2dc03|2.3.2| |[Improve][CDC] Improve startup.mode/stop.mode options (#4360)|https://github.com/apache/seatunnel/commit/b71d8739d5|2.3.1| |[Improve][CDC] Optimize options & add docs for compatible_debezium_json (#4351)|https://github.com/apache/seatunnel/commit/336f590498|2.3.1| |Update CDC StartupMode and StopMode option to SingleChoiceOption (#4357)|https://github.com/apache/seatunnel/commit/f60ac1a5e9|2.3.1| |[bugfix][cdc-base] Fix cdc base shutdown thread not cleared (#4327)|https://github.com/apache/seatunnel/commit/ac61409bd8|2.3.1| |[Feature][CDC] Support export debezium-json format to kafka (#4339)|https://github.com/apache/seatunnel/commit/5817ec07bf|2.3.1| |[Feature][CDC] Support add & dorp tables when restore cdc jobs (#4254)|https://github.com/apache/seatunnel/commit/add75d7d5d|2.3.1| |[Feature][CDC][Mysql] Support read database list (#4255)|https://github.com/apache/seatunnel/commit/3ca60c6fed|2.3.1| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[Improve] Support MySqlCatalog Use JDBC URL With Custom 
Suffix|https://github.com/apache/seatunnel/commit/210d0ff1f8|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Hotfix][Zeta] Fix shuffle checkpoint (#4224)|https://github.com/apache/seatunnel/commit/507ca85611|2.3.1| |[improve][cdc] support sharding-tables (#4207)|https://github.com/apache/seatunnel/commit/5c3f0c9b00|2.3.1| |[Hotfix][CDC] Fix multiple-table data read (#4200)|https://github.com/apache/seatunnel/commit/7f5671d2ce|2.3.1| |[hotfix][zeta] fix zeta multi-table parser error (#4193)|https://github.com/apache/seatunnel/commit/98f2ad0c19|2.3.1| |[Feature][Zeta] Support shuffle multiple rows by tableId (#4147)|https://github.com/apache/seatunnel/commit/8348f1a108|2.3.1| |[Feature][API] Add Metrics for Connector-V2 (#4017)|https://github.com/apache/seatunnel/commit/32e1f91c7a|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][CDC] MySQL CDC supports deserialization of multi-tables (#4067)|https://github.com/apache/seatunnel/commit/21ef45fcca|2.3.1| |fix cdc option rule error (#4018)|https://github.com/apache/seatunnel/commit/ea160429df|2.3.1| |[Bug][CDC] Fix concurrent modify of splits (#3937)|https://github.com/apache/seatunnel/commit/29b04e2405|2.3.1| |[Improve][CDC][base] Guaranteed to be exactly-once in the process of switching from SnapshotTask to IncrementalTask (#3837)|https://github.com/apache/seatunnel/commit/8379aaf876|2.3.1| |[Hotfix][SqlServer CDC] fix SqlServerCDC IT failure (#3807)|https://github.com/apache/seatunnel/commit/fd66de5f98|2.3.1| |[Improve][CDC] Add mysql-cdc source factory (#3791)|https://github.com/apache/seatunnel/commit/356538de8a|2.3.1| |[feature][connector-v2] add sqlServer CDC (#3686)|https://github.com/apache/seatunnel/commit/0f0afb58af|2.3.0| |[doc][connector][cdc] add MySQL CDC Source doc (#3707)|https://github.com/apache/seatunnel/commit/555905b0b8|2.3.0| |[feature][cdc] Fixed error in mysql cdc under real-time job (#3666)|https://github.com/apache/seatunnel/commit/2238fda300|2.3.0| |[feature][connector][cdc] add SeaTunnelRowDebeziumDeserializeSchema (#3499)|https://github.com/apache/seatunnel/commit/ff44db116e|2.3.0| |[feature][connector][mysql-cdc] add MySQL CDC enumerator (#3481)|https://github.com/apache/seatunnel/commit/ff4b32dc28|2.3.0| |[feature][connector] add mysql cdc reader (#3455)|https://github.com/apache/seatunnel/commit/ae981df675|2.3.0| |[feature][connector][cdc] add cdc reader jdbc related (#3433)|https://github.com/apache/seatunnel/commit/7bf00fb19f|2.3.0| |[feature][connector][cdc] add CDC enumerator base classes (#3419)|https://github.com/apache/seatunnel/commit/9b1821f476|2.3.0| |[feature][Connector-v2][cdc] Add cdc base reader (#3407)|https://github.com/apache/seatunnel/commit/e454b80dcd|2.3.0| |[bigfix][Connector-v2][cdc] move version to 1.6.4 
(#3389)|https://github.com/apache/seatunnel/commit/b50b543c3e|2.3.0| |[feature][connector][cdc] CDC base classes (#3363)|https://github.com/apache/seatunnel/commit/2586f305b4|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-cdc-mongodb.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[Feature][Connectors-v2] Optimize the size of CDC JAR Files (#9546)|https://github.com/apache/seatunnel/commit/1dd19c6823|2.3.12| |[Fix][Mongo-CDC] Fix the issue where mongo isExactlyOnce defaults to true, causing room to malfunction (#9454)|https://github.com/apache/seatunnel/commit/814b19537c|2.3.12| |[Fix] [Mongo-cdc] Fallback to timestamp startup mode when resume token has expired (#8754)|https://github.com/apache/seatunnel/commit/afc990d84e|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Mongodb-CDC] Support multi-table read (#8029)|https://github.com/apache/seatunnel/commit/49cbaeb9b3|2.3.9| |[Bug][connectors-v2] fix mongodb bson convert exception (#8044)|https://github.com/apache/seatunnel/commit/b222c13f2f|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Feature][Transform-v2] Add metadata transform (#7899)|https://github.com/apache/seatunnel/commit/699d16552a|2.3.9| |[Bug][Connector-v2] MongoDB CDC Set SeatunnelRow's tableId (#7935)|https://github.com/apache/seatunnel/commit/f3970d6188|2.3.9| |[Improve] Add conditional of start.mode with timestamp in mongo cdc option rule (#6770)|https://github.com/apache/seatunnel/commit/65ae7782c9|2.3.6| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Improve][CDC] Optimize memory allocation for snapshot split reading (#6281)|https://github.com/apache/seatunnel/commit/4856645837|2.3.5| |[Fix][Connector-V2] Fix mongodb cdc start up mode option values not right (#6338)|https://github.com/apache/seatunnel/commit/c07f56fbc4|2.3.5| |[Improve][Common] Introduce new 
error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Bug][CDC] Fix state recovery error when switching a single table to multiple tables (#5784)|https://github.com/apache/seatunnel/commit/37fcff347e|2.3.4| |[Improve][CDC] Clean unused code (#5785)|https://github.com/apache/seatunnel/commit/b5a66d3dbe|2.3.4| |[Dependency]Bump org.apache.avro:avro (#5583)|https://github.com/apache/seatunnel/commit/bb791a6d9e|2.3.4| |[Improve] Remove catalog tag for config file (#5645)|https://github.com/apache/seatunnel/commit/dc509aa080|2.3.4| |[Improve][Pom] Add junit4 to the root pom (#5611)|https://github.com/apache/seatunnel/commit/7b4f7db2a2|2.3.4| |[Feature][CDC] Support MongoDB CDC running on flink (#5644)|https://github.com/apache/seatunnel/commit/8c569b1541|2.3.4| |[Improve] Refactor CatalogTable and add `SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[BUG][Connector-V2][Mongo-cdc] Incremental data kind error in snapshot phase (#5184)|https://github.com/apache/seatunnel/commit/ead1c5fd8c|2.3.3| |[Hotfix]Fix array index anomalies caused by #5057 (#5195)|https://github.com/apache/seatunnel/commit/1c33429506|2.3.3| |[Hotfix][MongodbCDC]Refine data format to adapt to universal logic (#5162)|https://github.com/apache/seatunnel/commit/4b4b5f9640|2.3.3| |[Hotfix][Mongodb cdc] Solve startup resume token is negative (#5143)|https://github.com/apache/seatunnel/commit/e964c03dca|2.3.3| |[Hotfix]Fix mongodb cdc e2e instability (#5128)|https://github.com/apache/seatunnel/commit/6f30b29662|2.3.3| |[Feature][connector-v2][mongodbcdc]Support source mongodb cdc (#4923)|https://github.com/apache/seatunnel/commit/d729fcba4c|2.3.3|
================================================ FILE: docs/en/connectors/changelog/connector-cdc-mysql.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][MySQL CDC] MySQL cdc support start by time (#9735)|https://github.com/apache/seatunnel/commit/b6c5d941b0|2.3.12| |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[Feature][Connectors-v2] Support Mysql8.4+ for mysql-cdc (#9720)|https://github.com/apache/seatunnel/commit/e338743927|2.3.12| |[improve] jdbc options (#9541)|https://github.com/apache/seatunnel/commit/d041e5fb32|2.3.12| |[Feature][Connectors-v2] Optimize the size of CDC JAR Files (#9546)|https://github.com/apache/seatunnel/commit/1dd19c6823|2.3.12| |[Feature][Connector-V2] Jdbc mysql support read tinyint(1) to byte(tinyint) (#9373)|https://github.com/apache/seatunnel/commit/7b87aa6f12|2.3.12| |[Improve][CDC] Filter ddl for snapshot phase (#8911)|https://github.com/apache/seatunnel/commit/641cc72f2f|2.3.10| |[Improve][CDC] Extract duplicate code (#8906)|https://github.com/apache/seatunnel/commit/b922bb90e6|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Fix][mysql-cdc] Fix GTIDs on startup to correctly recover from checkpoint (#8528)|https://github.com/apache/seatunnel/commit/82e4096c08|2.3.10| |[Feature][MySQL-CDC] Support database/table wildcards scan read (#8323)|https://github.com/apache/seatunnel/commit/2116843ce8|2.3.9| |[Feature][Jdbc] Support sink ddl for postgresql (#8276)|https://github.com/apache/seatunnel/commit/353bbd21a1|2.3.9| |[Feature][CDC] Add 'schema-changes.enabled' options (#8285)|https://github.com/apache/seatunnel/commit/8e29ecf54f|2.3.9| |Revert "[Feature][Redis] Flush data when the time reaches checkpoint interval" and "[Feature][CDC] Add 'schema-changes.enabled' options" (#8278)|https://github.com/apache/seatunnel/commit/fcb2938286|2.3.9| |[Feature][CDC] Add 'schema-changes.enabled' options 
(#8252)|https://github.com/apache/seatunnel/commit/d783f9447c|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Connector-V2]Jdbc chunk split add snapshotSplitColumn config #7794 (#7840)|https://github.com/apache/seatunnel/commit/b6c6dc0438|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Feature][Connector-v2] Support schema evolution for Oracle connector (#7908)|https://github.com/apache/seatunnel/commit/79406bcc2f|2.3.9| |[Hotfix][CDC] Fix ddl duplicate execution error when config multi_table_sink_replica (#7634)|https://github.com/apache/seatunnel/commit/23ab3edbbb|2.3.8| |[Hotfix][CDC] Fix package name spelling mistake (#7415)|https://github.com/apache/seatunnel/commit/469112fa64|2.3.8| |[Hotfix][MySQL-CDC] Fix ArrayIndexOutOfBoundsException in mysql binlog read (#7381)|https://github.com/apache/seatunnel/commit/40c5f313eb|2.3.7| |[Improve][Connector-V2] Support schema evolution for mysql-cdc and mysql-jdbc (#6929)|https://github.com/apache/seatunnel/commit/cf91e51fc7|2.3.6| |[Hotfix][MySQL-CDC] Fix read gbk varchar chinese garbled characters (#7046)|https://github.com/apache/seatunnel/commit/4e4d2b8ee5|2.3.6| |[Improve][CDC] Bump the version of debezium to 1.9.8.Final (#6740)|https://github.com/apache/seatunnel/commit/c3ac953524|2.3.6| |[Improve][CDC] Close idle subtasks gorup(reader/writer) in increment phase (#6526)|https://github.com/apache/seatunnel/commit/454c339b9c|2.3.6| |[Improve][JDBC Source] Fix Split can not be cancel (#6825)|https://github.com/apache/seatunnel/commit/ee3b7c3723|2.3.6| |[Hotfix][Jdbc/CDC] Fix postgresql uuid type in jdbc read (#6684)|https://github.com/apache/seatunnel/commit/868ba4d7c7|2.3.6| |[Improve][mysql-cdc] Support mysql 5.5 versions (#6710)|https://github.com/apache/seatunnel/commit/058f5594a3|2.3.6| |[Improve][mysql-cdc] Fallback to desc table when show create table 
failed (#6701)|https://github.com/apache/seatunnel/commit/6f74663c08|2.3.6| |[Improve][Jdbc] Add quote identifier for sql (#6669)|https://github.com/apache/seatunnel/commit/849d748d3d|2.3.5| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Improve][CDC-Connector]Fix CDC option rule. (#6454)|https://github.com/apache/seatunnel/commit/1ea27afa87|2.3.5| |[Improve][CDC] Optimize memory allocation for snapshot split reading (#6281)|https://github.com/apache/seatunnel/commit/4856645837|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Feature][CDC] Support custom table primary key (#6106)|https://github.com/apache/seatunnel/commit/1312a1dd27|2.3.4| |[Feature][CDC] Support read no primary key table (#6098)|https://github.com/apache/seatunnel/commit/b42d78de3f|2.3.4| |[Bug][CDC] Fix state recovery error when switching a single table to multiple tables (#5784)|https://github.com/apache/seatunnel/commit/37fcff347e|2.3.4| |[Feature][formats][ogg] Support read ogg format message #4201 (#4225)|https://github.com/apache/seatunnel/commit/7728e241e8|2.3.4| |[Improve][CDC] Clean unused code (#5785)|https://github.com/apache/seatunnel/commit/b5a66d3dbe|2.3.4| |[Improve][Jdbc] Fix database identifier (#5756)|https://github.com/apache/seatunnel/commit/dbfc8a670a|2.3.4| |[improve][mysql-cdc] Optimize the default value range of mysql server-id to reduce conflicts. 
(#5550)|https://github.com/apache/seatunnel/commit/5174639463|2.3.4| |[Improve] Remove catalog tag for config file (#5645)|https://github.com/apache/seatunnel/commit/dc509aa080|2.3.4| |[Improve][Pom] Add junit4 to the root pom (#5611)|https://github.com/apache/seatunnel/commit/7b4f7db2a2|2.3.4| |[Improve] Refactor CatalogTable and add `SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[Improve][connector-cdc-mysql] avoid listing tables under unnecessary databases (#5365)|https://github.com/apache/seatunnel/commit/3e5d018b35|2.3.4| |[Improve][Docs] Refactor MySQL-CDC docs (#5302)|https://github.com/apache/seatunnel/commit/74530a0461|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Imporve] [CDC Base] Add a fast sampling method that supports character types (#5179)|https://github.com/apache/seatunnel/commit/c0422dbfeb|2.3.3| |[improve] [CDC Base] Add some split parameters to the optionRule (#5161)|https://github.com/apache/seatunnel/commit/94fd6755e6|2.3.3| |[Improve][CDC] support exactly-once of cdc and fix the BinlogOffset comparing bug (#5057)|https://github.com/apache/seatunnel/commit/0e4190ab2e|2.3.3| |[Feature][Connector-V2][CDC] Support string type shard fields. 
(#5147)|https://github.com/apache/seatunnel/commit/e1be9d7f8a|2.3.3| |[Feature][CDC] Support tables without primary keys (with unique keys) (#163) (#5150)|https://github.com/apache/seatunnel/commit/32b7f2b690|2.3.3| |[Feature][Connector-V2][mysql cdc] Conversion of tinyint(1) to bool is supported (#5105)|https://github.com/apache/seatunnel/commit/86b1b7e31a|2.3.3| |[Feature][connector-v2][mongodbcdc]Support source mongodb cdc (#4923)|https://github.com/apache/seatunnel/commit/d729fcba4c|2.3.3| |[Bugfix][connector-cdc-mysql] Fix listener not released when BinlogClient reuse (#5011)|https://github.com/apache/seatunnel/commit/3287b1d852|2.3.3| |[BugFix] [Connector-V2] [MySQL-CDC] serverId from int to long (#5033) (#5035)|https://github.com/apache/seatunnel/commit/4abc80e111|2.3.3| |[Hotfix][CDC] Fix jdbc connection leak for mysql (#5037)|https://github.com/apache/seatunnel/commit/738925ba10|2.3.3| |[Feature][CDC] Support disable/enable exactly once for INITIAL (#4921)|https://github.com/apache/seatunnel/commit/6d9a3e5957|2.3.3| |[Improve][CDC]change driver scope to provider (#5002)|https://github.com/apache/seatunnel/commit/745c0b9e92|2.3.3| |[Improve][CDC]Remove driver for cdc connector (#4952)|https://github.com/apache/seatunnel/commit/b65f40c3c9|2.3.3| |[improve][CDC base] Implement Sample-based Sharding Strategy with Configurable Sampling Rate (#4856)|https://github.com/apache/seatunnel/commit/d827c700f0|2.3.2| |[Hotfix][CDC] Fix chunk start/end parameter type error (#4777)|https://github.com/apache/seatunnel/commit/c13c031995|2.3.2| |[feature][catalog] Support for multiplexing connections (#4550)|https://github.com/apache/seatunnel/commit/41277d7f78|2.3.2| |[BugFix][Mysql-CDC] Fix Time data type is empty when reading from MySQL CDC (#4670)|https://github.com/apache/seatunnel/commit/e4f973daf7|2.3.2| |[Improve][CDC] Optimize jdbc fetch-size options (#4352)|https://github.com/apache/seatunnel/commit/fbb60ce1be|2.3.1| |[Improve][CDC] Improve startup.mode/stop.mode 
options (#4360)|https://github.com/apache/seatunnel/commit/b71d8739d5|2.3.1| |Update CDC StartupMode and StopMode option to SingleChoiceOption (#4357)|https://github.com/apache/seatunnel/commit/f60ac1a5e9|2.3.1| |[bugfix][cdc-base] Fix cdc base shutdown thread not cleared (#4327)|https://github.com/apache/seatunnel/commit/ac61409bd8|2.3.1| |[Feature][CDC] Support export debezium-json format to kafka (#4339)|https://github.com/apache/seatunnel/commit/5817ec07bf|2.3.1| |[Improve][CDC][MySQL] Ennable binlog watermark compare (#4293)|https://github.com/apache/seatunnel/commit/b22fb259c8|2.3.1| |[Feature][CDC][Mysql] Support read database list (#4255)|https://github.com/apache/seatunnel/commit/3ca60c6fed|2.3.1| |Add redshift datatype convertor (#4245)|https://github.com/apache/seatunnel/commit/b19011517f|2.3.1| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[Improve] Support MySqlCatalog Use JDBC URL With Custom Suffix|https://github.com/apache/seatunnel/commit/210d0ff1f8|2.3.1| |[chore] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/291214ad6f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][jdbc] Reduce jdbc options configuration (#4218)|https://github.com/apache/seatunnel/commit/ddd8f808b5|2.3.1| |[improve][cdc] support sharding-tables (#4207)|https://github.com/apache/seatunnel/commit/5c3f0c9b00|2.3.1| |[Hotfix][CDC] Fix multiple-table data read (#4200)|https://github.com/apache/seatunnel/commit/7f5671d2ce|2.3.1| |[Feature][Zeta] Support shuffle multiple rows by tableId (#4147)|https://github.com/apache/seatunnel/commit/8348f1a108|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Feature][CDC] Support batch processing on 
multiple-table shuffle flow (#4116)|https://github.com/apache/seatunnel/commit/919653d83e|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][CDC] MySQL CDC supports deserialization of multi-tables (#4067)|https://github.com/apache/seatunnel/commit/21ef45fcca|2.3.1| |fix cdc option rule error (#4018)|https://github.com/apache/seatunnel/commit/ea160429df|2.3.1| |[Improve][CDC][base] Guaranteed to be exactly-once in the process of switching from SnapshotTask to IncrementalTask (#3837)|https://github.com/apache/seatunnel/commit/8379aaf876|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Improve][CDC] Add mysql-cdc source factory (#3791)|https://github.com/apache/seatunnel/commit/356538de8a|2.3.1| |[feature][connector-v2] add sqlServer CDC (#3686)|https://github.com/apache/seatunnel/commit/0f0afb58af|2.3.0| |[feature][e2e][cdc] add mysql cdc container (#3667)|https://github.com/apache/seatunnel/commit/7696ba1551|2.3.0| |[feature][cdc] Fixed error in mysql cdc under real-time job (#3666)|https://github.com/apache/seatunnel/commit/2238fda300|2.3.0| |[feature][connector][cdc] add SeaTunnelRowDebeziumDeserializeSchema (#3499)|https://github.com/apache/seatunnel/commit/ff44db116e|2.3.0| |[feature][connector][mysql-cdc] add MySQL CDC enumerator (#3481)|https://github.com/apache/seatunnel/commit/ff4b32dc28|2.3.0| |[bugfix][connector-v2] fix cdc mysql reader err (#3465)|https://github.com/apache/seatunnel/commit/1b406b5a31|2.3.0| |[feature][connector] add mysql cdc reader (#3455)|https://github.com/apache/seatunnel/commit/ae981df675|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-cdc-opengauss.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[improve] jdbc options (#9541)|https://github.com/apache/seatunnel/commit/d041e5fb32|2.3.12| |[Feature][Connectors-v2] Optimize the size of CDC JAR Files (#9546)|https://github.com/apache/seatunnel/commit/1dd19c6823|2.3.12| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Feature][Connector-V2] Support opengauss-cdc (#7433)|https://github.com/apache/seatunnel/commit/81b73515a7|2.3.8|
================================================ FILE: docs/en/connectors/changelog/connector-cdc-oracle.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[improve] jdbc options (#9541)|https://github.com/apache/seatunnel/commit/d041e5fb32|2.3.12| |[Feature][Connectors-v2] Optimize the size of CDC JAR Files (#9546)|https://github.com/apache/seatunnel/commit/1dd19c6823|2.3.12| |[Fix][Connector-V2] Oracle cdc not update transaction commit when LOB enabled (#9412)|https://github.com/apache/seatunnel/commit/2a25bae6f6|2.3.12| |[Improve][Oracle-CDC] Remove duplicate load table names (#9357)|https://github.com/apache/seatunnel/commit/90e88cafc5|2.3.12| |[Feature][Connector-JDBC] Supprot read Oracle BLOB data as string instead of bytes (#9305)|https://github.com/apache/seatunnel/commit/454a88f81a|2.3.11| |[Improve][CDC] Filter ddl for snapshot phase (#8911)|https://github.com/apache/seatunnel/commit/641cc72f2f|2.3.10| |[Improve][Oracle-CDC] Support ReadOnlyLogWriterFlushStrategy (#8912)|https://github.com/apache/seatunnel/commit/6aebdc0384|2.3.10| |[Improve][CDC] Extract duplicate code (#8906)|https://github.com/apache/seatunnel/commit/b922bb90e6|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[hotfix] [connector-cdc-oracle ] support read partition table (#8265)|https://github.com/apache/seatunnel/commit/91b86b2faf|2.3.9| |[Improve][E2E] improve oracle e2e (#8292)|https://github.com/apache/seatunnel/commit/9f761b9d32|2.3.9| |[Feature][CDC] Add 'schema-changes.enabled' options (#8285)|https://github.com/apache/seatunnel/commit/8e29ecf54f|2.3.9| |Revert "[Feature][Redis] Flush data when the time reaches checkpoint interval" and "[Feature][CDC] Add 'schema-changes.enabled' options" (#8278)|https://github.com/apache/seatunnel/commit/fcb2938286|2.3.9| |[Feature][CDC] Add 'schema-changes.enabled' options 
(#8252)|https://github.com/apache/seatunnel/commit/d783f9447c|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Connector-V2]Jdbc chunk split add snapshotSplitColumn config #7794 (#7840)|https://github.com/apache/seatunnel/commit/b6c6dc0438|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Feature][Connector-v2] Support schema evolution for Oracle connector (#7908)|https://github.com/apache/seatunnel/commit/79406bcc2f|2.3.9| |[Hotfix][CDC] Fix package name spelling mistake (#7415)|https://github.com/apache/seatunnel/commit/469112fa64|2.3.8| |[Improve][Connector-v2] Optimize the count table rows for jdbc-oracle and oracle-cdc (#7248)|https://github.com/apache/seatunnel/commit/0d08b20061|2.3.6| |[Improve][CDC] Bump the version of debezium to 1.9.8.Final (#6740)|https://github.com/apache/seatunnel/commit/c3ac953524|2.3.6| |[Improve][CDC] Close idle subtasks gorup(reader/writer) in increment phase (#6526)|https://github.com/apache/seatunnel/commit/454c339b9c|2.3.6| |[Improve][JDBC Source] Fix Split can not be cancel (#6825)|https://github.com/apache/seatunnel/commit/ee3b7c3723|2.3.6| |[Fix] Fix ConnectorSpecificationCheckTest failed (#6828)|https://github.com/apache/seatunnel/commit/52d1020eb7|2.3.6| |[Hotfix][Jdbc/CDC] Fix postgresql uuid type in jdbc read (#6684)|https://github.com/apache/seatunnel/commit/868ba4d7c7|2.3.6| |[Improve] Improve read table schema in cdc connector (#6702)|https://github.com/apache/seatunnel/commit/a8c6cc6e0c|2.3.6| |[Improve][Jdbc] Add quote identifier for sql (#6669)|https://github.com/apache/seatunnel/commit/849d748d3d|2.3.5| |[Improve][CDC] Optimize split state memory allocation in increment phase (#6554)|https://github.com/apache/seatunnel/commit/fe33422161|2.3.5| |[Improve][CDC-Connector]Fix CDC option rule. 
(#6454)|https://github.com/apache/seatunnel/commit/1ea27afa87|2.3.5| |[Improve][CDC] Optimize memory allocation for snapshot split reading (#6281)|https://github.com/apache/seatunnel/commit/4856645837|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Fix][Oracle-CDC] Fix invalid split key when no primary key (#6251)|https://github.com/apache/seatunnel/commit/b83c40a6f6|2.3.4| |[Feature][Oracle-CDC] Support custom table primary key (#6216)|https://github.com/apache/seatunnel/commit/ae4240ca6b|2.3.4| |[Improve][Oracle-CDC] Clean unused code (#6212)|https://github.com/apache/seatunnel/commit/919a91032a|2.3.4| |[Hotfix][Oracle-CDC] Fix state recovery error when switching a single table to multiple tables (#6211)|https://github.com/apache/seatunnel/commit/74cfe1995f|2.3.4| |[Hotfix][Oracle-CDC] Fix jdbc setFetchSize error (#6210)|https://github.com/apache/seatunnel/commit/b7f06ec6d9|2.3.4| |[Feature][Oracle-CDC] Support read no primary key table (#6209)|https://github.com/apache/seatunnel/commit/3cb34c2b71|2.3.4| |[Feature][Connector-V2][Oracle-cdc]Support for oracle cdc (#5196)|https://github.com/apache/seatunnel/commit/aaef22b31b|2.3.4|
================================================ FILE: docs/en/connectors/changelog/connector-cdc-postgres.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[improve] jdbc options (#9541)|https://github.com/apache/seatunnel/commit/d041e5fb32|2.3.12| |[Feature][Connectors-v2] Optimize the size of CDC JAR Files (#9546)|https://github.com/apache/seatunnel/commit/1dd19c6823|2.3.12| |[Fix][Connector-V2] Fix postgres cdc with debezium_json format can not parse number without scale (#9052)|https://github.com/apache/seatunnel/commit/29cf3a76c7|2.3.11| |[Improve][CDC] Extract duplicate code (#8906)|https://github.com/apache/seatunnel/commit/b922bb90e6|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][Connector-V2] Add pre-check for table enable cdc (#8152)|https://github.com/apache/seatunnel/commit/9a5da78176|2.3.9| |[Feature][Connector-V2]Jdbc chunk split add snapshotSplitColumn config #7794 (#7840)|https://github.com/apache/seatunnel/commit/b6c6dc0438|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Improve][PostgreSQL CDC]-PostgresSourceOptions description error (#7813)|https://github.com/apache/seatunnel/commit/57f47c2064|2.3.9| |[Improve][CDC] Bump the version of debezium to 1.9.8.Final (#6740)|https://github.com/apache/seatunnel/commit/c3ac953524|2.3.6| |[Improve][CDC] Close idle subtasks gorup(reader/writer) in increment phase (#6526)|https://github.com/apache/seatunnel/commit/454c339b9c|2.3.6| |[Improve][JDBC Source] Fix Split can not be cancel (#6825)|https://github.com/apache/seatunnel/commit/ee3b7c3723|2.3.6| |[Hotfix][Postgres-CDC/OpenGauss-CDC] Fix read data missing when restore (#6785)|https://github.com/apache/seatunnel/commit/67c32607e7|2.3.6| 
|[Hotfix][Jdbc/CDC] Fix postgresql uuid type in jdbc read (#6684)|https://github.com/apache/seatunnel/commit/868ba4d7c7|2.3.6| |[Improve] Improve read table schema in cdc connector (#6702)|https://github.com/apache/seatunnel/commit/a8c6cc6e0c|2.3.6| |[Improve][Jdbc] Add quote identifier for sql (#6669)|https://github.com/apache/seatunnel/commit/849d748d3d|2.3.5| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Improve][CDC-Connector]Fix CDC option rule. (#6454)|https://github.com/apache/seatunnel/commit/1ea27afa87|2.3.5| |[Improve][CDC] Optimize memory allocation for snapshot split reading (#6281)|https://github.com/apache/seatunnel/commit/4856645837|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Feature][Connector]update pgsql-cdc publication for add table (#6309)|https://github.com/apache/seatunnel/commit/2ad7d65236|2.3.5| |[Improve][Postgres-CDC] Fix name typos (#6248)|https://github.com/apache/seatunnel/commit/2462f1c5f7|2.3.4| |[Improve][Postgres-CDC] Update jdbc fetchsize (#6245)|https://github.com/apache/seatunnel/commit/c25beb9f8a|2.3.4| | [Feature][Connector-V2][Postgres-cdc]Support for Postgres cdc (#5986)|https://github.com/apache/seatunnel/commit/97438b9402|2.3.4|
================================================ FILE: docs/en/connectors/changelog/connector-cdc-sqlserver.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[improve] jdbc options (#9541)|https://github.com/apache/seatunnel/commit/d041e5fb32|2.3.12| |[Feature][Connectors-v2] Optimize the size of CDC JAR Files (#9546)|https://github.com/apache/seatunnel/commit/1dd19c6823|2.3.12| |[Improve][CDC] Extract duplicate code (#8906)|https://github.com/apache/seatunnel/commit/b922bb90e6|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][Connector-V2] Add pre-check for table enable cdc (#8152)|https://github.com/apache/seatunnel/commit/9a5da78176|2.3.9| |[Improve][Connector-V2] Fix SqlServer cdc memory leak (#8083)|https://github.com/apache/seatunnel/commit/69cd4ae1a2|2.3.9| |[Feature][Connector-V2]Jdbc chunk split add snapshotSplitColumn config #7794 (#7840)|https://github.com/apache/seatunnel/commit/b6c6dc0438|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Feature][Connector-V2] SqlServer support user-defined type (#7706)|https://github.com/apache/seatunnel/commit/fb89033273|2.3.8| |[Improve][Connector-V2] Optimize sqlserver package structure (#7715)|https://github.com/apache/seatunnel/commit/9720f118e5|2.3.8| |[Hotfix][CDC] Fix package name spelling mistake (#7415)|https://github.com/apache/seatunnel/commit/469112fa64|2.3.8| |[Improve][CDC] Bump the version of debezium to 1.9.8.Final (#6740)|https://github.com/apache/seatunnel/commit/c3ac953524|2.3.6| |[Improve][CDC] Close idle subtasks gorup(reader/writer) in increment phase (#6526)|https://github.com/apache/seatunnel/commit/454c339b9c|2.3.6| |[Improve][JDBC Source] Fix Split can not be cancel 
(#6825)|https://github.com/apache/seatunnel/commit/ee3b7c3723|2.3.6| |[Hotfix][Jdbc/CDC] Fix postgresql uuid type in jdbc read (#6684)|https://github.com/apache/seatunnel/commit/868ba4d7c7|2.3.6| |[Improve] Improve read table schema in cdc connector (#6702)|https://github.com/apache/seatunnel/commit/a8c6cc6e0c|2.3.6| |[Improve][Jdbc] Add quote identifier for sql (#6669)|https://github.com/apache/seatunnel/commit/849d748d3d|2.3.5| |[Improve][CDC] Optimize split state memory allocation in increment phase (#6554)|https://github.com/apache/seatunnel/commit/fe33422161|2.3.5| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Improve][CDC-Connector]Fix CDC option rule. (#6454)|https://github.com/apache/seatunnel/commit/1ea27afa87|2.3.5| |[Improve][CDC] Optimize memory allocation for snapshot split reading (#6281)|https://github.com/apache/seatunnel/commit/4856645837|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Improve] Support `int identity` type in sql server (#6186)|https://github.com/apache/seatunnel/commit/1a8da1c843|2.3.4| |[Feature][CDC] Support custom table primary key (#6106)|https://github.com/apache/seatunnel/commit/1312a1dd27|2.3.4| |[Feature][CDC] Support read no primary key table (#6098)|https://github.com/apache/seatunnel/commit/b42d78de3f|2.3.4| |[Hotfix][Jdbc] Fix jdbc setFetchSize error (#6005)|https://github.com/apache/seatunnel/commit/d41af8a6ed|2.3.4| |[Bug][CDC] Fix state recovery error when switching a single table to multiple tables (#5784)|https://github.com/apache/seatunnel/commit/37fcff347e|2.3.4| |[Improve][CDC] Clean unused code (#5785)|https://github.com/apache/seatunnel/commit/b5a66d3dbe|2.3.4| |[Improve][Jdbc] Fix database identifier (#5756)|https://github.com/apache/seatunnel/commit/dbfc8a670a|2.3.4| |[improve][connector-v2][sqlserver-cdc]Unified sqlserver 
TypeUtils type conversion mode (#5668)|https://github.com/apache/seatunnel/commit/75b814bc3d|2.3.4| |[feature][connector-cdc-sqlserver] add dataType datetimeoffset (#5548)|https://github.com/apache/seatunnel/commit/0cf63eed6d|2.3.4| |[Improve] Remove catalog tag for config file (#5645)|https://github.com/apache/seatunnel/commit/dc509aa080|2.3.4| |[Improve] Refactor CatalogTable and add `SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[Imporve] [CDC Base] Add a fast sampling method that supports character types (#5179)|https://github.com/apache/seatunnel/commit/c0422dbfeb|2.3.3| |[improve] [CDC Base] Add some split parameters to the optionRule (#5161)|https://github.com/apache/seatunnel/commit/94fd6755e6|2.3.3| |[Feature][Connector-V2][CDC] Support string type shard fields. (#5147)|https://github.com/apache/seatunnel/commit/e1be9d7f8a|2.3.3| |[Feature][CDC] Support tables without primary keys (with unique keys) (#163) (#5150)|https://github.com/apache/seatunnel/commit/32b7f2b690|2.3.3| |[Bugfix][zeta] Fix cdc connection does not close (#4922)|https://github.com/apache/seatunnel/commit/a2d2f2dda8|2.3.3| |[Feature][CDC] Support disable/enable exactly once for INITIAL (#4921)|https://github.com/apache/seatunnel/commit/6d9a3e5957|2.3.3| |[Improve][CDC]change driver scope to provider (#5002)|https://github.com/apache/seatunnel/commit/745c0b9e92|2.3.3| |[Improve][CDC]Remove driver for cdc connector (#4952)|https://github.com/apache/seatunnel/commit/b65f40c3c9|2.3.3| |[Bugfix][zeta] Fix the deadlock issue with JDBC driver loading (#4878)|https://github.com/apache/seatunnel/commit/c30a2a1b1c|2.3.2| |[improve][CDC base] Implement Sample-based Sharding Strategy with Configurable Sampling Rate (#4856)|https://github.com/apache/seatunnel/commit/d827c700f0|2.3.2| |[Bugfix][CDC Base] Solving the ConcurrentModificationException caused by snapshotState being modified concurrently. 
(#4877)|https://github.com/apache/seatunnel/commit/9a2efa51c7|2.3.2| |[Hotfix][CDC] Fix chunk start/end parameter type error (#4777)|https://github.com/apache/seatunnel/commit/c13c031995|2.3.2| |[Feature][CDC][SqlServer] Support multi-table read (#4377)|https://github.com/apache/seatunnel/commit/c4e3f2dc03|2.3.2| |[Improve][CDC] Optimize jdbc fetch-size options (#4352)|https://github.com/apache/seatunnel/commit/fbb60ce1be|2.3.1| |[Improve][CDC] Improve startup.mode/stop.mode options (#4360)|https://github.com/apache/seatunnel/commit/b71d8739d5|2.3.1| |[Improve][CDC] Optimize options & add docs for compatible_debezium_json (#4351)|https://github.com/apache/seatunnel/commit/336f590498|2.3.1| |Update CDC StartupMode and StopMode option to SingleChoiceOption (#4357)|https://github.com/apache/seatunnel/commit/f60ac1a5e9|2.3.1| |[bugfix][cdc-base] Fix cdc base shutdown thread not cleared (#4327)|https://github.com/apache/seatunnel/commit/ac61409bd8|2.3.1| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[Improve] Support MySqlCatalog Use JDBC URL With Custom Suffix|https://github.com/apache/seatunnel/commit/210d0ff1f8|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][cdc] support sharding-tables (#4207)|https://github.com/apache/seatunnel/commit/5c3f0c9b00|2.3.1| |[Hotfix][CDC] Fix multiple-table data read (#4200)|https://github.com/apache/seatunnel/commit/7f5671d2ce|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][SQLServer-CDC] Add sqlserver cdc optionRule (#4019)|https://github.com/apache/seatunnel/commit/78df503392|2.3.1| |[Improve][CDC][base] Guaranteed to be exactly-once in the process of switching from SnapshotTask to IncrementalTask (#3837)|https://github.com/apache/seatunnel/commit/8379aaf876|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Improve][CDC] Add mysql-cdc source factory (#3791)|https://github.com/apache/seatunnel/commit/356538de8a|2.3.1| |[feature][connector-v2] add sqlServer CDC (#3686)|https://github.com/apache/seatunnel/commit/0f0afb58af|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-cdc-tidb.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Feature][Connectors-v2] Optimize the size of CDC JAR Files (#9546)|https://github.com/apache/seatunnel/commit/1dd19c6823|2.3.12| |[Fix][Connector-V2] Correct typo in batch-size-per-scan option key (#9434)|https://github.com/apache/seatunnel/commit/6cf258127f|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Feature] Support tidb cdc connector source #7199 (#7477)|https://github.com/apache/seatunnel/commit/87ec786bd6|2.3.8|
================================================ FILE: docs/en/connectors/changelog/connector-cdc.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][MySQL CDC] MySQL cdc support start by time (#9735)|https://github.com/apache/seatunnel/commit/b6c5d941b0|2.3.12| |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[Feature][Connectors-v2] Support Mysql8.4+ for mysql-cdc (#9720)|https://github.com/apache/seatunnel/commit/e338743927|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Improve][API] Add metadata schema into catalog table (#9586)|https://github.com/apache/seatunnel/commit/385814e7f1|2.3.12| |[improve] jdbc options (#9541)|https://github.com/apache/seatunnel/commit/d041e5fb32|2.3.12| |[Feature][Connectors-v2] Optimize the size of CDC JAR Files (#9546)|https://github.com/apache/seatunnel/commit/1dd19c6823|2.3.12| |[Fix][Connector-V2] Update catalog table schema of debezium json (#9525)|https://github.com/apache/seatunnel/commit/10cb84435b|2.3.12| |[Fix][Mongo-CDC] Fix the issue where mongo isExactlyOnce defaults to true, causing room to malfunction (#9454)|https://github.com/apache/seatunnel/commit/814b19537c|2.3.12| |[Fix][Connector-V2] Correct typo in batch-size-per-scan option key (#9434)|https://github.com/apache/seatunnel/commit/6cf258127f|2.3.12| |[Fix][Connector-V2] Oracle cdc not update transaction commit when LOB enabled (#9412)|https://github.com/apache/seatunnel/commit/2a25bae6f6|2.3.12| |[Feature][Connector-V2] Jdbc mysql support read tinyint(1) to byte(tinyint) (#9373)|https://github.com/apache/seatunnel/commit/7b87aa6f12|2.3.12| |[Improve][Oracle-CDC] Remove duplicate load table names (#9357)|https://github.com/apache/seatunnel/commit/90e88cafc5|2.3.12| |[Improve][Oracle-CDC] Fix oracle rename ddl event missing column type (#9314)|https://github.com/apache/seatunnel/commit/11a23af64c|2.3.11| 
|[Feature][Connector-JDBC] Supprot read Oracle BLOB data as string instead of bytes (#9305)|https://github.com/apache/seatunnel/commit/454a88f81a|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][Connector-V2] Fix postgres cdc with debezium_json format can not parse number without scale (#9052)|https://github.com/apache/seatunnel/commit/29cf3a76c7|2.3.11| |[Fix][JDBC] fix jdbc default connection parameter invalid (#8185)|https://github.com/apache/seatunnel/commit/f85eb78b37|2.3.11| |[Fix] [Mongo-cdc] Fallback to timestamp startup mode when resume token has expired (#8754)|https://github.com/apache/seatunnel/commit/afc990d84e|2.3.10| |[Improve][CDC] Filter ddl for snapshot phase (#8911)|https://github.com/apache/seatunnel/commit/641cc72f2f|2.3.10| |[Improve][Oracle-CDC] Support ReadOnlyLogWriterFlushStrategy (#8912)|https://github.com/apache/seatunnel/commit/6aebdc0384|2.3.10| |[Improve][CDC] Extract duplicate code (#8906)|https://github.com/apache/seatunnel/commit/b922bb90e6|2.3.10| |[Improve][CDC] Filter heartbeat event (#8569)|https://github.com/apache/seatunnel/commit/1870653393|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Fix][MySQL-CDC]fix recovery task failure caused by binlog deletion (#8587)|https://github.com/apache/seatunnel/commit/087087e592|2.3.10| |[Fix][mysql-cdc] Fix GTIDs on startup to correctly recover from checkpoint (#8528)|https://github.com/apache/seatunnel/commit/82e4096c08|2.3.10| |[Feature] [Postgre CDC]support array type (#8560)|https://github.com/apache/seatunnel/commit/021af147cc|2.3.10| |[Feature][MySQL-CDC] Support database/table wildcards scan read (#8323)|https://github.com/apache/seatunnel/commit/2116843ce8|2.3.9| |[hotfix] [connector-cdc-oracle ] support read partition table 
(#8265)|https://github.com/apache/seatunnel/commit/91b86b2faf|2.3.9| |[Feature][Jdbc] Support sink ddl for postgresql (#8276)|https://github.com/apache/seatunnel/commit/353bbd21a1|2.3.9| |[Improve][E2E] improve oracle e2e (#8292)|https://github.com/apache/seatunnel/commit/9f761b9d32|2.3.9| |[Feature][CDC] Add 'schema-changes.enabled' options (#8285)|https://github.com/apache/seatunnel/commit/8e29ecf54f|2.3.9| |Revert "[Feature][Redis] Flush data when the time reaches checkpoint interval" and "[Feature][CDC] Add 'schema-changes.enabled' options" (#8278)|https://github.com/apache/seatunnel/commit/fcb2938286|2.3.9| |[Feature][CDC] Add 'schema-changes.enabled' options (#8252)|https://github.com/apache/seatunnel/commit/d783f9447c|2.3.9| |[Feature][Mongodb-CDC] Support multi-table read (#8029)|https://github.com/apache/seatunnel/commit/49cbaeb9b3|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][Connector-V2] Add pre-check for table enable cdc (#8152)|https://github.com/apache/seatunnel/commit/9a5da78176|2.3.9| |[Improve][Connector-V2] Fix SqlServer cdc memory leak (#8083)|https://github.com/apache/seatunnel/commit/69cd4ae1a2|2.3.9| |[Feature][Connector-V2]Jdbc chunk split add snapshotSplitColumn config #7794 (#7840)|https://github.com/apache/seatunnel/commit/b6c6dc0438|2.3.9| |[Bug][connectors-v2] fix mongodb bson convert exception (#8044)|https://github.com/apache/seatunnel/commit/b222c13f2f|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Feature][Transform-v2] Add metadata transform (#7899)|https://github.com/apache/seatunnel/commit/699d16552a|2.3.9| |[Feature][Connector-v2] Support schema evolution for Oracle connector (#7908)|https://github.com/apache/seatunnel/commit/79406bcc2f|2.3.9| |[Bug][Connector-v2] MongoDB CDC Set SeatunnelRow's tableId 
(#7935)|https://github.com/apache/seatunnel/commit/f3970d6188|2.3.9| |[Fix][Connector-V2] Fix cdc use default value when value is null (#7950)|https://github.com/apache/seatunnel/commit/3b432125ae|2.3.9| |[Hotfix][CDC] Fix occasional database connection leak when read snapshot split (#7918)|https://github.com/apache/seatunnel/commit/a8d0d4ce77|2.3.9| |[Improve][PostgreSQL CDC]-PostgresSourceOptions description error (#7813)|https://github.com/apache/seatunnel/commit/57f47c2064|2.3.9| |[Feature][Connector-V2] SqlServer support user-defined type (#7706)|https://github.com/apache/seatunnel/commit/fb89033273|2.3.8| |[Improve][Connector-V2] Optimize sqlserver package structure (#7715)|https://github.com/apache/seatunnel/commit/9720f118e5|2.3.8| |[Hotfix][CDC] Fix ddl duplicate execution error when config multi_table_sink_replica (#7634)|https://github.com/apache/seatunnel/commit/23ab3edbbb|2.3.8| |[Fix][Connector-V2] Fix some throwable error not be caught (#7657)|https://github.com/apache/seatunnel/commit/e19d73282e|2.3.8| |[Feature] Support tidb cdc connector source #7199 (#7477)|https://github.com/apache/seatunnel/commit/87ec786bd6|2.3.8| |[Feature][Connector-V2] Support opengauss-cdc (#7433)|https://github.com/apache/seatunnel/commit/81b73515a7|2.3.8| |[Improve][Connector-V2] Close all ResultSet after used (#7389)|https://github.com/apache/seatunnel/commit/853e973212|2.3.8| |[Hotfix][CDC] Fix package name spelling mistake (#7415)|https://github.com/apache/seatunnel/commit/469112fa64|2.3.8| |[Hotfix][MySQL-CDC] Fix ArrayIndexOutOfBoundsException in mysql binlog read (#7381)|https://github.com/apache/seatunnel/commit/40c5f313eb|2.3.7| |[Improve][Connector-v2] Optimize the count table rows for jdbc-oracle and oracle-cdc (#7248)|https://github.com/apache/seatunnel/commit/0d08b20061|2.3.6| |[Feature][Connector-V2] Support jdbc hana catalog and type convertor (#6950)|https://github.com/apache/seatunnel/commit/d663398739|2.3.6| |[Fix][Connector-V2][CDC] 
SeaTunnelRowDebeziumDeserializationConverters NPE (#7119)|https://github.com/apache/seatunnel/commit/ae81879213|2.3.6| |[Improve][Connector-V2] Support schema evolution for mysql-cdc and mysql-jdbc (#6929)|https://github.com/apache/seatunnel/commit/cf91e51fc7|2.3.6| |[Hotfix][MySQL-CDC] Fix read gbk varchar chinese garbled characters (#7046)|https://github.com/apache/seatunnel/commit/4e4d2b8ee5|2.3.6| |[Hotfix][CDC] Fix split schema change stream (#7003)|https://github.com/apache/seatunnel/commit/0c3044e3f6|2.3.6| |[Improve][CDC] Bump the version of debezium to 1.9.8.Final (#6740)|https://github.com/apache/seatunnel/commit/c3ac953524|2.3.6| |[Improve][CDC] Close idle subtasks gorup(reader/writer) in increment phase (#6526)|https://github.com/apache/seatunnel/commit/454c339b9c|2.3.6| |[Improve][JDBC Source] Fix Split can not be cancel (#6825)|https://github.com/apache/seatunnel/commit/ee3b7c3723|2.3.6| |[Hotfix][Postgres-CDC/OpenGauss-CDC] Fix read data missing when restore (#6785)|https://github.com/apache/seatunnel/commit/67c32607e7|2.3.6| |[Improve] Add conditional of start.mode with timestamp in mongo cdc option rule (#6770)|https://github.com/apache/seatunnel/commit/65ae7782c9|2.3.6| |[Fix] Fix ConnectorSpecificationCheckTest failed (#6828)|https://github.com/apache/seatunnel/commit/52d1020eb7|2.3.6| |[Hotfix][Jdbc/CDC] Fix postgresql uuid type in jdbc read (#6684)|https://github.com/apache/seatunnel/commit/868ba4d7c7|2.3.6| |[Chore] remove useless interface (#6746)|https://github.com/apache/seatunnel/commit/3c1aeb3785|2.3.6| |[Improve][mysql-cdc] Support mysql 5.5 versions (#6710)|https://github.com/apache/seatunnel/commit/058f5594a3|2.3.6| |[Improve] Improve read table schema in cdc connector (#6702)|https://github.com/apache/seatunnel/commit/a8c6cc6e0c|2.3.6| |[Improve][mysql-cdc] Fallback to desc table when show create table failed (#6701)|https://github.com/apache/seatunnel/commit/6f74663c08|2.3.6| |[Improve][Jdbc] Add quote identifier for sql 
(#6669)|https://github.com/apache/seatunnel/commit/849d748d3d|2.3.5| |[Feature] Support listening for message delayed events in cdc source (#6634)|https://github.com/apache/seatunnel/commit/01159ec923|2.3.5| |[Improve][CDC] Optimize split state memory allocation in increment phase (#6554)|https://github.com/apache/seatunnel/commit/fe33422161|2.3.5| |[Improve][CDC] Improve read performance when record not contains schema field (#6571)|https://github.com/apache/seatunnel/commit/e60beb28ec|2.3.5| |[Feature][Core] Support event listener for job (#6419)|https://github.com/apache/seatunnel/commit/831d0022eb|2.3.5| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Improve][CDC-Connector]Fix CDC option rule. (#6454)|https://github.com/apache/seatunnel/commit/1ea27afa87|2.3.5| |[Improve][CDC] Optimize memory allocation for snapshot split reading (#6281)|https://github.com/apache/seatunnel/commit/4856645837|2.3.5| |[Fix][Connector-V2] Fix mongodb cdc start up mode option values not right (#6338)|https://github.com/apache/seatunnel/commit/c07f56fbc4|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Feature] Supports iceberg sink #6198 (#6265)|https://github.com/apache/seatunnel/commit/18d3e86194|2.3.5| |[Feature][Connector]update pgsql-cdc publication for add table (#6309)|https://github.com/apache/seatunnel/commit/2ad7d65236|2.3.5| |[Fix][Oracle-CDC] Fix invalid split key when no primary key (#6251)|https://github.com/apache/seatunnel/commit/b83c40a6f6|2.3.4| |[Bugfix][cdc base] Fix negative values in CDCRecordEmitDelay metric (#6259)|https://github.com/apache/seatunnel/commit/68978dbb4e|2.3.4| |[Improve][Postgres-CDC] Fix name typos (#6248)|https://github.com/apache/seatunnel/commit/2462f1c5f7|2.3.4| |[BugFix][CDC Base] Fix added columns cannot be parsed after job restore 
(#6118)|https://github.com/apache/seatunnel/commit/0c593a39e3|2.3.4| |[Feature][JDBC、CDC] Support Short and Byte Type in spliter (#6027)|https://github.com/apache/seatunnel/commit/6f8d0a5040|2.3.4| |[Improve][CDC] Disable exactly_once by default to improve stability (#6244)|https://github.com/apache/seatunnel/commit/f47495554b|2.3.4| |[Improve][Postgres-CDC] Update jdbc fetchsize (#6245)|https://github.com/apache/seatunnel/commit/c25beb9f8a|2.3.4| |[Improve] Support `int identity` type in sql server (#6186)|https://github.com/apache/seatunnel/commit/1a8da1c843|2.3.4| |[Bugfix][JDBC、CDC] Fix Spliter Error in Case of Extensive Duplicate Data (#6026)|https://github.com/apache/seatunnel/commit/635c24e8b2|2.3.4| | [Feature][Connector-V2][Postgres-cdc]Support for Postgres cdc (#5986)|https://github.com/apache/seatunnel/commit/97438b9402|2.3.4| |[Feature][Oracle-CDC] Support custom table primary key (#6216)|https://github.com/apache/seatunnel/commit/ae4240ca6b|2.3.4| |[Improve][Oracle-CDC] Clean unused code (#6212)|https://github.com/apache/seatunnel/commit/919a91032a|2.3.4| |[Hotfix][Oracle-CDC] Fix state recovery error when switching a single table to multiple tables (#6211)|https://github.com/apache/seatunnel/commit/74cfe1995f|2.3.4| |[Hotfix][Oracle-CDC] Fix jdbc setFetchSize error (#6210)|https://github.com/apache/seatunnel/commit/b7f06ec6d9|2.3.4| |[Feature][Oracle-CDC] Support read no primary key table (#6209)|https://github.com/apache/seatunnel/commit/3cb34c2b71|2.3.4| |[Feature][Connector-V2][Oracle-cdc]Support for oracle cdc (#5196)|https://github.com/apache/seatunnel/commit/aaef22b31b|2.3.4| |[Bugfix][CDC Base] Fix NPE caused by adding a table for restore job (#6145)|https://github.com/apache/seatunnel/commit/8d3f8e4627|2.3.4| |[Feature][CDC] Support custom table primary key (#6106)|https://github.com/apache/seatunnel/commit/1312a1dd27|2.3.4| |[Bugfix][CDC base] Fix CDC job cannot consume incremental data After restore run (#625) 
(#6094)|https://github.com/apache/seatunnel/commit/37567ebb7e|2.3.4| |[Feature][CDC] Support read no primary key table (#6098)|https://github.com/apache/seatunnel/commit/b42d78de3f|2.3.4| |[Hotfix][Jdbc] Fix jdbc setFetchSize error (#6005)|https://github.com/apache/seatunnel/commit/d41af8a6ed|2.3.4| |[Improve][CDC] Disable memory buffering when `exactly_once` is turned off (#6017)|https://github.com/apache/seatunnel/commit/300a624c5b|2.3.4| |[Improve][Zeta] Remove assert key words (#5947)|https://github.com/apache/seatunnel/commit/dcb4549109|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Bug][CDC] Fix state recovery error when switching a single table to multiple tables (#5784)|https://github.com/apache/seatunnel/commit/37fcff347e|2.3.4| |[Feature][formats][ogg] Support read ogg format message #4201 (#4225)|https://github.com/apache/seatunnel/commit/7728e241e8|2.3.4| |[Improve][CDC] Clean unused code (#5785)|https://github.com/apache/seatunnel/commit/b5a66d3dbe|2.3.4| |[Fix] Fix MultiTableSink restore failed when add new table (#5746)|https://github.com/apache/seatunnel/commit/21503bd771|2.3.4| |[Improve][Jdbc] Fix database identifier (#5756)|https://github.com/apache/seatunnel/commit/dbfc8a670a|2.3.4| |[improve][mysql-cdc] Optimize the default value range of mysql server-id to reduce conflicts. 
(#5550)|https://github.com/apache/seatunnel/commit/5174639463|2.3.4| |[improve][connector-v2][sqlserver-cdc]Unified sqlserver TypeUtils type conversion mode (#5668)|https://github.com/apache/seatunnel/commit/75b814bc3d|2.3.4| |[Dependency]Bump org.apache.avro:avro (#5583)|https://github.com/apache/seatunnel/commit/bb791a6d9e|2.3.4| |[Improve] Add default implement for `SeaTunnelSource::getProducedType` (#5670)|https://github.com/apache/seatunnel/commit/a04add6991|2.3.4| |[feature][connector-cdc-sqlserver] add dataType datetimeoffset (#5548)|https://github.com/apache/seatunnel/commit/0cf63eed6d|2.3.4| |[Improve] Remove catalog tag for config file (#5645)|https://github.com/apache/seatunnel/commit/dc509aa080|2.3.4| |[Improve][Pom] Add junit4 to the root pom (#5611)|https://github.com/apache/seatunnel/commit/7b4f7db2a2|2.3.4| |[Hotfix][CDC] Fix thread-unsafe collection container in cdc enumerator (#5614)|https://github.com/apache/seatunnel/commit/b2f70fd40b|2.3.4| |[Feature][CDC] Support MongoDB CDC running on flink (#5644)|https://github.com/apache/seatunnel/commit/8c569b1541|2.3.4| |[Improve][CDC] Use Source to output the CatalogTable (#5626)|https://github.com/apache/seatunnel/commit/3e6a20acfa|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Fix]: fix the cdc bug about NPE when the original table deletes a field (#5579)|https://github.com/apache/seatunnel/commit/f5ed47795d|2.3.4| |[Improve] Refactor CatalogTable and add `SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[Feature][CDC] Support for preferring numeric fields as split keys (#5384)|https://github.com/apache/seatunnel/commit/c687050d88|2.3.4| |[Feature][Connector-V2][CDC] Support flink running cdc job (#4918)|https://github.com/apache/seatunnel/commit/5e378831ee|2.3.4| |[Improve][connector-cdc-mysql] avoid listing tables under unnecessary databases 
(#5365)|https://github.com/apache/seatunnel/commit/3e5d018b35|2.3.4| |[Improve][Docs] Refactor MySQL-CDC docs (#5302)|https://github.com/apache/seatunnel/commit/74530a0461|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[BUG][Connector-V2][Mongo-cdc] Incremental data kind error in snapshot phase (#5184)|https://github.com/apache/seatunnel/commit/ead1c5fd8c|2.3.3| |[Imporve] [CDC Base] Add a fast sampling method that supports character types (#5179)|https://github.com/apache/seatunnel/commit/c0422dbfeb|2.3.3| |[Bugfix][cdc] Fix mysql bit column to java byte (#4817)|https://github.com/apache/seatunnel/commit/aae3e913d0|2.3.3| |[Hotfix]Fix array index anomalies caused by #5057 (#5195)|https://github.com/apache/seatunnel/commit/1c33429506|2.3.3| |[Feature][CDC][Zeta] Support schema evolution framework(DDL) (#5125)|https://github.com/apache/seatunnel/commit/4f89c1d272|2.3.3| |[improve] [CDC Base] Add some split parameters to the optionRule (#5161)|https://github.com/apache/seatunnel/commit/94fd6755e6|2.3.3| |[Improve][CDC] support exactly-once of cdc and fix the BinlogOffset comparing bug (#5057)|https://github.com/apache/seatunnel/commit/0e4190ab2e|2.3.3| |[Hotfix][MongodbCDC]Refine data format to adapt to universal logic (#5162)|https://github.com/apache/seatunnel/commit/4b4b5f9640|2.3.3| |[Feature][Connector-V2][CDC] Support string type shard fields. 
(#5147)|https://github.com/apache/seatunnel/commit/e1be9d7f8a|2.3.3| |[Feature][CDC] Support tables without primary keys (with unique keys) (#163) (#5150)|https://github.com/apache/seatunnel/commit/32b7f2b690|2.3.3| |[Hotfix][Mongodb cdc] Solve startup resume token is negative (#5143)|https://github.com/apache/seatunnel/commit/e964c03dca|2.3.3| |[Hotfix]Fix mongodb cdc e2e instability (#5128)|https://github.com/apache/seatunnel/commit/6f30b29662|2.3.3| |[Feature][Connector-V2][mysql cdc] Conversion of tinyint(1) to bool is supported (#5105)|https://github.com/apache/seatunnel/commit/86b1b7e31a|2.3.3| |[Feature][connector-v2][mongodbcdc]Support source mongodb cdc (#4923)|https://github.com/apache/seatunnel/commit/d729fcba4c|2.3.3| |[Chore] Modify repeat des (#5088)|https://github.com/apache/seatunnel/commit/936afc2a9e|2.3.3| |[Bugfix][connector-cdc-mysql] Fix listener not released when BinlogClient reuse (#5011)|https://github.com/apache/seatunnel/commit/3287b1d852|2.3.3| |[Feature][Connector-V2][cdc] Change the time zone to the default time zone (#5030)|https://github.com/apache/seatunnel/commit/3cff923a79|2.3.3| |[BugFix] [Connector-V2] [MySQL-CDC] serverId from int to long (#5033) (#5035)|https://github.com/apache/seatunnel/commit/4abc80e111|2.3.3| |[Bugfix][zeta] Fix cdc connection does not close (#4922)|https://github.com/apache/seatunnel/commit/a2d2f2dda8|2.3.3| |[Hotfix][CDC] Fix jdbc connection leak for mysql (#5037)|https://github.com/apache/seatunnel/commit/738925ba10|2.3.3| |[Feature][CDC] Support disable/enable exactly once for INITIAL (#4921)|https://github.com/apache/seatunnel/commit/6d9a3e5957|2.3.3| |[Improve][CDC]change driver scope to provider (#5002)|https://github.com/apache/seatunnel/commit/745c0b9e92|2.3.3| |[Improve][CDC]Remove driver for cdc connector (#4952)|https://github.com/apache/seatunnel/commit/b65f40c3c9|2.3.3| |[Improve] Documentation and partial word optimization. 
(#4936)|https://github.com/apache/seatunnel/commit/6e8de0e2a6|2.3.3| |[Bugfix][zeta] Fix the deadlock issue with JDBC driver loading (#4878)|https://github.com/apache/seatunnel/commit/c30a2a1b1c|2.3.2| |[improve][CDC base] Implement Sample-based Sharding Strategy with Configurable Sampling Rate (#4856)|https://github.com/apache/seatunnel/commit/d827c700f0|2.3.2| |[Bugfix][CDC Base] Solving the ConcurrentModificationException caused by snapshotState being modified concurrently. (#4877)|https://github.com/apache/seatunnel/commit/9a2efa51c7|2.3.2| |[Hotfix][CDC] Fix chunk start/end parameter type error (#4777)|https://github.com/apache/seatunnel/commit/c13c031995|2.3.2| |[feature][catalog] Support for multiplexing connections (#4550)|https://github.com/apache/seatunnel/commit/41277d7f78|2.3.2| |[BugFix][Mysql-CDC] Fix Time data type is empty when reading from MySQL CDC (#4670)|https://github.com/apache/seatunnel/commit/e4f973daf7|2.3.2| |[Bug][CDC] Fix TemporalConversions (#4542)|https://github.com/apache/seatunnel/commit/d2094bf2e1|2.3.2| |[Feature][CDC][SqlServer] Support multi-table read (#4377)|https://github.com/apache/seatunnel/commit/c4e3f2dc03|2.3.2| |[Improve][CDC] Optimize jdbc fetch-size options (#4352)|https://github.com/apache/seatunnel/commit/fbb60ce1be|2.3.1| |[Improve][CDC] Improve startup.mode/stop.mode options (#4360)|https://github.com/apache/seatunnel/commit/b71d8739d5|2.3.1| |[Improve][CDC] Optimize options & add docs for compatible_debezium_json (#4351)|https://github.com/apache/seatunnel/commit/336f590498|2.3.1| |Update CDC StartupMode and StopMode option to SingleChoiceOption (#4357)|https://github.com/apache/seatunnel/commit/f60ac1a5e9|2.3.1| |[bugfix][cdc-base] Fix cdc base shutdown thread not cleared (#4327)|https://github.com/apache/seatunnel/commit/ac61409bd8|2.3.1| |[Feature][CDC] Support export debezium-json format to kafka (#4339)|https://github.com/apache/seatunnel/commit/5817ec07bf|2.3.1| |[Feature][CDC] Support add & dorp tables when 
restore cdc jobs (#4254)|https://github.com/apache/seatunnel/commit/add75d7d5d|2.3.1| |[Improve][CDC][MySQL] Ennable binlog watermark compare (#4293)|https://github.com/apache/seatunnel/commit/b22fb259c8|2.3.1| |[Feature][CDC][Mysql] Support read database list (#4255)|https://github.com/apache/seatunnel/commit/3ca60c6fed|2.3.1| |Add redshift datatype convertor (#4245)|https://github.com/apache/seatunnel/commit/b19011517f|2.3.1| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[Improve] Support MySqlCatalog Use JDBC URL With Custom Suffix|https://github.com/apache/seatunnel/commit/210d0ff1f8|2.3.1| |[chore] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/291214ad6f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Hotfix][Zeta] Fix shuffle checkpoint (#4224)|https://github.com/apache/seatunnel/commit/507ca85611|2.3.1| |[improve][jdbc] Reduce jdbc options configuration (#4218)|https://github.com/apache/seatunnel/commit/ddd8f808b5|2.3.1| |[improve][cdc] support sharding-tables (#4207)|https://github.com/apache/seatunnel/commit/5c3f0c9b00|2.3.1| |[Hotfix][CDC] Fix multiple-table data read (#4200)|https://github.com/apache/seatunnel/commit/7f5671d2ce|2.3.1| |[hotfix][zeta] fix zeta multi-table parser error (#4193)|https://github.com/apache/seatunnel/commit/98f2ad0c19|2.3.1| |[Feature][Zeta] Support shuffle multiple rows by tableId (#4147)|https://github.com/apache/seatunnel/commit/8348f1a108|2.3.1| |[Feature][API] Add Metrics for Connector-V2 (#4017)|https://github.com/apache/seatunnel/commit/32e1f91c7a|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Feature][CDC] Support batch processing on multiple-table shuffle flow 
(#4116)|https://github.com/apache/seatunnel/commit/919653d83e|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][CDC] MySQL CDC supports deserialization of multi-tables (#4067)|https://github.com/apache/seatunnel/commit/21ef45fcca|2.3.1| |[Improve][Connector-V2][SQLServer-CDC] Add sqlserver cdc optionRule (#4019)|https://github.com/apache/seatunnel/commit/78df503392|2.3.1| |fix cdc option rule error (#4018)|https://github.com/apache/seatunnel/commit/ea160429df|2.3.1| |[Bug][CDC] Fix concurrent modify of splits (#3937)|https://github.com/apache/seatunnel/commit/29b04e2405|2.3.1| |[Improve][CDC][base] Guaranteed to be exactly-once in the process of switching from SnapshotTask to IncrementalTask (#3837)|https://github.com/apache/seatunnel/commit/8379aaf876|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][SqlServer CDC] fix SqlServerCDC IT failure (#3807)|https://github.com/apache/seatunnel/commit/fd66de5f98|2.3.1| |[Improve][CDC] Add mysql-cdc source factory (#3791)|https://github.com/apache/seatunnel/commit/356538de8a|2.3.1| |[feature][connector-v2] add sqlServer CDC (#3686)|https://github.com/apache/seatunnel/commit/0f0afb58af|2.3.0| |[doc][connector][cdc] add MySQL CDC Source doc (#3707)|https://github.com/apache/seatunnel/commit/555905b0b8|2.3.0| |[feature][e2e][cdc] add mysql cdc container (#3667)|https://github.com/apache/seatunnel/commit/7696ba1551|2.3.0| |[feature][cdc] Fixed error in mysql cdc under real-time job (#3666)|https://github.com/apache/seatunnel/commit/2238fda300|2.3.0| |[feature][connector][cdc] add SeaTunnelRowDebeziumDeserializeSchema (#3499)|https://github.com/apache/seatunnel/commit/ff44db116e|2.3.0| 
|[feature][connector][mysql-cdc] add MySQL CDC enumerator (#3481)|https://github.com/apache/seatunnel/commit/ff4b32dc28|2.3.0| |[bugfix][connector-v2] fix cdc mysql reader err (#3465)|https://github.com/apache/seatunnel/commit/1b406b5a31|2.3.0| |[feature][connector] add mysql cdc reader (#3455)|https://github.com/apache/seatunnel/commit/ae981df675|2.3.0| |[feature][connector][cdc] add cdc reader jdbc related (#3433)|https://github.com/apache/seatunnel/commit/7bf00fb19f|2.3.0| |[feature][connector][cdc] add CDC enumerator base classes (#3419)|https://github.com/apache/seatunnel/commit/9b1821f476|2.3.0| |[feature][Connector-v2][cdc] Add cdc base reader (#3407)|https://github.com/apache/seatunnel/commit/e454b80dcd|2.3.0| |[bigfix][Connector-v2][cdc] move version to 1.6.4 (#3389)|https://github.com/apache/seatunnel/commit/b50b543c3e|2.3.0| |[feature][connector][cdc] CDC base classes (#3363)|https://github.com/apache/seatunnel/commit/2586f305b4|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-clickhouse.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][Connector-Clickhouse] improve ck batch parallel read by using last batch row sorting value approach, instead of limit offset. (#9801)|https://github.com/apache/seatunnel/commit/5e9990afd5| dev | |[Feature][Connector-Clickhouse] Support Clickhouse multi table source read (#9704)|https://github.com/apache/seatunnel/commit/6e323743ea|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Fix][Connector-clickhouse] Fix SeaTunnelRow tableId set error (#9585)|https://github.com/apache/seatunnel/commit/01f1caa6fb|2.3.12| |[Improve][connector-clickhouse] Clickhouse support parallelism reading schema (#9446)|https://github.com/apache/seatunnel/commit/3ee0fab3a8|2.3.12| |[Feature][Connector-V2] Support multi-table sink feature for ClickHouse (#9301)|https://github.com/apache/seatunnel/commit/3524895136|2.3.11| |[Fix][Connector-V2] Fix the problem that missing options configuration when building ClickHouse Nodes (#9277)|https://github.com/apache/seatunnel/commit/051d19c3a9|2.3.11| |[Feature][Transform] Support define sink column type (#9114)|https://github.com/apache/seatunnel/commit/ab7119e507|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][API] Fixed not invoke the `SinkAggregatedCommitter`'s init method (#9070)|https://github.com/apache/seatunnel/commit/df0d11d632|2.3.11| |[Fix] [Clickhouse] Parallelism makes data duplicate (#8916)|https://github.com/apache/seatunnel/commit/45345f2738|2.3.10| |[Fix][Connector-V2]Fix Descriptions for CUSTOM_SQL in Connector (#8778)|https://github.com/apache/seatunnel/commit/96b610eb7e|2.3.10| |[improve] update clickhouse connector config option (#8755)|https://github.com/apache/seatunnel/commit/b964189b75|2.3.10| 
|[Fix][Connector-V2] fix starRocks automatically creates tables with comment (#8568)|https://github.com/apache/seatunnel/commit/c4cb1fc4a3|2.3.10| |[Fix][Connector-V2] Fixed adding table comments (#8514)|https://github.com/apache/seatunnel/commit/edca75b0d6|2.3.10| |[hotfix] fix exceptions caused by operator priority in connector-clickhouse when using sharding_key (#8162)|https://github.com/apache/seatunnel/commit/5560e3dab2|2.3.9| |[Imporve][ClickhouseFile] Directly connect to each shard node to obtain the corresponding path (#8449)|https://github.com/apache/seatunnel/commit/757641bada|2.3.9| |[Feature][ClickhouseFile] Support add publicKey to identity (#8351)|https://github.com/apache/seatunnel/commit/287b8c8219|2.3.9| |[Improve][ClickhouseFile] Improve rsync log output (#8332)|https://github.com/apache/seatunnel/commit/179223e3c2|2.3.9| |[Improve][ClickhouseFile] Added attach sql log for better debugging (#8315)|https://github.com/apache/seatunnel/commit/ade428c5fa|2.3.9| |[Chore] delete chinese desc in code (#8306)|https://github.com/apache/seatunnel/commit/a50a8b925f|2.3.9| |[Improve][ClickhouseFile Connector] Unified specifying clickhouse file generation path (#8302)|https://github.com/apache/seatunnel/commit/455f1ed760|2.3.9| |[Improve][ClickhouseFile] Clickhouse supports option configuration when connecting to shard nodes (#8297)|https://github.com/apache/seatunnel/commit/1ded1b6206|2.3.9| |[Imporve][ClickhouseFile] Improve clickhousefile generation parameter configuration (#8293)|https://github.com/apache/seatunnel/commit/753e058fee|2.3.9| |[Improve][ClickhouseFile] ClickhouseFile Connector's rsync transmission supports specifying users (#8236)|https://github.com/apache/seatunnel/commit/e012bd0a4f|2.3.9| |[Feature][Clickhouse] Support sink savemode (#8086)|https://github.com/apache/seatunnel/commit/e6f92fd79b|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Fix][Connecotr-V2] Fix clickhouse 
sink does not support composite primary key (#8021)|https://github.com/apache/seatunnel/commit/24d0542595|2.3.9| |[Improve] update clickhouse connector, use factory to create source/sink (#7946)|https://github.com/apache/seatunnel/commit/b69fceceee|2.3.9| |[Fix][Connector-V2] Fixed clickhouse connectors cannot stop under multiple parallelism (#7921)|https://github.com/apache/seatunnel/commit/8d9c6a3714|2.3.9| |Bump commons-io:commons-io from 2.11.0 to 2.14.0 in /seatunnel-connectors-v2/connector-clickhouse (#7784)|https://github.com/apache/seatunnel/commit/f4393a02bf|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Improve some connectors prepare check error message (#7465)|https://github.com/apache/seatunnel/commit/6930a25edd|2.3.8| |[Improve][Connector-V2] Close all ResultSet after used (#7389)|https://github.com/apache/seatunnel/commit/853e973212|2.3.8| |[Feature][Connector-V2][Clickhouse] Add clickhouse.config to the source connector (#7143)|https://github.com/apache/seatunnel/commit/f7994d9ae9|2.3.6| |[Improve] Make ClickhouseFileSinker support tables containing materialized columns (#6956)|https://github.com/apache/seatunnel/commit/87c6adcc2e|2.3.6| |[Improve] [Clickhouse] Remove check when set allow_experimental_lightweight_delete false(#6727) (#6728)|https://github.com/apache/seatunnel/commit/b25e1b1ae5|2.3.6| |[Improve][Common] Adapt `FILE_OPERATION_FAILED` to `CommonError` (#5928)|https://github.com/apache/seatunnel/commit/b3dc0bbc21|2.3.4| |[Improve][Connector-V2] Replace CommonErrorCodeDeprecated.JSON_OPERATION_FAILED (#5978)|https://github.com/apache/seatunnel/commit/456cd17714|2.3.4| |[Feature][Core] Upgrade flink source translation (#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[Improve] Speed up ClickhouseFile Local generate a mmap object (#5822)|https://github.com/apache/seatunnel/commit/cf39e29dad|2.3.4| 
|[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Hotfix][connector-v2][clickhouse] Fixed an out-of-order BUG with output data fields of clickhouse-sink (#5346)|https://github.com/apache/seatunnel/commit/fce9ddaa2b|2.3.4| |[Bugfix][Clickhouse] Fix clickhouse sink flush bug (#5448)|https://github.com/apache/seatunnel/commit/cef03f6673|2.3.4| |[Hotfix][Clickhouse] Fix clickhouse old version compatibility (#5326)|https://github.com/apache/seatunnel/commit/1da49f5a2b|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Feature][Connector-V2][Clickhouse] Add clickhouse connector time zone key,default system time zone (#5078)|https://github.com/apache/seatunnel/commit/309b58d12d|2.3.3| |[Bugfix]fix clickhouse source connector read Nullable() type is not null,example:Nullable(Float64) while value is null the result is 0.0 (#5080)|https://github.com/apache/seatunnel/commit/cf3d0bba2e|2.3.3| |[Feature][Connector-V2][Clickhouse] clickhouse writes with checkpoints (#4999)|https://github.com/apache/seatunnel/commit/f8fefa1e57|2.3.3| |[Hotfix][Connector-V2][ClickhouseFile] Fix ClickhouseFile write file failed when field value is null (#4937)|https://github.com/apache/seatunnel/commit/06671474ca|2.3.3| |[Hotfix][connector-clickhouse] fix get clickhouse local table name with closing bracket from distributed table engineFull (#4710)|https://github.com/apache/seatunnel/commit/e5e0cba26d|2.3.2| |[Bug] [Connector-V2] Clickhouse File Connector failed to sink to table with settings like storage_policy 
(#4172)|https://github.com/apache/seatunnel/commit/e120dc44bc|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Bug] [Connector-V2] Clickhouse File Connector not support split mode for write data to all shards of distributed table (#4035)|https://github.com/apache/seatunnel/commit/3f1dcfc915|2.3.1| |[Hotfix][Connector-V2] Fix connector source snapshot state NPE (#4027)|https://github.com/apache/seatunnel/commit/e39c4988cc|2.3.1| |[Hotfix][Connector-v2][Clickhouse] Fix clickhouse write cdc changelog update event (#3951)|https://github.com/apache/seatunnel/commit/67e6027970|2.3.1| |[Feature][shade][Jackson] Add seatunnel-jackson module (#3947)|https://github.com/apache/seatunnel/commit/5d8862ec9c|2.3.1| |[Improve][Connector-V2][Clickhouse] Improve performance (#3910)|https://github.com/apache/seatunnel/commit/aeceb855f6|2.3.1| |[Improve] [Connector-V2] Remove Clickhouse Fields Config (#3826)|https://github.com/apache/seatunnel/commit/74704c362a|2.3.1| |[Improve][Connector-V2][clickhouse] Special characters in column names are supported (#3881)|https://github.com/apache/seatunnel/commit/9069609c17|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve] [Connector-V2] Change Connector Custom Config Prefix To Map (#3719)|https://github.com/apache/seatunnel/commit/ef1b8b1bb5|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Bug] [Connector-V2] Fix ClickhouseFile Committer Serializable Problems (#3803)|https://github.com/apache/seatunnel/commit/1b26192cb3|2.3.1| |[feature][connector-v2][clickhouse] Support write cdc changelog event in clickhouse sink 
(#3653)|https://github.com/apache/seatunnel/commit/6093c213bf|2.3.0| |[Connector-V2] [Clickhouse] Improve Clickhouse File Connector (#3416)|https://github.com/apache/seatunnel/commit/e07e9a7cc2|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Clickhouse] Unified exception for Clickhouse source & sink connector (#3563)|https://github.com/apache/seatunnel/commit/04e1743d9e|2.3.0| |options in conditional need add to required or optional options (#3501)|https://github.com/apache/seatunnel/commit/51d5bcba10|2.3.0| |[Feature][Connector-V2][Clickhouse]Optimize clickhouse connector data type inject (#3471)|https://github.com/apache/seatunnel/commit/9bd0fc8ee2|2.3.0| |[improve][connector-v2][clickhouse] Fix DoubleInjectFunction (#3441)|https://github.com/apache/seatunnel/commit/9781a6a385|2.3.0| |[feature][api] add option validation for the ReadonlyConfig (#3417)|https://github.com/apache/seatunnel/commit/4f824fea36|2.3.0| |[improve][connector] The Factory#factoryIdentifier must be consistent with PluginIdentifierInterface#getPluginName (#3328)|https://github.com/apache/seatunnel/commit/d9519d696a|2.3.0| |[Improve][Connector-V2] Add Clickhouse and Assert Source/Sink Factory (#3306)|https://github.com/apache/seatunnel/commit/9e4a128381|2.3.0| |[Improve][Clickhouse-V2] Clickhouse Support Geo type (#3141)|https://github.com/apache/seatunnel/commit/01cdc4e336|2.3.0| |[Improve][Connector-V2][Clickhouse] Support nest type and array (#3047)|https://github.com/apache/seatunnel/commit/97b5727ec6|2.3.0| |[Feature][Connector-V2-Clickhouse] Clickhouse Source random use host when config multi-host (#3108)|https://github.com/apache/seatunnel/commit/c9583b7f63|2.3.0-beta| |[Improve] [Clickhouse-V2] Clickhouse Support Int128,Int256 Type (#3067)|https://github.com/apache/seatunnel/commit/e118ccea0a|2.3.0-beta| |[Improve][all] change Log to @Slf4j 
(#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Connector-V2] [Clickhouse] Fix Clickhouse Type Mapping and Spark Map reconvert Bug (#2767)|https://github.com/apache/seatunnel/commit/f0a1f5013a|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Feature][Connector-V1 & V2] Support unauthorized ClickHouse (#2393)|https://github.com/apache/seatunnel/commit/0e4e2b1230|2.2.0-beta| |[Feature][connector] clickhousefile sink connector support non-root username for fileTransfer (#2263)|https://github.com/apache/seatunnel/commit/704661f1fd|2.2.0-beta| |StateT of SeaTunnelSource should extend `Serializable` (#2214)|https://github.com/apache/seatunnel/commit/8c426ef850|2.2.0-beta| |[Bug] [connector-v2] When outputting data to clickhouse, a ClassCastException was encountered (#2160)|https://github.com/apache/seatunnel/commit/a3a2b5d189|2.2.0-beta| |[API-DRAFT] [MERGE] fix merge error|https://github.com/apache/seatunnel/commit/736ac01c89|2.2.0-beta| |merge dev to api-draft|https://github.com/apache/seatunnel/commit/d265597c64|2.2.0-beta| |[api-draft][connector] support Rsync to transfer clickhouse data file (#2080)|https://github.com/apache/seatunnel/commit/02a41902a8|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-cloudberry.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Connector] Add Apache Cloudberry Support (#8985)|https://github.com/apache/seatunnel/commit/b6f82c1|dev|
================================================ FILE: docs/en/connectors/changelog/connector-common.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Chore] fix typos filed -> field (#9757)|https://github.com/apache/seatunnel/commit/e3e1c67d29|2.3.12| |[Feature][Connector-File-Hadoop]Support multi table sink feature for HdfsFile (#9651)|https://github.com/apache/seatunnel/commit/bb4f743c05|2.3.12| |[Fix][Connector-V2] ArrowToSeatunnelRowReader convertSeatunnelRowValue add handle Second TIMESTAMP type (#9393)|https://github.com/apache/seatunnel/commit/0555f8520b|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][API] Fixed not invoke the `SinkAggregatedCommitter`'s init method (#9070)|https://github.com/apache/seatunnel/commit/df0d11d632|2.3.11| |[Improve] Refactor file enumerator to prevent duplicate put split (#8989)|https://github.com/apache/seatunnel/commit/fdf1beae9c|2.3.11| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Fix][Connector-v2] Add DateMilliConvertor to Convert DateMilliVector into Default Timezone (#8736)|https://github.com/apache/seatunnel/commit/7b8298a8a4|2.3.10| |[Fix][Connector-V2] fix starRocks automatically creates tables with comment (#8568)|https://github.com/apache/seatunnel/commit/c4cb1fc4a3|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Fix][Connector-V2] Fixed adding table comments (#8514)|https://github.com/apache/seatunnel/commit/edca75b0d6|2.3.10| |[Feature][Core] Support read arrow data (#8137)|https://github.com/apache/seatunnel/commit/4710ea0f8d|2.3.9| |[Feature][Clickhouse] Support sink savemode (#8086)|https://github.com/apache/seatunnel/commit/e6f92fd79b|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Fix][Connector-V2] Fix 
AbstractSingleSplitReader lock useless when do checkpoint (#7764)|https://github.com/apache/seatunnel/commit/a941b91628|2.3.9| |[Improve][Core] Move MultiTableSink to seatunnel-api module (#7243)|https://github.com/apache/seatunnel/commit/cc5949988b|2.3.6| |[Feature][Connector-V2] Support jdbc hana catalog and type convertor (#6950)|https://github.com/apache/seatunnel/commit/d663398739|2.3.6| |[Improve][CDC] Close idle subtasks gorup(reader/writer) in increment phase (#6526)|https://github.com/apache/seatunnel/commit/454c339b9c|2.3.6| |[Fix] Fix MultiTableWriterRunnable can not catch Throwable error (#6734)|https://github.com/apache/seatunnel/commit/d826cf9ece|2.3.6| |[Fix][Connector-v2] Fix the sql statement error of create table for doris and starrocks (#6679)|https://github.com/apache/seatunnel/commit/88263cd69f|2.3.6| |[Improve][CDC] Optimize split state memory allocation in increment phase (#6554)|https://github.com/apache/seatunnel/commit/fe33422161|2.3.5| |[Feature][Core] Support event listener for job (#6419)|https://github.com/apache/seatunnel/commit/831d0022eb|2.3.5| |[Improve] Improve MultiTableSinkWriter prepare commit performance (#6495)|https://github.com/apache/seatunnel/commit/2086b0e8a6|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Hotfix][Zeta] Fix job can not restore when last checkpoint failed (#6193)|https://github.com/apache/seatunnel/commit/59f60b9f73|2.3.4| |[Improve] Extend `SupportResourceShare` to spark/flink (#5847)|https://github.com/apache/seatunnel/commit/c69da93b87|2.3.4| |[Feature][Core] Upgrade flink source translation (#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[Fix] Fix MultiTableSinkWriter thread index always 1 (#5832)|https://github.com/apache/seatunnel/commit/a6523ba368|2.3.4| |[Improve][Connector-V2][Common] Remove assert key word. 
(#5915)|https://github.com/apache/seatunnel/commit/d757dcd1fc|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Fix] Fix MultiTableSink restore failed when add new table (#5746)|https://github.com/apache/seatunnel/commit/21503bd771|2.3.4| |[feature][connector-jdbc]Add Save Mode function and Connector-JDBC (MySQL) connector has been realized (#5663)|https://github.com/apache/seatunnel/commit/eff17ccbe5|2.3.4| |[Improve] Add default implement for `SeaTunnelSink::setTypeInfo` (#5682)|https://github.com/apache/seatunnel/commit/86cba87450|2.3.4| |[Fix] Fix MultiTableSink return committer but sink do not support (#5710)|https://github.com/apache/seatunnel/commit/c413040a6e|2.3.4| |[Fix] Fix log error when multi-table sink close (#5683)|https://github.com/apache/seatunnel/commit/fea4b6f268|2.3.4| |[Feature] Support multi-table sink (#5620)|https://github.com/apache/seatunnel/commit/81ac173189|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Bugfix][zeta] Fix cdc connection does not close (#4922)|https://github.com/apache/seatunnel/commit/a2d2f2dda8|2.3.3| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Improve][SeaTunnelSchema] Complete data type prompt. 
(#4181)|https://github.com/apache/seatunnel/commit/9e92593709|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |Add Kafka catalog (#4106)|https://github.com/apache/seatunnel/commit/34f1f21e48|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][Connector-V2] Fix ConcurrentModificationException when snapshotState based on SourceReaderBase (#4011)|https://github.com/apache/seatunnel/commit/cd2bd6a408|2.3.1| |[Feature][shade][Jackson] Add seatunnel-jackson module (#3947)|https://github.com/apache/seatunnel/commit/5d8862ec9c|2.3.1| |[Improve][CDC][base] Guaranteed to be exactly-once in the process of switching from SnapshotTask to IncrementalTask (#3837)|https://github.com/apache/seatunnel/commit/8379aaf876|2.3.1| |[feature][cdc] Fixed error in mysql cdc under real-time job (#3666)|https://github.com/apache/seatunnel/commit/2238fda300|2.3.0| |[Feature][Connector-V2][AmazonDynamoDB] Add Factory for AmazonDynamoDB (#3348)|https://github.com/apache/seatunnel/commit/a0068efdbf|2.3.0| |[Feature][Connector-V2][SeaTunnelSchema] Improve code structure (#3384)|https://github.com/apache/seatunnel/commit/98b9168d5a|2.3.0| |[feature][connector][common] Add `SingleThreadMultiplexSourceReaderBase (#3335)|https://github.com/apache/seatunnel/commit/f4e33b5912|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Feature][Connector-V2] [Amazondynamodb Connector]add amazondynamodb source & sink connnector (#3166)|https://github.com/apache/seatunnel/commit/183bac02f0|2.3.0| |unify `flatten-maven-plugin` version (#3078)|https://github.com/apache/seatunnel/commit/ed743fddcc|2.3.0-beta| |Merge remote-tracking branch 
'upstream/dev' into st-engine|https://github.com/apache/seatunnel/commit/73a699d47b|2.3.0-beta| |[Imporve][Connector-V2] Imporve iotdb connector (#2917)|https://github.com/apache/seatunnel/commit/3da11ce19b|2.3.0-beta| |Merge remote-tracking branch 'upstream/dev' into st-engine|https://github.com/apache/seatunnel/commit/ca80df779a|2.3.0-beta| |[Connector-V2] [ElasticSearch] Fix ElasticSearch Connector V2 Bug (#2817)|https://github.com/apache/seatunnel/commit/2fcbbf464a|2.2.0-beta| |[Improve][SeaTunnel-Schema] Support parse row type from config file (#2771)|https://github.com/apache/seatunnel/commit/9f59fc1874|2.2.0-beta| |[Bug][Core] Fix the bug that can not convert array and map (#2750)|https://github.com/apache/seatunnel/commit/6db4d7595d|2.2.0-beta| |[Improve][build] Improved scope of maven-shade-plugin (#2665)|https://github.com/apache/seatunnel/commit/93bc8bd116|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[hotfix][engine][dag] Loss of parallelism when recreating actions. 
(#2519)|https://github.com/apache/seatunnel/commit/7953ac149f|2.3.0-beta| |[hotfix] fix user-defined schema for bytes type translattion (#2530)|https://github.com/apache/seatunnel/commit/0491a33edc|2.2.0-beta| |[Imporve][Fake-Connector-V2]support user-defined-schmea and random data for fake-table (#2406)|https://github.com/apache/seatunnel/commit/a5447528c3|2.2.0-beta| |[Feature][Connector-V2] Local file json support (#2465)|https://github.com/apache/seatunnel/commit/65a92f2496|2.2.0-beta| |[Improve][Connector-V2] Http source support user-defined schema (#2439)|https://github.com/apache/seatunnel/commit/793933b6b8|2.2.0-beta| |[Engine][Task] Add task runtime logic (#2386)|https://github.com/apache/seatunnel/commit/14d3b92a54|2.3.0-beta| |[Feature][Connector-V2] Support user-defined schema for source connectors (#2392)|https://github.com/apache/seatunnel/commit/6b650bef07|2.2.0-beta| |Merge from dev to st-engine (#2243)|https://github.com/apache/seatunnel/commit/41e530afd5|2.3.0-beta| |StateT of SeaTunnelSource should extend `Serializable` (#2214)|https://github.com/apache/seatunnel/commit/8c426ef850|2.2.0-beta| |[Improvement][new api] refer to https://github.com/apache/incubator-seatunnel/issues/2127 (#2144)|https://github.com/apache/seatunnel/commit/e19660a049|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-console.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] console sink options (#8743)|https://github.com/apache/seatunnel/commit/c439b99f19|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add UT class name check (#8182)|https://github.com/apache/seatunnel/commit/9cf4192fe4|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Core] Add event notify for all connector (#7501)|https://github.com/apache/seatunnel/commit/d71337b0e9|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |Update ConsoleSinkFactory.java (#7350)|https://github.com/apache/seatunnel/commit/921662722f|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Feature][Core] Support event listener for job (#6419)|https://github.com/apache/seatunnel/commit/831d0022eb|2.3.5| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve] Add default implement for `SeaTunnelSink::setTypeInfo` (#5682)|https://github.com/apache/seatunnel/commit/86cba87450|2.3.4| |[Feature] Support multi-table sink (#5620)|https://github.com/apache/seatunnel/commit/81ac173189|2.3.4| |[Improve] Refactor CatalogTable and add `SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[Feature] [api env] Add job-level configuration for checkpoint timeout. 
(#5222)|https://github.com/apache/seatunnel/commit/3c13275ed9|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Feature][CDC][Zeta] Support schema evolution framework(DDL) (#5125)|https://github.com/apache/seatunnel/commit/4f89c1d272|2.3.3| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[hotfix][zeta] fix zeta multi-table parser error (#4193)|https://github.com/apache/seatunnel/commit/98f2ad0c19|2.3.1| |[Feature][Zeta] Support shuffle multiple rows by tableId (#4147)|https://github.com/apache/seatunnel/commit/8348f1a108|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2]console sink output content to slf4j log (#3745)|https://github.com/apache/seatunnel/commit/82a5c852d8|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[improve][connector] The Factory#factoryIdentifier must be consistent with PluginIdentifierInterface#getPluginName (#3328)|https://github.com/apache/seatunnel/commit/d9519d696a|2.3.0| |[Improve][Connector-V2][Console] Add Console option rule (#3322)|https://github.com/apache/seatunnel/commit/efb4711600|2.3.0| |[Improve][connector][console] print subtask index (#3000)|https://github.com/apache/seatunnel/commit/de345783d9|2.3.0-beta| |[Bug][Connector-V2] Fix the bug that can not print SeaTunnelRow correctly (#2749)|https://github.com/apache/seatunnel/commit/9365d35200|2.2.0-beta| |[Feature][Connector-V2] Add iceberg source connector 
(#2615)|https://github.com/apache/seatunnel/commit/ffc6088a79|2.2.0-beta| |[Bug][ConsoleSinkV2]fix fieldToString StackOverflow and add Unit-Test (#2545)|https://github.com/apache/seatunnel/commit/6f87094569|2.2.0-beta| |[Improve][Console] improve console to printf schema and deepToString fields (#2517)|https://github.com/apache/seatunnel/commit/963387d375|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-databend.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Connector-V2] Support databend source/sink connector (#9331)|https://github.com/apache/seatunnel/commit/2f96f2e46c|2.3.12|
================================================ FILE: docs/en/connectors/changelog/connector-datahub.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Connector-V2] Make some sink parameters optional for DataHub (#9229)|https://github.com/apache/seatunnel/commit/7418fae10c|2.3.11| |[Feature][Connector-V2] Datahub support multi-table sink (#9212)|https://github.com/apache/seatunnel/commit/7027162dec|2.3.11| |[improve] datahub sink options (#8744)|https://github.com/apache/seatunnel/commit/88f35bd705|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][DataHub] Unified exception for DataHub sink connector & change package name of DataHub (#3446)|https://github.com/apache/seatunnel/commit/395635fa18|2.3.0| |[improve][connector] The Factory#factoryIdentifier must be consistent with PluginIdentifierInterface#getPluginName (#3328)|https://github.com/apache/seatunnel/commit/d9519d696a|2.3.0| |[Improve][Connector-V2][DataHub] Add DataHub Sink Factory (#3323)|https://github.com/apache/seatunnel/commit/685978d061|2.3.0| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Feature][Connector-V2]Support datahub sink (#2558)|https://github.com/apache/seatunnel/commit/43600a7049|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-dingtalk.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] dingtalk sink options (#8742)|https://github.com/apache/seatunnel/commit/f2145dcc4f|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][DingTalk] Unified exception for dingtalk sink connector (#3678)|https://github.com/apache/seatunnel/commit/0a09562515|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[improve][connector] The Factory#factoryIdentifier must be consistent with PluginIdentifierInterface#getPluginName (#3328)|https://github.com/apache/seatunnel/commit/d9519d696a|2.3.0| |[Improve][Connector-V2][DingTalk] Add DingTalk Sink Factory (#3324)|https://github.com/apache/seatunnel/commit/56be228ad2|2.3.0| |[Improve][build] Improved scope of maven-shade-plugin (#2665)|https://github.com/apache/seatunnel/commit/93bc8bd116|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Feature][Connector-V2] Add Dingtalk Sink #2257 (#2285)|https://github.com/apache/seatunnel/commit/88a26d5a29|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-doris.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Fix][Connector-V2] Fix misleading parameter name in DorisStreamLoad (#9685)|https://github.com/apache/seatunnel/commit/16618c8019|2.3.12| |[improve]improve FE node failover logging for better observability (#9657)|https://github.com/apache/seatunnel/commit/ebc9ee3915|2.3.12| |[Feature][Connector-doris] Adds case insensitivity feature (#9306)|https://github.com/apache/seatunnel/commit/9d1cffa5e1|2.3.11| |[Feature][Transform] Support define sink column type (#9114)|https://github.com/apache/seatunnel/commit/ab7119e507|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve][connector-doris] Improved doris source enumerator splits allocation algorithm for subtasks (#9108)|https://github.com/apache/seatunnel/commit/5f55e31c29|2.3.11| |[Improve] doris options (#8745)|https://github.com/apache/seatunnel/commit/268d76cbf3|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Fix][Connector-V2] fix starRocks automatically creates tables with comment (#8568)|https://github.com/apache/seatunnel/commit/c4cb1fc4a3|2.3.10| |[Fix][Connector-V2] Fixed adding table comments (#8514)|https://github.com/apache/seatunnel/commit/edca75b0d6|2.3.10| |[Fix][Doris] Fix catalog not closed (#8415)|https://github.com/apache/seatunnel/commit/2d1db66b9f|2.3.9| |[Feature][Connector-V2[Doris]Support sink ddl (#8250)|https://github.com/apache/seatunnel/commit/ecd8269f2e|2.3.9| |[Feature][Connector-V2]Support Doris Fe Node HA 
(#8311)|https://github.com/apache/seatunnel/commit/3e86102f47|2.3.9| |[Feature][Core] Support read arrow data (#8137)|https://github.com/apache/seatunnel/commit/4710ea0f8d|2.3.9| |[Feature][Clickhouse] Support sink savemode (#8086)|https://github.com/apache/seatunnel/commit/e6f92fd79b|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Doris] Support multi-table source read (#7895)|https://github.com/apache/seatunnel/commit/10c37acb34|2.3.9| |[Improve][Connector-V2] Add doris/starrocks create table with comment (#7847)|https://github.com/apache/seatunnel/commit/207b8c16fd|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fixbug] doris custom sql work (#7464)|https://github.com/apache/seatunnel/commit/5c6a7c6984|2.3.8| |[Improve][API] Move catalog open to SaveModeHandler (#7439)|https://github.com/apache/seatunnel/commit/8c2c5c79a1|2.3.8| |[Improve][Connector-V2] Close all ResultSet after used (#7389)|https://github.com/apache/seatunnel/commit/853e973212|2.3.8| |Revert "[Fix][Connector-V2] Fix doris primary key order and fields order are inconsistent (#7377)" (#7402)|https://github.com/apache/seatunnel/commit/bb72d91770|2.3.8| |[Fix][Connector-V2] Fix doris primary key order and fields order are inconsistent (#7377)|https://github.com/apache/seatunnel/commit/464da8fb9b|2.3.7| |[Bugfix][Doris-connector] Fix Json serialization, null value causes data error problem|https://github.com/apache/seatunnel/commit/7b19df585f|2.3.7| |[Improve][Connector-V2] Improve doris error msg (#7343)|https://github.com/apache/seatunnel/commit/16950a67cd|2.3.7| |[Fix][Doris] Fix the abnormality of deleting data in CDC scenario. (#7315)|https://github.com/apache/seatunnel/commit/bb2c912404|2.3.7| |fix [Bug] Unable to create a source for identifier 'Iceberg'. 
#7182 (#7279)|https://github.com/apache/seatunnel/commit/4897491708|2.3.7| |[Fix][Connector-V2] Fix doris TRANSFER_ENCODING header error (#7267)|https://github.com/apache/seatunnel/commit/d886495584|2.3.6| |[Improve][Doris Connector] Unified serialization method,Use RowToJsonConverter and TextSerializationSchema (#7229)|https://github.com/apache/seatunnel/commit/4b3af9bef4|2.3.6| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Improve][Zeta] Move SaveMode behavior to master (#6843)|https://github.com/apache/seatunnel/commit/80cf91318d|2.3.6| |[bugFix][Connector-V2][Doris] The multi-FE configuration is supported (#6341)|https://github.com/apache/seatunnel/commit/b6d075194b|2.3.6| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Improve] Improve doris create table template default value (#6720)|https://github.com/apache/seatunnel/commit/bd64740314|2.3.6| |[Bug Fix] Sink Doris error status(#6753) (#6755)|https://github.com/apache/seatunnel/commit/0ce2c0f220|2.3.6| |[Improve] Improve doris stream load client side error message (#6688)|https://github.com/apache/seatunnel/commit/007a9940e3|2.3.6| |[Fix][Connector-v2] Fix the sql statement error of create table for doris and starrocks (#6679)|https://github.com/apache/seatunnel/commit/88263cd69f|2.3.6| |[Fix][Connector-V2] Fixed doris/starrocks create table sql parse error (#6580)|https://github.com/apache/seatunnel/commit/f2ed1fbde0|2.3.5| |[Fix][Connector-V2] Fix doris sink can not be closed when stream load not read any data (#6570)|https://github.com/apache/seatunnel/commit/341615f488|2.3.5| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Improve] Add SaveMode log of process detail 
(#6375)|https://github.com/apache/seatunnel/commit/b0d70ce224|2.3.5| |[Feature] Support nanosecond in Doris DateTimeV2 type (#6358)|https://github.com/apache/seatunnel/commit/76967066bf|2.3.5| |[Fix][Connector-V2] Fix doris source select fields loss primary key information (#6339)|https://github.com/apache/seatunnel/commit/78abe2f202|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Fix] Fix doris stream load failed not reported error (#6315)|https://github.com/apache/seatunnel/commit/a09a5a2bb8|2.3.5| |[Improve][Connector-V2] Doris stream load use FE instead of BE (#6235)|https://github.com/apache/seatunnel/commit/0a7acdce95|2.3.4| |[Feature][Connector-V2][Doris] Add Doris ConnectorV2 Source (#6161)|https://github.com/apache/seatunnel/commit/fc2d80382a|2.3.4| |[Improve] Improve doris sink to random use be (#6132)|https://github.com/apache/seatunnel/commit/869417660e|2.3.4| |[Feature] Support SaveMode on Doris (#6085)|https://github.com/apache/seatunnel/commit/b2375fffe8|2.3.4| |[Improve] Add batch flush in doris sink (#6024)|https://github.com/apache/seatunnel/commit/2c5b48e907|2.3.4| |[Fix] Fix DorisCatalog not implement `name` method (#5988)|https://github.com/apache/seatunnel/commit/d4a323efef|2.3.4| |[Feature][Catalog] Doris Catalog (#5175)|https://github.com/apache/seatunnel/commit/1d3e335d8e|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][Connector] Add field name to `DataTypeConvertor` to improve error message (#5782)|https://github.com/apache/seatunnel/commit/ab60790f0d|2.3.4| |[Chore] Using try-with-resources to simplify the code. 
(#4995)|https://github.com/apache/seatunnel/commit/d0aff52425|2.3.4| |[Fix] Fix RestService report NullPointerException (#5319)|https://github.com/apache/seatunnel/commit/5d4b319477|2.3.4| |[feature][doris] Doris factory type (#5061)|https://github.com/apache/seatunnel/commit/d952cea43c|2.3.3| |[Bug][connector-v2][doris] add streamload Content-type for doris URLdecode error (#4880)|https://github.com/apache/seatunnel/commit/1b91816021|2.3.3| |[Bug][Connector-V2][Doris] update last checkpoint id when doing snapshot (#4881)|https://github.com/apache/seatunnel/commit/0360e7e518|2.3.2| |[Improve] Add a jobId to the doris label to distinguish between tasks (#4839)|https://github.com/apache/seatunnel/commit/6672e94077|2.3.2| |[BUG][Doris] Add a jobId to the doris label to distinguish between tasks (#4853)|https://github.com/apache/seatunnel/commit/20ee2faecf|2.3.2| |[Improve][Connector-V2][Doris]Remove serialization code that is no longer used (#4313)|https://github.com/apache/seatunnel/commit/0c0e5f978e|2.3.1| |[Improve][Connector-V2][Doris] Refactor some Doris Sink code as well as support 2pc and cdc (#4235)|https://github.com/apache/seatunnel/commit/7c4005af85|2.3.1| |[Hotfix][Connector][Doris] Fix Content Length header already present (#4277)|https://github.com/apache/seatunnel/commit/df82b77153|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Doris] Change Doris Config Prefix (#3856)|https://github.com/apache/seatunnel/commit/16e39a506b|2.3.1| |[Feature][Connector-V2][Doris] Add Doris StreamLoad sink connector (#3631)|https://github.com/apache/seatunnel/commit/72158be395|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-druid.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[improve] update Druid connector config option (#8594)|https://github.com/apache/seatunnel/commit/07a2288a2e|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Feature][Druid]Support multi table for druid sink (#7023)|https://github.com/apache/seatunnel/commit/476d492165|2.3.6| |[Feature][Connector] Add druid sink connector (#6346)|https://github.com/apache/seatunnel/commit/d7fa9afdfe|2.3.6|
================================================ FILE: docs/en/connectors/changelog/connector-easysearch.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] EasySearch support schema_save_mode/data_save_mode (#9310)|https://github.com/apache/seatunnel/commit/3ceb57f279|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve] easysearch options (#8951)|https://github.com/apache/seatunnel/commit/349f142962|2.3.10| |[Fix] Fix error log name for SourceSplitEnumerator implements class (#8817)|https://github.com/apache/seatunnel/commit/55ed90ecaf|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connector-V2] Fix known directory create and delete ignore issues (#7700)|https://github.com/apache/seatunnel/commit/e2fb679577|2.3.8| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Hotfix] Fix compile error (#6463)|https://github.com/apache/seatunnel/commit/943bd48449|2.3.5| |[Improve][Connector-V2] Support INFINI Easysearch (#5933)|https://github.com/apache/seatunnel/commit/41e628840a|2.3.5|
================================================ FILE: docs/en/connectors/changelog/connector-elasticsearch.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Transform-V2] Support vector series sql function (#9765)|https://github.com/apache/seatunnel/commit/a40114cf7a|2.3.12| |[Feature][elasticsearch-connector] Add API key authentication support (#9610)|https://github.com/apache/seatunnel/commit/a2bfe1a530|2.3.12| |[Feature][Connectors-V2][Elasticsearch] Support vector transformation sink (#9330)|https://github.com/apache/seatunnel/commit/a1ce97155f|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Feature][connector-elasticsearch] elasticsearch source support PIT (#9150)|https://github.com/apache/seatunnel/commit/948d588d06|2.3.11| |[Bugfix][Elasticsearch] Fix add column event (#9069)|https://github.com/apache/seatunnel/commit/3455316981|2.3.11| |[Feature][elasticsearch-connector] support elasticsearch sql source (#8895)|https://github.com/apache/seatunnel/commit/8140862795|2.3.10| |[Fix] Fix error log name for SourceSplitEnumerator implements class (#8817)|https://github.com/apache/seatunnel/commit/55ed90ecaf|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[improve] add Elasticsearch options (#8623)|https://github.com/apache/seatunnel/commit/d307ab44f2|2.3.10| |[Fix][connector-elasticsearch] support elasticsearch nest type && spark with Array<map> (#8492)|https://github.com/apache/seatunnel/commit/92d2a4a106|2.3.10| |Revert "[Feature][connector-elasticsearch] elasticsearch support nested type (#8462)" (#8485)|https://github.com/apache/seatunnel/commit/c68944893a|2.3.9| |[Feature][connector-elasticsearch] elasticsearch support nested type (#8462)|https://github.com/apache/seatunnel/commit/eaa15e4c8d|2.3.9| |[Feature][Elasticsearch] Support sink ddl (#8412)|https://github.com/apache/seatunnel/commit/a4a38ccff2|2.3.9| 
|[hotfix][connector-elasticsearch-sink] Convert index to lowercase (#8429)|https://github.com/apache/seatunnel/commit/46fcb237c8|2.3.9| |[Improve][Elasticsearch] Truncate the exception message body for request errors (#8263)|https://github.com/apache/seatunnel/commit/b9d850e61c|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connector-V2] Fix known directory create and delete ignore issues (#7700)|https://github.com/apache/seatunnel/commit/e2fb679577|2.3.8| |[Feature][Elastic search] Support multi-table source feature (#7502)|https://github.com/apache/seatunnel/commit/29fbeb2547|2.3.8| |[Hotfix][Connector-V2] Fix null not inserted in es (#7493)|https://github.com/apache/seatunnel/commit/a4ba6a171c|2.3.8| |[Improve][API] Move catalog open to SaveModeHandler (#7439)|https://github.com/apache/seatunnel/commit/8c2c5c79a1|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Fix][Connector-V2][Elasticsearch]Fix sink configuration for DROP_DATA (#7124)|https://github.com/apache/seatunnel/commit/bb9fd516ec|2.3.6| |[Feature][Elasticsearch] Support multi-table sink write #7041 (#7052)|https://github.com/apache/seatunnel/commit/45653e1d22|2.3.6| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Fix][Connector-V2] Remove Some Incorrect Comments and Properties in ElasticsearchCommitInfo|https://github.com/apache/seatunnel/commit/720298775a|2.3.6| |[Bug][Improve][Connector-v2][ElasticsearchSource] Fix behavior when source empty,Support 
SourceConfig.SOURCE field empty. (#6425)|https://github.com/apache/seatunnel/commit/4e98eb8639|2.3.6| |[Improve][Connector-V2] Add ElasticSearch type converter (#6546)|https://github.com/apache/seatunnel/commit/505c1252bd|2.3.5| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Improve] Add SaveMode log of process detail (#6375)|https://github.com/apache/seatunnel/commit/b0d70ce224|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Improve] Implement ElasticSearch connector factory (#6181)|https://github.com/apache/seatunnel/commit/1fd854de67|2.3.4| |[Feature][Connector] add elasticsearch save_mode (#6046)|https://github.com/apache/seatunnel/commit/716a36ac3e|2.3.4| |[Improve][Connector-V2] Replace CommonErrorCodeDeprecated.JSON_OPERATION_FAILED (#5978)|https://github.com/apache/seatunnel/commit/456cd17714|2.3.4| |[Feature] Add unsupported datatype check for all catalog (#5890)|https://github.com/apache/seatunnel/commit/b9791285a0|2.3.4| |[BUG][Connector-V2] Fixed conversion exception of elasticsearch array format (#5825)|https://github.com/apache/seatunnel/commit/64f19f25d9|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][Connector] Add field name to `DataTypeConvertor` to improve error message (#5782)|https://github.com/apache/seatunnel/commit/ab60790f0d|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Chore] Update the es version in the docs. (#4499)|https://github.com/apache/seatunnel/commit/415150635c|2.3.2| |[Improve][ElasticsearchSink]remove useless code. (#4500)|https://github.com/apache/seatunnel/commit/ef44c0d44a|2.3.2| |[Hotfix][Connector-V2][ES] Source deserializer error and inappropriate (#4233)|https://github.com/apache/seatunnel/commit/15530d2785|2.3.2| |[Feature][Connector-V2][ES] Support dsl filter (#4130)|https://github.com/apache/seatunnel/commit/79ca878338|2.3.1| |[Bug][Connector-V2][ES]Fix es field type not support binary(#4240) (#4274)|https://github.com/apache/seatunnel/commit/84f10f2016|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |Shade google common in hadoop (#4222)|https://github.com/apache/seatunnel/commit/5376905075|2.3.1| |Set es text type to string (#4192)|https://github.com/apache/seatunnel/commit/473971b94b|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |Support ES catalog get field mapping (#4167)|https://github.com/apache/seatunnel/commit/72f2418713|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Bug][Connector-V2][ES]Fix es source no data (#4076)|https://github.com/apache/seatunnel/commit/a573b8dbed|2.3.1| |Add convertor factory (#4119)|https://github.com/apache/seatunnel/commit/cbdea45d95|2.3.1| |Add ElasticSearch catalog (#4108)|https://github.com/apache/seatunnel/commit/9ee4d8394c|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][Elasticsearch] Support https protocol (#3997)|https://github.com/apache/seatunnel/commit/79b5cdd9c2|2.3.1| |[Feature][shade][Jackson] Add seatunnel-jackson module (#3947)|https://github.com/apache/seatunnel/commit/5d8862ec9c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[hotfix][connector-v2][elasticsearch] Fix bulk refresh operation not locked (#3738)|https://github.com/apache/seatunnel/commit/b6cab90d2f|2.3.0| |[feature][connector-v2][elasticsearch] Support write cdc changelog event in elasticsearch sink (#3673)|https://github.com/apache/seatunnel/commit/3ec47c6848|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][ElasticSearch] Unified exception for ElasticSearch source & sink connector (#3569)|https://github.com/apache/seatunnel/commit/b73944d1dc|2.3.0| |[Improve] [Connector-V2] Bad smell ToArrayCallWithZeroLengthArrayArgument: (#3577)|https://github.com/apache/seatunnel/commit/cc448d98c4|2.3.0| |[Improve][Connector-V2][ElasticSearch] Improve es bulk sink retriable mechanism (#3148)|https://github.com/apache/seatunnel/commit/02ef38eb7a|2.3.0| |[Connector-V2] [E2E] Add missed ElasticSearch E2E module. 
(#3338)|https://github.com/apache/seatunnel/commit/b2dad4d472|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Feature][Connector-V2][Elasticsearch] Support Elasticsearch source (#2821)|https://github.com/apache/seatunnel/commit/ded5481d98|2.3.0| |update (#3149)|https://github.com/apache/seatunnel/commit/59abe4ad62|2.3.0| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Connector-V2] [ElasticSearch] Fix ElasticSearch Connector V2 Bug (#2817)|https://github.com/apache/seatunnel/commit/2fcbbf464a|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Feature][Connector-V2] new connecotor of Elasticsearch sink(#2326) (#2330)|https://github.com/apache/seatunnel/commit/2a1fd5027f|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-email.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] email connector options (#8983)|https://github.com/apache/seatunnel/commit/7821e824dd|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][Transform] Rename sql transform table name from 'fake' to 'dual' (#8298)|https://github.com/apache/seatunnel/commit/e6169684fb|2.3.9| |[Feature][Core] Rename `result_table_name`/`source_table_name` to `plugin_input/plugin_output` (#8072)|https://github.com/apache/seatunnel/commit/c7bbd322db|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector-V2]Support multi-table sink feature for email (#7368)|https://github.com/apache/seatunnel/commit/c880b7aa4d|2.3.8| |[Improve][Common] Adapt `FILE_OPERATION_FAILED` to `CommonError` (#5928)|https://github.com/apache/seatunnel/commit/b3dc0bbc21|2.3.4| |[Feature][Engine] Unify job env parameters (#6003)|https://github.com/apache/seatunnel/commit/2410ab38f0|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Email] Unified exception for email connector (#3898)|https://github.com/apache/seatunnel/commit/829261e1a6|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Connector][Email] Add Email Sink Factory (#3326)|https://github.com/apache/seatunnel/commit/0645d11180|2.3.0| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Connector-V2] Add Email sink connector (#2304)|https://github.com/apache/seatunnel/commit/96f2a15e4d|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-fake.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Transform-V2] Support vector series sql function (#9765)|https://github.com/apache/seatunnel/commit/a40114cf7a|2.3.12| |[Feature][Connectors-v2] Support auto-increment id for FakeSource (#9505)|https://github.com/apache/seatunnel/commit/3a16b4a4b5|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[improve] fake source options (#8950)|https://github.com/apache/seatunnel/commit/f8c47fb5f4|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][API] Support timestamp with timezone offset (#8367)|https://github.com/apache/seatunnel/commit/e18bfeabd2|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Feature][Core] Rename `result_table_name`/`source_table_name` to `plugin_input/plugin_output` (#8072)|https://github.com/apache/seatunnel/commit/c7bbd322db|2.3.9| |[Improve][Fake] Improve memory usage when split size is large (#7821)|https://github.com/apache/seatunnel/commit/2d41b024c7|2.3.9| |[Improve][Connector-V2] Time supports default value (#7639)|https://github.com/apache/seatunnel/commit/33978689f5|2.3.8| |[Improve][Connector-V2] Fake supports column configuration (#7503)|https://github.com/apache/seatunnel/commit/39162a4e0b|2.3.8| |[Feature][Core] Add event notify for all connector (#7501)|https://github.com/apache/seatunnel/commit/d71337b0e9|2.3.8| |[Improve][Connector-V2] update vectorType (#7446)|https://github.com/apache/seatunnel/commit/1bba72385b|2.3.8| |[Feature][Connector-V2] Fake Source support produce vector data (#7401)|https://github.com/apache/seatunnel/commit/6937d10ac3|2.3.8| |[Feature][Kafka] 
Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Feature][Core] Support event listener for job (#6419)|https://github.com/apache/seatunnel/commit/831d0022eb|2.3.5| |[Fix][FakeSource] fix random from template not include the latest value issue (#6438)|https://github.com/apache/seatunnel/commit/6ec16ac46f|2.3.5| |[Improve][Catalog] Use default tablepath when can not get the tablepath from source config (#6276)|https://github.com/apache/seatunnel/commit/f8158bb805|2.3.4| |[Improve][Connector-V2] Replace CommonErrorCodeDeprecated.JSON_OPERATION_FAILED (#5978)|https://github.com/apache/seatunnel/commit/456cd17714|2.3.4| |FakeSource support generate different CatalogTable for MultipleTable (#5766)|https://github.com/apache/seatunnel/commit/a8b93805ea|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Add default implement for `SeaTunnelSource::getProducedType` (#5670)|https://github.com/apache/seatunnel/commit/a04add6991|2.3.4| |Support config tableIdentifier for schema (#5628)|https://github.com/apache/seatunnel/commit/652921fb75|2.3.4| |[Feature] Add `table-names` from FakeSource/Assert to produce/assert multi-table (#5604)|https://github.com/apache/seatunnel/commit/2c67cd8f3e|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[chore] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/291214ad6f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-fake] Optimizing Data Generation Strategies refer to #4004 (#4061)|https://github.com/apache/seatunnel/commit/c7c596a6dc|2.3.1| |[Improve][Connector-V2][Fake] Improve fake connector (#3932)|https://github.com/apache/seatunnel/commit/31f12431d9|2.3.1| |[Feature][Connector-v2][StarRocks] Support write cdc changelog event(INSERT/UPDATE/DELETE) (#3865)|https://github.com/apache/seatunnel/commit/8e3d158c03|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Fake] Unified exception for fake source connector (#3520)|https://github.com/apache/seatunnel/commit/f371ad5825|2.3.0| |[Connector-V2] [Fake] Add Fake TableSourceFactory (#3345)|https://github.com/apache/seatunnel/commit/74b61c33a0|2.3.0| |[Connector-V2] [ElasticSearch] Add 
ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Improve] [Engine] Improve Engine performance. (#3216)|https://github.com/apache/seatunnel/commit/7393c47327|2.3.0| |[hotfix][connector][fake] fix FakeSourceSplitEnumerator assigning duplicate splits when restoring (#3112)|https://github.com/apache/seatunnel/commit/98b1feda85|2.3.0-beta| |[improve][connector][fake] supports setting the number of split rows and reading interval (#3098)|https://github.com/apache/seatunnel/commit/efabe6af7f|2.3.0-beta| |[feature][connector][fake] Support mutil splits for fake source connector (#2974)|https://github.com/apache/seatunnel/commit/c28c44b7c9|2.3.0-beta| |[E2E][ST-Engine] Add test data consistency in 3 node cluster and fix bug (#3038)|https://github.com/apache/seatunnel/commit/97400a6f13|2.3.0-beta| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Improve][Connector-V2] Improve fake source connector (#2944)|https://github.com/apache/seatunnel/commit/044f62ef32|2.3.0-beta| |[Improve][Connector-v2-Fake]Supports direct definition of data values(row) (#2839)|https://github.com/apache/seatunnel/commit/b7d9dde6c8|2.3.0-beta| |[Connector-V2] [ElasticSearch] Fix ElasticSearch Connector V2 Bug (#2817)|https://github.com/apache/seatunnel/commit/2fcbbf464a|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Bug] [connector-fake] Fake date calculation error(#2573)|https://github.com/apache/seatunnel/commit/9ea01298f1|2.2.0-beta| |[Bug][ConsoleSinkV2]fix fieldToString StackOverflow and add Unit-Test (#2545)|https://github.com/apache/seatunnel/commit/6f87094569|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Imporve][Fake-Connector-V2]support 
user-defined-schmea and random data for fake-table (#2406)|https://github.com/apache/seatunnel/commit/a5447528c3|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-file-base-hadoop.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Fix][Connector-File] Fix parquet support user config schema (#9596)|https://github.com/apache/seatunnel/commit/2bdaeb6a07|2.3.12| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Fix] Set all snappy dependency use one version (#8423)|https://github.com/apache/seatunnel/commit/3ac977c8d3|2.3.9| |[Fix][Connector-V2][connector-file-base-hadoop] Fixed HdfsFile source load the krb5_path configuration (#7870)|https://github.com/apache/seatunnel/commit/cd9836bced|2.3.9| |[Improve][Connector-V2] Change File Read/WriteStrategy `setSeaTunnelRowTypeInfo` to `setCatalogTable` (#7829)|https://github.com/apache/seatunnel/commit/6b5f74e524|2.3.9| |[Feature][Connector-V2] Supports the transfer of any file (#6826)|https://github.com/apache/seatunnel/commit/c1401787b3|2.3.6| |[Improve][Connector-v2] The hive connector support multiple filesystem (#6648)|https://github.com/apache/seatunnel/commit/8a4c01fe35|2.3.6| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. 
(#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Feature][Connector-V2][File] Support read empty directory (#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. (#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Imprve][Connector-V2][Hive] Support read text table & Column projection (#4105)|https://github.com/apache/seatunnel/commit/717620f542|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2] Support kerberos in hive and hdfs file connector (#3840)|https://github.com/apache/seatunnel/commit/055ad9d836|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Improve][Connector-V2][HDFS] Support setting hdfs-site.xml (#3778)|https://github.com/apache/seatunnel/commit/c8d59ecac1|2.3.0| |[Improve][Connector-V2][File] Unified excetion for file source & sink connectors (#3525)|https://github.com/apache/seatunnel/commit/031e8e263c|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Improve][Connector-V2][File] Support parse field from file path (#2985)|https://github.com/apache/seatunnel/commit/0bc12085c2|2.3.0-beta| |unify `flatten-maven-plugin` version (#3078)|https://github.com/apache/seatunnel/commit/ed743fddcc|2.3.0-beta| |[Improve][connector][file] Support user-defined schema for reading text file (#2976)|https://github.com/apache/seatunnel/commit/1c05ee0d7e|2.3.0-beta| |[Improve][Connector] Improve write parquet (#2943)|https://github.com/apache/seatunnel/commit/8fd966394b|2.3.0-beta| |[Fix][Connector-V2] Fix HiveSource Connector read orc table error (#2845)|https://github.com/apache/seatunnel/commit/61720306e7|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-file-base.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Transform-V2] Support multimodal embeddings (#9673)|https://github.com/apache/seatunnel/commit/12414c4eab| dev | |[Improve][Connector-V2] File Source Support filtering files by last modified time. (#9526)|https://github.com/apache/seatunnel/commit/cde4c3d410|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Feature] [connector-file] Add configurable sheet_max_rows support for Excel sink connector (#9668)|https://github.com/apache/seatunnel/commit/ea5bc51067|2.3.12| |[Improve][Csv] support configurable CSV delimiter in file connector (#9660)|https://github.com/apache/seatunnel/commit/48fb7ef697|2.3.12| |[Fix][Connector-V2] Update file filter pattern compilation to remove unnecessary quoting (#9658)|https://github.com/apache/seatunnel/commit/b5c7b4ad0e|2.3.12| |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Fix][Connector-File] Fix parquet support user config schema (#9596)|https://github.com/apache/seatunnel/commit/2bdaeb6a07|2.3.12| |[Improve][Connector-file] Add configurable binary chunk size support to BinaryReadStrategy (#9391)|https://github.com/apache/seatunnel/commit/38e87e75a3|2.3.12| |[Feature][Sink] File support new format: maxwell_json,canal_json,debezium_json (#9278) (#9336)|https://github.com/apache/seatunnel/commit/a1bfbb20dd|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[Feature][connector-hive] hive sink connector support overwrite mode #7843 (#7891)|https://github.com/apache/seatunnel/commit/6fafe6f4d3|2.3.12| |[Fix][connector-file-base] fix parquet int32 convert error 
(#9142)|https://github.com/apache/seatunnel/commit/e6413c388e|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][API] Fixed not invoke the `SinkAggregatedCommitter`'s init method (#9070)|https://github.com/apache/seatunnel/commit/df0d11d632|2.3.11| |[Bugfix][Csv] Fix csv format delimiter (#9066)|https://github.com/apache/seatunnel/commit/ff5fc129b8|2.3.11| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Feature][File] Support extract CSV files with different columns in different order (#9064)|https://github.com/apache/seatunnel/commit/74db1cbaac|2.3.11| |[Improve] Refactor file enumerator to prevent duplicate put split (#8989)|https://github.com/apache/seatunnel/commit/fdf1beae9c|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Fix][File]use common-csv to read csv file (#8919)|https://github.com/apache/seatunnel/commit/3e64a42838|2.3.10| |[Improve][connector-file-base] Improved multiple table file source allocation algorithm for subtasks (#8878)|https://github.com/apache/seatunnel/commit/44a12cc55c|2.3.10| |[Fix][Connector-File] Fix conflicting `file_format_type` requirement (#8823)|https://github.com/apache/seatunnel/commit/6e0d630f7c|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve][Connector-V2] Improve orc read error message (#8751)|https://github.com/apache/seatunnel/commit/d66d9dc9ce|2.3.10| |[Improve] restruct connector 
common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Fix][Connector-V2] User selects csv string pattern (#8572)|https://github.com/apache/seatunnel/commit/227a11f5aa|2.3.10| |[Fix][Connector-V2] Fix CSV String type write type (#8499)|https://github.com/apache/seatunnel/commit/9268f5a255|2.3.10| |[Fix][File] Fix Multi-file with binary format synchronization failed (#8546)|https://github.com/apache/seatunnel/commit/6e4ee468a5|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Improve][Connector-file-base] Improved file allocation algorithm for subtasks. (#8453)|https://github.com/apache/seatunnel/commit/d61cba233e|2.3.9| |[Bug] [connector-file] When the data source field is less than the target (Hive) field,it will throw null pointer exception#8150 (#8200)|https://github.com/apache/seatunnel/commit/25b8a02b76|2.3.9| |[Fix] Set all snappy dependency use one version (#8423)|https://github.com/apache/seatunnel/commit/3ac977c8d3|2.3.9| |[Improve][Connector][Hive] skip temporary hidden directories (#8402)|https://github.com/apache/seatunnel/commit/9fdedc487e|2.3.9| |[Feature][Connector-V2] Support use EasyExcel as read excel engine (#8064)|https://github.com/apache/seatunnel/commit/b8e1177fcb|2.3.9| |[BugFix][Excel] Fix read formulas/number cell value of excel (#8316)|https://github.com/apache/seatunnel/commit/00c5aed1af|2.3.9| |[Improve][Transform] gz support excel (#8181)|https://github.com/apache/seatunnel/commit/c3ae726ee0|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Improve][Excel] Support read blank string & auto 
type-cast (#8111)|https://github.com/apache/seatunnel/commit/3a54f1253f|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Feature][Connectors] LocalFile Support reading gz (#8025)|https://github.com/apache/seatunnel/commit/337aa50f08|2.3.9| |[Fix][Connector-V2] Fix file binary format sync convert directory to file (#7942)|https://github.com/apache/seatunnel/commit/86ae9272c4|2.3.9| |[Improve][Connector-V2] Change File Read/WriteStrategy `setSeaTunnelRowTypeInfo` to `setCatalogTable` (#7829)|https://github.com/apache/seatunnel/commit/6b5f74e524|2.3.9| |[Bug] [connectors-v2] The Hadoop Source/Sink fails with Unable to find valid Kerberos Ticket. (#7809)|https://github.com/apache/seatunnel/commit/a8bdea24cc|2.3.9| |[Fix][Connector-V2] Fix When reading Excel data, string and date type conversion errors (#7796)|https://github.com/apache/seatunnel/commit/749b2fe364|2.3.9| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Improve] Refactor S3FileCatalog and it's factory (#7457)|https://github.com/apache/seatunnel/commit/d928e8b113|2.3.8| |[Feature][Connector-V2][Iceberg] Support Iceberg Kerberos (#7246)|https://github.com/apache/seatunnel/commit/e3001207c8|2.3.8| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[feature][connector-file-local] add save mode function for localfile (#7080)|https://github.com/apache/seatunnel/commit/7b2f538310|2.3.6| |[Hotfix][Hive Connector] Fix Hive hdfs-site.xml and hive-site.xml not be load error (#7069)|https://github.com/apache/seatunnel/commit/c23a577f34|2.3.6| |[Feature][Connector-V2] Add Huawei Cloud OBS connector (#4578)|https://github.com/apache/seatunnel/commit/d266f4db64|2.3.6| |[Improve][File Connector]Improve xml read code & fix can not use true for a 
boolean option (#6930)|https://github.com/apache/seatunnel/commit/c13a563994|2.3.6| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |[Feature][Connector-V2] Supports the transfer of any file (#6826)|https://github.com/apache/seatunnel/commit/c1401787b3|2.3.6| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Improve][Connector-v2] The hive connector support multiple filesystem (#6648)|https://github.com/apache/seatunnel/commit/8a4c01fe35|2.3.6| |[Improve] Improve read with parquet type convert error (#6683)|https://github.com/apache/seatunnel/commit/6c65805699|2.3.5| |[Hotfix] fix http source can not read yyyy-MM-dd HH:mm:ss format bug & Improve DateTime Utils (#6601)|https://github.com/apache/seatunnel/commit/19888e7969|2.3.5| |[Bug] Fix OrcWriteStrategy/ParquetWriteStrategy doesn't login with kerberos (#6472)|https://github.com/apache/seatunnel/commit/24441c876d|2.3.5| |[Bug] [formats] Fix fail to parse line when content contains the file delimiter (#6589)|https://github.com/apache/seatunnel/commit/17e29185fa|2.3.5| |[Improve][Connector-V2] Support read orc with schema config to cast type (#6531)|https://github.com/apache/seatunnel/commit/d1599f8ad9|2.3.5| |[Chore] Fix `file` spell errors (#6606)|https://github.com/apache/seatunnel/commit/2599d3b736|2.3.5| |[Feature][Connectors-V2][File]support assign encoding for file source/sink (#6489)|https://github.com/apache/seatunnel/commit/d159fbe086|2.3.5| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. 
(#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[Test][E2E] Add thread leak check for connector (#5773)|https://github.com/apache/seatunnel/commit/1f2f3fc5f0|2.3.4| |Fix HiveMetaStoreProxy#enableKerberos will return true if doesn't enable kerberos (#6307)|https://github.com/apache/seatunnel/commit/1dad6f7061|2.3.4| |[Feature][Connector]add s3file save mode function (#6131)|https://github.com/apache/seatunnel/commit/81c51073bf|2.3.4| |[bugfix][file-execl] Fix the Issue of Abnormal Data Reading from Excel Files (#5932)|https://github.com/apache/seatunnel/commit/6a2b05a845|2.3.4| |[Feature][Connectors-v2-file-ftp] FTP source/sink add ftp connection mode (#6077) (#6099)|https://github.com/apache/seatunnel/commit/f6bcc4d59d|2.3.4| |Disable HDFSFileSystem cache (#6039)|https://github.com/apache/seatunnel/commit/135c91818e|2.3.4| |[Feature][OssFile Connector] Make Oss implement source factory and sink factory (#6062)|https://github.com/apache/seatunnel/commit/1a8e9b4554|2.3.4| |[Improve][Common] Adapt `FILE_OPERATION_FAILED` to `CommonError` (#5928)|https://github.com/apache/seatunnel/commit/b3dc0bbc21|2.3.4| |[Feature][Connector-V2] Support read .xls excel file (#6066)|https://github.com/apache/seatunnel/commit/43787a3dde|2.3.4| |Add multiple table file sink to base (#6049)|https://github.com/apache/seatunnel/commit/085e0e5fc3|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Feature][Core] Upgrade flink source translation (#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[Improve][File] Clean memory buffer of `JsonWriteStrategy` & `ExcelWriteStrategy` (#5925)|https://github.com/apache/seatunnel/commit/7297a4c95c|2.3.4| |[Bug][Connector][FileBase]Parquet reader parsing array type exception. 
(#4457)|https://github.com/apache/seatunnel/commit/5c6b11329c|2.3.4| |[Improve]Change System.out.println to log output. (#5912)|https://github.com/apache/seatunnel/commit/bbedb07a9c|2.3.4| |[Feature] LocalFileSource support multiple table|https://github.com/apache/seatunnel/commit/72be6663ad|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Improve][LocalFile] parquet use system timezone (#5605)|https://github.com/apache/seatunnel/commit/b3e13513ac|2.3.4| |[Bugfix][Connector-v2] fix file sink `isPartitionFieldWriteInFile` occurred exception when no columns are given (#5508)|https://github.com/apache/seatunnel/commit/9fb5499295|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |[Feature][Connector-V2][File] Support read empty directory (#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |[Hotfix][File-Connector] Fix WriteStrategy parallel writing thread unsafe issue (#5546)|https://github.com/apache/seatunnel/commit/1177d02d55|2.3.4| |[Feature] [File Connector] Supports writing column names when the output type is file (CSV) (#5459)|https://github.com/apache/seatunnel/commit/f73b37291e|2.3.4| |Revert "[fix][hive-source][bug] fix An error occurred reading an empty directory (#5427)" (#5487)|https://github.com/apache/seatunnel/commit/093901068e|2.3.4| |[fix][hive-source][bug] fix An error occurred reading an empty directory (#5427)|https://github.com/apache/seatunnel/commit/de7b86a5dd|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. (#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| |[bugfix] [File Base] Fix Hadoop Kerberos authentication related issues. (#5171)|https://github.com/apache/seatunnel/commit/2a85525f4c|2.3.3| |[Feature][Connector-V2][File] Add cos source&sink (#4979)|https://github.com/apache/seatunnel/commit/1f94676436|2.3.3| |[Improve][Connector[File] Optimize files commit order (#5045)|https://github.com/apache/seatunnel/commit/1e18a8c530|2.3.3| |[Feature][E2E][FtpFile] add ftp file e2e test case (#4647)|https://github.com/apache/seatunnel/commit/b1b1f5e7e0|2.3.3| |[Bugfix] [Connector-V2] [File] Fix read temp file (#4876)|https://github.com/apache/seatunnel/commit/5e03d22d6c|2.3.2| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[chore] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/291214ad6f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Imprve][Connector-V2][Hive] Support read text table & Column projection (#4105)|https://github.com/apache/seatunnel/commit/717620f542|2.3.1| |[Improve][build] Give the maven module a human readable name 
(#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Hive] Support assign partitions (#3842)|https://github.com/apache/seatunnel/commit/6a4a850b4c|2.3.1| |[Bug][Connectors] Text And Json WriteStrategy lost the sinkColumnsIndexInRow (#3863)|https://github.com/apache/seatunnel/commit/7b5f6f1bc2|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector-V2][File] Allow the user to set the row delimiter as an empty string (#3854)|https://github.com/apache/seatunnel/commit/84508fcb65|2.3.1| |[Feature][Connector-V2] Support kerberos in hive and hdfs file connector (#3840)|https://github.com/apache/seatunnel/commit/055ad9d836|2.3.1| |[Feature][Connector-V2][File] Support skip number when reading text csv files (#3900)|https://github.com/apache/seatunnel/commit/243b6a6b23|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Improve][Connector-V2][File] File Connector add lzo compression way. 
(#3782)|https://github.com/apache/seatunnel/commit/8875d02589|2.3.1| |[Improve][Connector-V2] The log outputs detailed exception stack information (#3805)|https://github.com/apache/seatunnel/commit/d0c6217f27|2.3.1| |fix file source connector option rule bug (#3804)|https://github.com/apache/seatunnel/commit/cab42f6eb1|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Improve][Connector-V2][HDFS] Support setting hdfs-site.xml (#3778)|https://github.com/apache/seatunnel/commit/c8d59ecac1|2.3.0| |[Feature][Connector-V2][File] Optimize filesystem utils (#3749)|https://github.com/apache/seatunnel/commit/ac4e880fb5|2.3.0| |[Improve] [Connector-V2] Fix Kafka sink can't run EXACTLY_ONCE semantics (#3724)|https://github.com/apache/seatunnel/commit/5e3f196e29|2.3.0| |[Connector-V2] [File] Fix bug data file name will duplicate when use SeaTunnel Engine (#3717)|https://github.com/apache/seatunnel/commit/c96c53004f|2.3.0| |[Hotfix][Connector-V2][File] Fix file sink connector npe (#3706)|https://github.com/apache/seatunnel/commit/a662a88fdc|2.3.0| |[Feature][Connector-V2][Oss jindo] Add oss jindo source & sink connector (#3456)|https://github.com/apache/seatunnel/commit/2507372311|2.3.0| |[Improve][Connector-V2][File] Support split file based on batch size (#3625)|https://github.com/apache/seatunnel/commit/f39e3a531d|2.3.0| |[Improve][Connector-V2][File] Unified excetion for file source & sink connectors (#3525)|https://github.com/apache/seatunnel/commit/031e8e263c|2.3.0| |[Hotfix][Connector-V2][Hive] Fix npe of getting file system (#3506)|https://github.com/apache/seatunnel/commit/e1fc3d1b01|2.3.0| |[Improve][core-v1][seatunnel-core-base] remove seatunnel-core-base (#3480)|https://github.com/apache/seatunnel/commit/d6e6a02a36|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector-V2][File] 
Improve code structure (#3238)|https://github.com/apache/seatunnel/commit/dd5c353881|2.3.0| |[Hotfix][Connector-V2][Hive] Fix the bug that when write data to hive throws NullPointerException (#3258)|https://github.com/apache/seatunnel/commit/777bf6b42e|2.3.0| |[Bug]add 3node worker done test and fix some bug (#3115)|https://github.com/apache/seatunnel/commit/bc852a4dff|2.3.0| |[Feature][Connector-V2][SFTP] Add SFTP file source & sink connector (#3006)|https://github.com/apache/seatunnel/commit/9e496383b8|2.3.0| |[Feature][Connector-V2][S3] Add S3 file source & sink connector (#3119)|https://github.com/apache/seatunnel/commit/f27d68ca9c|2.3.0-beta| |[Feature][Connector-V2][File] Fix filesystem get error (#3117)|https://github.com/apache/seatunnel/commit/7404c180de|2.3.0-beta| |[Improve][Connector-v2][file] Reuse array type container when read row data (#3123)|https://github.com/apache/seatunnel/commit/da0646ac6d|2.3.0-beta| |[Hotfix][Connector-V2][File] Fix ParquetReadStrategy get NPE (#3122)|https://github.com/apache/seatunnel/commit/ba99de08c8|2.3.0-beta| |[hotfix][engine] Add master node switch test and fix bug (#3082)|https://github.com/apache/seatunnel/commit/608be51bc4|2.3.0-beta| |[Improve][Connector-V2][File] Support parse field from file path (#2985)|https://github.com/apache/seatunnel/commit/0bc12085c2|2.3.0-beta| |[hotfix][connector][file] Solved the bug of can not parse '\t' as delimiter from config file (#3083)|https://github.com/apache/seatunnel/commit/bfde596754|2.3.0-beta| |unify `flatten-maven-plugin` version (#3078)|https://github.com/apache/seatunnel/commit/ed743fddcc|2.3.0-beta| |[Improve][Connector-V2] Improve text write (#2971)|https://github.com/apache/seatunnel/commit/0ecd7906c2|2.3.0-beta| |[Improve][connector][file] Support user-defined schema for reading text file (#2976)|https://github.com/apache/seatunnel/commit/1c05ee0d7e|2.3.0-beta| |[Bug][Connector-V2][File] Fix the bug of incorrect path in windows environment 
(#2980)|https://github.com/apache/seatunnel/commit/2e16161865|2.3.0-beta| |[Improve][Connector] Improve write parquet (#2943)|https://github.com/apache/seatunnel/commit/8fd966394b|2.3.0-beta| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Bug][connector-file-base] Fix source split assigning reader to negative number (#2921)|https://github.com/apache/seatunnel/commit/0b5a2852fb|2.3.0-beta| |[Improve][Connector-V2] Improve orc write strategy to support all data types (#2860)|https://github.com/apache/seatunnel/commit/4d048cc23e|2.3.0-beta| |[Fix] [Connector-V2-File] Fix file connector bug (#2858)|https://github.com/apache/seatunnel/commit/e0459bbab6|2.2.0-beta| |[Fix][Connector-V2] Fix HiveSource Connector read orc table error (#2845)|https://github.com/apache/seatunnel/commit/61720306e7|2.2.0-beta| |[Improve][Connector-V2] Improve read parquet (#2841)|https://github.com/apache/seatunnel/commit/e19bc82f9b|2.2.0-beta| |[Imporve][Connector-V2] Refactor ftp sink & Add ftp file source (#2774)|https://github.com/apache/seatunnel/commit/4aacbcdd1f|2.2.0-beta| |[Bug] [Connector-V2] Fix hive source connector parallelism not work (#2823)|https://github.com/apache/seatunnel/commit/9f21d4c769|2.2.0-beta| |[Improve][Connector-V2] Imporve orc read strategy (#2747)|https://github.com/apache/seatunnel/commit/af34beda37|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Improve][build] Improved scope of maven-shade-plugin (#2665)|https://github.com/apache/seatunnel/commit/93bc8bd116|2.2.0-beta| |[Feature][Connector-V2] Add oss sink (#2629)|https://github.com/apache/seatunnel/commit/bb2ad40487|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Improve][Connector-V2] Refactor the structure of file sink to reduce redundant codes 
(#2555)|https://github.com/apache/seatunnel/commit/6315092930|2.2.0-beta| |[Feature][Connector-V2] Add oss source connector (#2467)|https://github.com/apache/seatunnel/commit/712b77744e|2.2.0-beta| |[Feature][File connector] Support ftp file sink (#2483)|https://github.com/apache/seatunnel/commit/a87e5de80a|2.2.0-beta| |[Feature][Connector-V2] Add hdfs file json support (#2451)|https://github.com/apache/seatunnel/commit/84f6b17c15|2.2.0-beta| |[Feature][Connector-V2] Add base source connector code for connector-file-base (#2399)|https://github.com/apache/seatunnel/commit/1829ddc662|2.2.0-beta| |[Improve][Connector-V2] Refactor the package of local file connector (#2403)|https://github.com/apache/seatunnel/commit/a538daed5c|2.2.0-beta| |[Feature][Connector-V2] Add json file sink & json format (#2385)|https://github.com/apache/seatunnel/commit/dd68c06b0a|2.2.0-beta| |[Bug][Connector-V2] Fix the bug that file connector release resources multi times (#2379)|https://github.com/apache/seatunnel/commit/58c64aab2a|2.2.0-beta| |[Improve][Connector-V2] Optimize the code structure (#2380)|https://github.com/apache/seatunnel/commit/7376ec7ab1|2.2.0-beta| |[Feature][Connector-V2] Support orc file format in file connector (#2369)|https://github.com/apache/seatunnel/commit/f44fe1e033|2.2.0-beta| |[improve][UT] Upgrade junit to 5.+ (#2305)|https://github.com/apache/seatunnel/commit/362319ff3e|2.2.0-beta| |Replace plain string with constants (#2308)|https://github.com/apache/seatunnel/commit/3c0415e56e|2.2.0-beta| |[Connector-V2] Add parquet writer in file connector (#2273)|https://github.com/apache/seatunnel/commit/c95cc72cfa|2.2.0-beta| |[Connector-V2] Add Hive sink connector v2 (#2158)|https://github.com/apache/seatunnel/commit/23ad4ee735|2.2.0-beta| |[Connector-V2] Add File Sink Connector (#2117)|https://github.com/apache/seatunnel/commit/e2283da64f|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-file-cos.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Improve][Connector-V2] Change File Read/WriteStrategy `setSeaTunnelRowTypeInfo` to `setCatalogTable` (#7829)|https://github.com/apache/seatunnel/commit/6b5f74e524|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector-V2] Support read archive 
compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |[Feature][Connector-V2] Supports the transfer of any file (#6826)|https://github.com/apache/seatunnel/commit/c1401787b3|2.3.6| |[Feature][Tool] Add connector check script for issue 6199 (#6635)|https://github.com/apache/seatunnel/commit/65aedf6a79|2.3.5| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. (#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |[Feature][Connector-V2][File] Support read empty directory (#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 (#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. 
(#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| |[Feature][Connector-V2][File] Add cos source&sink (#4979)|https://github.com/apache/seatunnel/commit/1f94676436|2.3.3|
================================================ FILE: docs/en/connectors/changelog/connector-file-ftp.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[Improve][Connector-V2] Add remote host verification option for FTP data channels (#9324)|https://github.com/apache/seatunnel/commit/019d69d10a|2.3.11| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Improve][Connector-V2] Ensure that the FTP connector behaves reliably during directory operation (#8959)|https://github.com/apache/seatunnel/commit/b5f0b43fcb|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Improve][Connector-V2] Add some debug log when create dir in (S)FTP (#8286)|https://github.com/apache/seatunnel/commit/8687bb8e91|2.3.9| |[Feature][File] Support config null format 
for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Fix][Connector-V2][FTP] Fix FTP connector connection_mode is not effective (#7865)|https://github.com/apache/seatunnel/commit/26c528a5ed|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Connector-V2]Ftp file source support multiple table (#7795)|https://github.com/apache/seatunnel/commit/22fe27a3d6|2.3.9| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Feature][Connector-V2] Ftp file sink suport multiple table and save mode (#7665)|https://github.com/apache/seatunnel/commit/4f812e12ae|2.3.8| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |[Feature][Connector-V2] Supports the transfer of any file (#6826)|https://github.com/apache/seatunnel/commit/c1401787b3|2.3.6| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. 
(#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[Feature][Connectors-v2-file-ftp] FTP source/sink add ftp connection mode (#6077) (#6099)|https://github.com/apache/seatunnel/commit/f6bcc4d59d|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |[Feature][Connector-V2][File] Support read empty directory (#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 (#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. 
(#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][File] Unified excetion for file source & sink connectors (#3525)|https://github.com/apache/seatunnel/commit/031e8e263c|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector-V2][File] Improve code structure (#3238)|https://github.com/apache/seatunnel/commit/dd5c353881|2.3.0| |[Connector-V2] [ElasticSearch] Add 
ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Core] [Improve] Fix some sonar check error (#3240)|https://github.com/apache/seatunnel/commit/8664bb53a5|2.3.0| |[Improve][Connector-V2][File] Support parse field from file path (#2985)|https://github.com/apache/seatunnel/commit/0bc12085c2|2.3.0-beta| |[Improve][connector][file] Support user-defined schema for reading text file (#2976)|https://github.com/apache/seatunnel/commit/1c05ee0d7e|2.3.0-beta| |[Improve][Connector] Improve write parquet (#2943)|https://github.com/apache/seatunnel/commit/8fd966394b|2.3.0-beta| |[Fix][Connector-V2] Fix HiveSource Connector read orc table error (#2845)|https://github.com/apache/seatunnel/commit/61720306e7|2.2.0-beta| |[Improve][Connector-V2] Improve read parquet (#2841)|https://github.com/apache/seatunnel/commit/e19bc82f9b|2.2.0-beta| |[Imporve][Connector-V2] Refactor ftp sink & Add ftp file source (#2774)|https://github.com/apache/seatunnel/commit/4aacbcdd1f|2.2.0-beta| |[Feature][File connector] Support ftp file sink (#2483)|https://github.com/apache/seatunnel/commit/a87e5de80a|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-file-hadoop.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Feature][Connector-V2] Support hdfs file multi table source read (#9816)|https://github.com/apache/seatunnel/commit/672af255ef| dev | |[Feature][Connector-File-Hadoop]Support multi table sink feature for HdfsFile (#9651)|https://github.com/apache/seatunnel/commit/bb4f743c05|2.3.12| |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes 
(#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. (#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 (#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. 
(#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector] Improve write parquet (#2943)|https://github.com/apache/seatunnel/commit/8fd966394b|2.3.0-beta| |[Fix][Connector-V2] Fix HiveSource Connector read orc table error (#2845)|https://github.com/apache/seatunnel/commit/61720306e7|2.2.0-beta| |[Improve][Connector-V2] Improve read parquet 
(#2841)|https://github.com/apache/seatunnel/commit/e19bc82f9b|2.2.0-beta| |[Improve][Connector-V2] Refactor hdfs file sink connector code structure (#2701)|https://github.com/apache/seatunnel/commit/6129c02567|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Feature][Connector-V2] Add hdfs file json support (#2451)|https://github.com/apache/seatunnel/commit/84f6b17c15|2.2.0-beta| |[Improve][Connector-V2] Refactor the package of hdfs file connector (#2402)|https://github.com/apache/seatunnel/commit/87d0624c5b|2.2.0-beta| |[Feature][Connector-V2] Add hdfs file source connector (#2420)|https://github.com/apache/seatunnel/commit/4fb6f2a216|2.2.0-beta| |[Feature][Connector-V2] Add json file sink & json format (#2385)|https://github.com/apache/seatunnel/commit/dd68c06b0a|2.2.0-beta| |[Imporve][Connector-V2] Remove redundant type judge logic because of pr #2315 (#2370)|https://github.com/apache/seatunnel/commit/42e8c25e50|2.2.0-beta| |[Feature][Connector-V2] Support orc file format in file connector (#2369)|https://github.com/apache/seatunnel/commit/f44fe1e033|2.2.0-beta| |[improve][UT] Upgrade junit to 5.+ (#2305)|https://github.com/apache/seatunnel/commit/362319ff3e|2.2.0-beta| |[Connector-V2] Add parquet writer in file connector (#2273)|https://github.com/apache/seatunnel/commit/c95cc72cfa|2.2.0-beta| |[checkstyle] Improved validation scope of MagicNumber (#2194)|https://github.com/apache/seatunnel/commit/6d08b5f369|2.2.0-beta| |[Connector-V2] Add Hive sink connector v2 (#2158)|https://github.com/apache/seatunnel/commit/23ad4ee735|2.2.0-beta| |[Connector-V2] Add File Sink Connector (#2117)|https://github.com/apache/seatunnel/commit/e2283da64f|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-file-jindo-oss.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Improve][Connector-V2] Change File Read/WriteStrategy `setSeaTunnelRowTypeInfo` to `setCatalogTable` (#7829)|https://github.com/apache/seatunnel/commit/6b5f74e524|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| 
|[Feature][Connector-V2] Supports the transfer of any file (#6826)|https://github.com/apache/seatunnel/commit/c1401787b3|2.3.6| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. (#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |[Feature][Connector-V2][File] Support read empty directory (#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 (#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Bugfix][jindo] Remove useless code (#5540)|https://github.com/apache/seatunnel/commit/b889618379|2.3.4| |[bugfix][CI]remove jindo dependencies|https://github.com/apache/seatunnel/commit/38e1e30e20|2.3.4| |[Feature][Connector-V2][Oss jindo] Fix the problem of jindo driver download failure. 
(#5511)|https://github.com/apache/seatunnel/commit/a14d9c0d08|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. (#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| |[Improve][Connector-V2][OSS-Jindo] Optimize jindo oss connector (#4964)|https://github.com/apache/seatunnel/commit/5fbfd05061|2.3.3|
================================================ FILE: docs/en/connectors/changelog/connector-file-local.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][Connector-V2] File Source Support filtering files by last modified time. (#9526)|https://github.com/apache/seatunnel/commit/cde4c3d410|2.3.12| |[Feature][Format] Improve maxwell_json,canal_json,debezium_json format add ts_ms and table (#9701)|https://github.com/apache/seatunnel/commit/fb8444b946|2.3.12| |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Feature][Sink] File support new format: maxwell_json,canal_json,debezium_json (#9278) (#9336)|https://github.com/apache/seatunnel/commit/a1bfbb20dd|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve] Refactor file enumerator to prevent duplicate put split (#8989)|https://github.com/apache/seatunnel/commit/fdf1beae9c|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] 
Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[feature][connector-file-local] add save mode function for localfile (#7080)|https://github.com/apache/seatunnel/commit/7b2f538310|2.3.6| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |[Chore] Fix `file` spell errors (#6606)|https://github.com/apache/seatunnel/commit/2599d3b736|2.3.5| |[Feature][Connectors-V2][File]support assign encoding for file source/sink (#6489)|https://github.com/apache/seatunnel/commit/d159fbe086|2.3.5| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. 
(#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[Feature][OssFile Connector] Make Oss implement source factory and sink factory (#6062)|https://github.com/apache/seatunnel/commit/1a8e9b4554|2.3.4| |Add multiple table file sink to base (#6049)|https://github.com/apache/seatunnel/commit/085e0e5fc3|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Feature] LocalFile sink support multiple table (#5931)|https://github.com/apache/seatunnel/commit/0fdf45f94d|2.3.4| |[Feature] LocalFileSource support multiple table|https://github.com/apache/seatunnel/commit/72be6663ad|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Feature][Connector-V2][File] Support read empty directory (#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 (#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. 
(#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][File] Unified excetion for file source & sink connectors (#3525)|https://github.com/apache/seatunnel/commit/031e8e263c|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector-V2][File] Improve code structure (#3238)|https://github.com/apache/seatunnel/commit/dd5c353881|2.3.0| |[Connector-V2] [ElasticSearch] Add 
ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Improve][Connector-V2][File] Support parse field from file path (#2985)|https://github.com/apache/seatunnel/commit/0bc12085c2|2.3.0-beta| |[Improve][connector][file] Support user-defined schema for reading text file (#2976)|https://github.com/apache/seatunnel/commit/1c05ee0d7e|2.3.0-beta| |[Improve][Connector] Improve write parquet (#2943)|https://github.com/apache/seatunnel/commit/8fd966394b|2.3.0-beta| |[Fix][Connector-V2] Fix HiveSource Connector read orc table error (#2845)|https://github.com/apache/seatunnel/commit/61720306e7|2.2.0-beta| |[Improve][Connector-V2] Improve read parquet (#2841)|https://github.com/apache/seatunnel/commit/e19bc82f9b|2.2.0-beta| |[Bug][Connector-V2] Fix error option (#2775)|https://github.com/apache/seatunnel/commit/488e561eef|2.2.0-beta| |[Improve][Connector-V2] Refactor local file sink connector code structure (#2655)|https://github.com/apache/seatunnel/commit/6befd599a1|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Feature][Connector-V2] Local file json support (#2465)|https://github.com/apache/seatunnel/commit/65a92f2496|2.2.0-beta| |[Feature][Connector-V2] Add local file connector source (#2419)|https://github.com/apache/seatunnel/commit/eff595c452|2.2.0-beta| |[Improve][Connector-V2] Refactor the package of local file connector (#2403)|https://github.com/apache/seatunnel/commit/a538daed5c|2.2.0-beta| |[Feature][Connector-V2] Add json file sink & json format (#2385)|https://github.com/apache/seatunnel/commit/dd68c06b0a|2.2.0-beta| |[Imporve][Connector-V2] Remove redundant type judge logic because of pr #2315 (#2370)|https://github.com/apache/seatunnel/commit/42e8c25e50|2.2.0-beta| |[Feature][Connector-V2] 
Support orc file format in file connector (#2369)|https://github.com/apache/seatunnel/commit/f44fe1e033|2.2.0-beta| |[improve][UT] Upgrade junit to 5.+ (#2305)|https://github.com/apache/seatunnel/commit/362319ff3e|2.2.0-beta| |[Connector-V2] Add parquet writer in file connector (#2273)|https://github.com/apache/seatunnel/commit/c95cc72cfa|2.2.0-beta| |[checkstyle] Improved validation scope of MagicNumber (#2194)|https://github.com/apache/seatunnel/commit/6d08b5f369|2.2.0-beta| |[Connector-V2] Add Hive sink connector v2 (#2158)|https://github.com/apache/seatunnel/commit/23ad4ee735|2.2.0-beta| |[Connector-V2] Add File Sink Connector (#2117)|https://github.com/apache/seatunnel/commit/e2283da64f|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-file-obs.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Improve][Connector-V2] Change File Read/WriteStrategy `setSeaTunnelRowTypeInfo` to `setCatalogTable` (#7829)|https://github.com/apache/seatunnel/commit/6b5f74e524|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Connector-V2] Add Huawei Cloud OBS connector (#4578)|https://github.com/apache/seatunnel/commit/d266f4db64|2.3.6|
================================================ FILE: docs/en/connectors/changelog/connector-file-oss-jindo.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][Connector-V2][OSS-Jindo] Optimize jindo oss connector (#4964)|https://github.com/apache/seatunnel/commit/5fbfd05061|2.3.3| |[Fix][Connector-V2] Fix file-oss config check bug and amend file-oss-jindo factoryIdentifier (#4581)|https://github.com/apache/seatunnel/commit/5c4f17df20|2.3.2| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Hotfix][OssFile Connector]fix ossfile bug (#3684)|https://github.com/apache/seatunnel/commit/ba6259274d|2.3.0| |[Feature][Connector-V2][Oss jindo] Add oss jindo source & sink connector (#3456)|https://github.com/apache/seatunnel/commit/2507372311|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-file-oss.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[Doc][Connector-V2] Update save mode config for OssFileSink (#9303)|https://github.com/apache/seatunnel/commit/40097d7f3e|2.3.11| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes 
(#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Improve] Added OSSFileCatalog and it's factory (#7458)|https://github.com/apache/seatunnel/commit/9006a205db|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |[Chore] Fix `file` spell errors (#6606)|https://github.com/apache/seatunnel/commit/2599d3b736|2.3.5| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. 
(#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[Feature][OssFile Connector] Make Oss implement source factory and sink factory (#6062)|https://github.com/apache/seatunnel/commit/1a8e9b4554|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |[Hotfix][Oss File Connector] fix oss connector can not run bug (#6010)|https://github.com/apache/seatunnel/commit/755bc2a730|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |[Feature][Connector-V2][File] Support read empty directory (#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 (#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. 
(#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| |[Fix][Connector-V2] Fix file-oss config check bug and amend file-oss-jindo factoryIdentifier (#4581)|https://github.com/apache/seatunnel/commit/5c4f17df20|2.3.2| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][File] Unified excetion for file source & sink connectors (#3525)|https://github.com/apache/seatunnel/commit/031e8e263c|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector-V2][File] Improve code structure 
(#3238)|https://github.com/apache/seatunnel/commit/dd5c353881|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Improve][Connector-V2][File] Support parse field from file path (#2985)|https://github.com/apache/seatunnel/commit/0bc12085c2|2.3.0-beta| |[Improve][connector][file] Support user-defined schema for reading text file (#2976)|https://github.com/apache/seatunnel/commit/1c05ee0d7e|2.3.0-beta| |[Improve][Connector] Improve write parquet (#2943)|https://github.com/apache/seatunnel/commit/8fd966394b|2.3.0-beta| |[Fix][Connector-V2] Fix HiveSource Connector read orc table error (#2845)|https://github.com/apache/seatunnel/commit/61720306e7|2.2.0-beta| |[Improve][Connector-V2] Improve read parquet (#2841)|https://github.com/apache/seatunnel/commit/e19bc82f9b|2.2.0-beta| |[Feature][Connector-V2] Add oss sink (#2629)|https://github.com/apache/seatunnel/commit/bb2ad40487|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Feature][Connector-V2] Add oss source connector (#2467)|https://github.com/apache/seatunnel/commit/712b77744e|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-file-s3.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Fix][Connector-V2] Fixed incorrectly setting s3 key in some cases (#8885)|https://github.com/apache/seatunnel/commit/cf4bab5be2|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| | [improve] update S3File connector config option (#8615)|https://github.com/apache/seatunnel/commit/80cc9fa6ff|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Hotfix][Zeta] Fix the dependency conflict between the guava in hadoop-aws and hive-exec 
(#7986)|https://github.com/apache/seatunnel/commit/a7837f1f19|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Improve] Refactor S3FileCatalog and it's factory (#7457)|https://github.com/apache/seatunnel/commit/d928e8b113|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |[Feature][S3 File] Make S3 File Connector support multiple table write (#6698)|https://github.com/apache/seatunnel/commit/8f2049b2f1|2.3.6| |[Improve][Connector-v2] The hive connector support multiple filesystem (#6648)|https://github.com/apache/seatunnel/commit/8a4c01fe35|2.3.6| |[bigfix][S3 File]:Change the [SCHEMA] attribute of the [S3CONF class] to be non-static to avoid being reassigned after deserialization (#6717)|https://github.com/apache/seatunnel/commit/79bb70101a|2.3.6| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. 
(#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[Test][E2E] Add thread leak check for connector (#5773)|https://github.com/apache/seatunnel/commit/1f2f3fc5f0|2.3.4| |[Feature][Connector]add s3file save mode function (#6131)|https://github.com/apache/seatunnel/commit/81c51073bf|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |[Feature][Connector-V2][File] Support read empty directory (#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 (#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. 
(#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| |[chore] delete unavailable S3 & Kafka Catalogs (#4477)|https://github.com/apache/seatunnel/commit/e0aec5ecec|2.3.2| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |[Chore] Upgrade guava to 27.0-jre (#4238)|https://github.com/apache/seatunnel/commit/4851bee575|2.3.1| |Add redshift datatype convertor (#4245)|https://github.com/apache/seatunnel/commit/b19011517f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |Add S3Catalog (#4121)|https://github.com/apache/seatunnel/commit/7d7f506547|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Engine][Checkpoint]Unified naming style (#3714)|https://github.com/apache/seatunnel/commit/bc0bd3bec3|2.3.0| |[Connector][File-S3]Set AK is not required (#3713)|https://github.com/apache/seatunnel/commit/da3c526172|2.3.0| |[Connector&Engine]Set S3 AK to optional (#3688)|https://github.com/apache/seatunnel/commit/4710918b02|2.3.0| |[Connector][S3]Support s3a protocol (#3632)|https://github.com/apache/seatunnel/commit/ae4cc9c1ec|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][File] Unified exception for file source & sink connectors (#3525)|https://github.com/apache/seatunnel/commit/031e8e263c|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector-V2][File] Improve code structure (#3238)|https://github.com/apache/seatunnel/commit/dd5c353881|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Feature][Connector-V2][S3] Add S3 file source & sink connector (#3119)|https://github.com/apache/seatunnel/commit/f27d68ca9c|2.3.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-file-sftp.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Hotfix][Connector-V2][SFTP] Add quote to sftp file names with wildcard characters (#8501)|https://github.com/apache/seatunnel/commit/c5751b001b|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Improve][Connector-V2] Add some debug log when create dir in (S)FTP (#8286)|https://github.com/apache/seatunnel/commit/8687bb8e91|2.3.9| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Feature][Connector-V2]Sftp file source support multiple table 
(#7824)|https://github.com/apache/seatunnel/commit/cfb8760f58|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Connector-V2] sftp file sink suport multiple table and save mode (#7668)|https://github.com/apache/seatunnel/commit/dc4b9898f7|2.3.8| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |[Feature][Connector-V2] Supports the transfer of any file (#6826)|https://github.com/apache/seatunnel/commit/c1401787b3|2.3.6| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. (#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[BugFix][Connector-file-sftp] Fix SFTPInputStream.close does not correctly trigger the closing of the file stream (#6323) (#6329)|https://github.com/apache/seatunnel/commit/eee881af91|2.3.5| |[Test][E2E] Add thread leak check for connector (#5773)|https://github.com/apache/seatunnel/commit/1f2f3fc5f0|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |[Feature][Connector-V2][File] Support read empty directory 
(#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 (#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. (#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| |[Bug Fix] [seatunnel-connectors-v2][SFTP] Fix incorrect exception handling logic (#4720)|https://github.com/apache/seatunnel/commit/dc350e67c3|2.3.2| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][File] Unified exception for file source & sink connectors (#3525)|https://github.com/apache/seatunnel/commit/031e8e263c|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector-V2][File] Improve code structure (#3238)|https://github.com/apache/seatunnel/commit/dd5c353881|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Feature][Connector-V2][SFTP] Add SFTP file source & sink connector (#3006)|https://github.com/apache/seatunnel/commit/9e496383b8|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-file.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Connector-V2] Support hdfs file multi table source read (#9816)|https://github.com/apache/seatunnel/commit/672af255ef| dev | |[Feature][Transform-V2] Support multimodal embeddings (#9673)|https://github.com/apache/seatunnel/commit/12414c4eab| dev | |[Improve][Connector-V2] File Source Support filtering files by last modified time. (#9526)|https://github.com/apache/seatunnel/commit/cde4c3d410|2.3.12| |[Feature][Format] Improve maxwell_json,canal_json,debezium_json format add ts_ms and table (#9701)|https://github.com/apache/seatunnel/commit/fb8444b946|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Feature] [connector-file] Add configurable sheet_max_rows support for Excel sink connector (#9668)|https://github.com/apache/seatunnel/commit/ea5bc51067|2.3.12| |[Feature][Connector-File-Hadoop]Support multi table sink feature for HdfsFile (#9651)|https://github.com/apache/seatunnel/commit/bb4f743c05|2.3.12| |[Improve][Csv] support configurable CSV delimiter in file connector (#9660)|https://github.com/apache/seatunnel/commit/48fb7ef697|2.3.12| |[Fix][Connector-V2] Update file filter pattern compilation to remove unnecessary quoting (#9658)|https://github.com/apache/seatunnel/commit/b5c7b4ad0e|2.3.12| |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Fix][Connector-File] Fix parquet support user config schema (#9596)|https://github.com/apache/seatunnel/commit/2bdaeb6a07|2.3.12| |[Improve][Connector-file] Add configurable binary chunk size support to BinaryReadStrategy (#9391)|https://github.com/apache/seatunnel/commit/38e87e75a3|2.3.12| |[Feature][Sink] File support new format: maxwell_json,canal_json,debezium_json (#9278) 
(#9336)|https://github.com/apache/seatunnel/commit/a1bfbb20dd|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[Feature][connector-hive] hive sink connector support overwrite mode #7843 (#7891)|https://github.com/apache/seatunnel/commit/6fafe6f4d3|2.3.12| |[Improve][Connector-V2] Add remote host verification option for FTP data channels (#9324)|https://github.com/apache/seatunnel/commit/019d69d10a|2.3.11| |[Doc][Connector-V2] Update save mode config for OssFileSink (#9303)|https://github.com/apache/seatunnel/commit/40097d7f3e|2.3.11| |[Fix][connector-file-base] fix parquet int32 convert error (#9142)|https://github.com/apache/seatunnel/commit/e6413c388e|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][API] Fixed not invoke the `SinkAggregatedCommitter`'s init method (#9070)|https://github.com/apache/seatunnel/commit/df0d11d632|2.3.11| |[Bugfix][Csv] Fix csv format delimiter (#9066)|https://github.com/apache/seatunnel/commit/ff5fc129b8|2.3.11| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Feature][File] Support extract CSV files with different columns in different order (#9064)|https://github.com/apache/seatunnel/commit/74db1cbaac|2.3.11| |[Improve] Refactor file enumerator to prevent duplicate put split (#8989)|https://github.com/apache/seatunnel/commit/fdf1beae9c|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Fix][File]use 
common-csv to read csv file (#8919)|https://github.com/apache/seatunnel/commit/3e64a42838|2.3.10| |[Improve][Connector-V2] Ensure that the FTP connector behaves reliably during directory operation (#8959)|https://github.com/apache/seatunnel/commit/b5f0b43fcb|2.3.10| |[Improve][connector-file-base] Improved multiple table file source allocation algorithm for subtasks (#8878)|https://github.com/apache/seatunnel/commit/44a12cc55c|2.3.10| |[Fix][Connector-V2] Fixed incorrectly setting s3 key in some cases (#8885)|https://github.com/apache/seatunnel/commit/cf4bab5be2|2.3.10| |[Fix][Connector-File] Fix conflicting `file_format_type` requirement (#8823)|https://github.com/apache/seatunnel/commit/6e0d630f7c|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve][Connector-V2] Improve orc read error message (#8751)|https://github.com/apache/seatunnel/commit/d66d9dc9ce|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| | [improve] update S3File connector config option (#8615)|https://github.com/apache/seatunnel/commit/80cc9fa6ff|2.3.10| |[Fix][Connector-V2] User selects csv string pattern (#8572)|https://github.com/apache/seatunnel/commit/227a11f5aa|2.3.10| |[Fix][Connector-V2] Fix CSV String type write type (#8499)|https://github.com/apache/seatunnel/commit/9268f5a255|2.3.10| |[Hotfix][Connector-V2][SFTP] Add quote to sftp file names with wildcard characters (#8501)|https://github.com/apache/seatunnel/commit/c5751b001b|2.3.10| |[Fix][File] Fix Multi-file with binary format synchronization failed (#8546)|https://github.com/apache/seatunnel/commit/6e4ee468a5|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink 
(#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Improve][Connector-file-base] Improved file allocation algorithm for subtasks. (#8453)|https://github.com/apache/seatunnel/commit/d61cba233e|2.3.9| |[Bug] [connector-file] When the data source field is less than the target (Hive) field,it will throw null pointer exception#8150 (#8200)|https://github.com/apache/seatunnel/commit/25b8a02b76|2.3.9| |[Fix] Set all snappy dependency use one version (#8423)|https://github.com/apache/seatunnel/commit/3ac977c8d3|2.3.9| |[Improve][Connector][Hive] skip temporary hidden directories (#8402)|https://github.com/apache/seatunnel/commit/9fdedc487e|2.3.9| |[Feature][Connector-V2] Support use EasyExcel as read excel engine (#8064)|https://github.com/apache/seatunnel/commit/b8e1177fcb|2.3.9| |[BugFix][Excel] Fix read formulas/number cell value of excel (#8316)|https://github.com/apache/seatunnel/commit/00c5aed1af|2.3.9| |[Improve][Connector-V2] Add some debug log when create dir in (S)FTP (#8286)|https://github.com/apache/seatunnel/commit/8687bb8e91|2.3.9| |[Improve][Transform] gz support excel (#8181)|https://github.com/apache/seatunnel/commit/c3ae726ee0|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Improve][Excel] Support read blank string & auto type-cast (#8111)|https://github.com/apache/seatunnel/commit/3a54f1253f|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Feature][Connectors] LocalFile Support reading gz (#8025)|https://github.com/apache/seatunnel/commit/337aa50f08|2.3.9| |[Hotfix][Zeta] Fix the dependency conflict between the guava in hadoop-aws and hive-exec (#7986)|https://github.com/apache/seatunnel/commit/a7837f1f19|2.3.9| |[Fix][Connector-V2] Fix file binary format 
sync convert directory to file (#7942)|https://github.com/apache/seatunnel/commit/86ae9272c4|2.3.9| |[Fix][Connector-V2][FTP] Fix FTP connector connection_mode is not effective (#7865)|https://github.com/apache/seatunnel/commit/26c528a5ed|2.3.9| |[Fix][Connector-V2][connector-file-base-hadoop] Fixed HdfsFile source load the krb5_path configuration (#7870)|https://github.com/apache/seatunnel/commit/cd9836bced|2.3.9| |[Improve][Connector-V2] Change File Read/WriteStrategy `setSeaTunnelRowTypeInfo` to `setCatalogTable` (#7829)|https://github.com/apache/seatunnel/commit/6b5f74e524|2.3.9| |[Feature][Connector-V2]Sftp file source support multiple table (#7824)|https://github.com/apache/seatunnel/commit/cfb8760f58|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Bug] [connectors-v2] The Hadoop Source/Sink fails with Unable to find valid Kerberos Ticket. (#7809)|https://github.com/apache/seatunnel/commit/a8bdea24cc|2.3.9| |[Fix][Connector-V2] Fix When reading Excel data, string and date type conversion errors (#7796)|https://github.com/apache/seatunnel/commit/749b2fe364|2.3.9| |[Feature][Connector-V2]Ftp file source support multiple table (#7795)|https://github.com/apache/seatunnel/commit/22fe27a3d6|2.3.9| |[Feature][Connector-V2] sftp file sink suport multiple table and save mode (#7668)|https://github.com/apache/seatunnel/commit/dc4b9898f7|2.3.8| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Feature][Connector-V2] Ftp file sink suport multiple table and save mode (#7665)|https://github.com/apache/seatunnel/commit/4f812e12ae|2.3.8| |[Improve] Refactor S3FileCatalog and it's factory (#7457)|https://github.com/apache/seatunnel/commit/d928e8b113|2.3.8| |[Improve] Added OSSFileCatalog and it's factory (#7458)|https://github.com/apache/seatunnel/commit/9006a205db|2.3.8| 
|[Feature][Connector-V2][Iceberg] Support Iceberg Kerberos (#7246)|https://github.com/apache/seatunnel/commit/e3001207c8|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[feature][connector-file-local] add save mode function for localfile (#7080)|https://github.com/apache/seatunnel/commit/7b2f538310|2.3.6| |[Hotfix][Hive Connector] Fix Hive hdfs-site.xml and hive-site.xml not be load error (#7069)|https://github.com/apache/seatunnel/commit/c23a577f34|2.3.6| |[Feature][Connector-V2] Add Huawei Cloud OBS connector (#4578)|https://github.com/apache/seatunnel/commit/d266f4db64|2.3.6| |[Improve][File Connector]Improve xml read code & fix can not use true for a boolean option (#6930)|https://github.com/apache/seatunnel/commit/c13a563994|2.3.6| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |[Feature][Connector-V2] Supports the transfer of any file (#6826)|https://github.com/apache/seatunnel/commit/c1401787b3|2.3.6| |[Feature][S3 File] Make S3 File Connector support multiple table write (#6698)|https://github.com/apache/seatunnel/commit/8f2049b2f1|2.3.6| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Improve][Connector-v2] The hive connector support multiple filesystem (#6648)|https://github.com/apache/seatunnel/commit/8a4c01fe35|2.3.6| |[bigfix][S3 File]:Change the [SCHEMA] attribute of the [S3CONF class] to be non-static to avoid being reassigned after deserialization (#6717)|https://github.com/apache/seatunnel/commit/79bb70101a|2.3.6| |[Improve] Improve read with parquet type convert error (#6683)|https://github.com/apache/seatunnel/commit/6c65805699|2.3.5| |[Hotfix] fix 
http source can not read yyyy-MM-dd HH:mm:ss format bug & Improve DateTime Utils (#6601)|https://github.com/apache/seatunnel/commit/19888e7969|2.3.5| |[Feature][Tool] Add connector check script for issue 6199 (#6635)|https://github.com/apache/seatunnel/commit/65aedf6a79|2.3.5| |[Bug] Fix OrcWriteStrategy/ParquetWriteStrategy doesn't login with kerberos (#6472)|https://github.com/apache/seatunnel/commit/24441c876d|2.3.5| |[Bug] [formats] Fix fail to parse line when content contains the file delimiter (#6589)|https://github.com/apache/seatunnel/commit/17e29185fa|2.3.5| |[Improve][Connector-V2] Support read orc with schema config to cast type (#6531)|https://github.com/apache/seatunnel/commit/d1599f8ad9|2.3.5| |[Chore] Fix `file` spell errors (#6606)|https://github.com/apache/seatunnel/commit/2599d3b736|2.3.5| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Feature][Connectors-V2][File]support assign encoding for file source/sink (#6489)|https://github.com/apache/seatunnel/commit/d159fbe086|2.3.5| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. 
(#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[BugFix][Connector-file-sftp] Fix SFTPInputStream.close does not correctly trigger the closing of the file stream (#6323) (#6329)|https://github.com/apache/seatunnel/commit/eee881af91|2.3.5| |[Test][E2E] Add thread leak check for connector (#5773)|https://github.com/apache/seatunnel/commit/1f2f3fc5f0|2.3.4| |Fix HiveMetaStoreProxy#enableKerberos will return true if doesn't enable kerberos (#6307)|https://github.com/apache/seatunnel/commit/1dad6f7061|2.3.4| |[Feature][Connector]add s3file save mode function (#6131)|https://github.com/apache/seatunnel/commit/81c51073bf|2.3.4| |[bugfix][file-execl] Fix the Issue of Abnormal Data Reading from Excel Files (#5932)|https://github.com/apache/seatunnel/commit/6a2b05a845|2.3.4| |[Feature][Connectors-v2-file-ftp] FTP source/sink add ftp connection mode (#6077) (#6099)|https://github.com/apache/seatunnel/commit/f6bcc4d59d|2.3.4| |Disable HDFSFileSystem cache (#6039)|https://github.com/apache/seatunnel/commit/135c91818e|2.3.4| |[Feature][OssFile Connector] Make Oss implement source factory and sink factory (#6062)|https://github.com/apache/seatunnel/commit/1a8e9b4554|2.3.4| |[Improve][Common] Adapt `FILE_OPERATION_FAILED` to `CommonError` (#5928)|https://github.com/apache/seatunnel/commit/b3dc0bbc21|2.3.4| |[Feature][Connector-V2] Support read .xls excel file (#6066)|https://github.com/apache/seatunnel/commit/43787a3dde|2.3.4| |Add multiple table file sink to base (#6049)|https://github.com/apache/seatunnel/commit/085e0e5fc3|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |[Hotfix][Oss File Connector] fix oss connector can not run bug (#6010)|https://github.com/apache/seatunnel/commit/755bc2a730|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Feature][Core] Upgrade flink source translation 
(#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[Feature] LocalFile sink support multiple table (#5931)|https://github.com/apache/seatunnel/commit/0fdf45f94d|2.3.4| |[Improve][File] Clean memory buffer of `JsonWriteStrategy` & `ExcelWriteStrategy` (#5925)|https://github.com/apache/seatunnel/commit/7297a4c95c|2.3.4| |[Bug][Connector][FileBase]Parquet reader parsing array type exception. (#4457)|https://github.com/apache/seatunnel/commit/5c6b11329c|2.3.4| |[Improve]Change System.out.println to log output. (#5912)|https://github.com/apache/seatunnel/commit/bbedb07a9c|2.3.4| |[Feature] LocalFileSource support multiple table|https://github.com/apache/seatunnel/commit/72be6663ad|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Improve][LocalFile] parquet use system timezone (#5605)|https://github.com/apache/seatunnel/commit/b3e13513ac|2.3.4| |[Bugfix][Connector-v2] fix file sink `isPartitionFieldWriteInFile` occurred exception when no columns are given (#5508)|https://github.com/apache/seatunnel/commit/9fb5499295|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |[Feature][Connector-V2][File] Support read empty directory (#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 
(#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Hotfix][File-Connector] Fix WriteStrategy parallel writing thread unsafe issue (#5546)|https://github.com/apache/seatunnel/commit/1177d02d55|2.3.4| |[Bugfix][jindo] Remove useless code (#5540)|https://github.com/apache/seatunnel/commit/b889618379|2.3.4| |[Feature] [File Connector] Supports writing column names when the output type is file (CSV) (#5459)|https://github.com/apache/seatunnel/commit/f73b37291e|2.3.4| |[bugfix][CI]remove jindo dependencies|https://github.com/apache/seatunnel/commit/38e1e30e20|2.3.4| |[Feature][Connector-V2][Oss jindo] Fix the problem of jindo driver download failure. (#5511)|https://github.com/apache/seatunnel/commit/a14d9c0d08|2.3.4| |Revert "[fix][hive-source][bug] fix An error occurred reading an empty directory (#5427)" (#5487)|https://github.com/apache/seatunnel/commit/093901068e|2.3.4| |[fix][hive-source][bug] fix An error occurred reading an empty directory (#5427)|https://github.com/apache/seatunnel/commit/de7b86a5dd|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. (#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| |[bugfix] [File Base] Fix Hadoop Kerberos authentication related issues. 
(#5171)|https://github.com/apache/seatunnel/commit/2a85525f4c|2.3.3| |[Feature][Connector-V2][File] Add cos source&sink (#4979)|https://github.com/apache/seatunnel/commit/1f94676436|2.3.3| |[Improve][Connector[File] Optimize files commit order (#5045)|https://github.com/apache/seatunnel/commit/1e18a8c530|2.3.3| |[Improve][Connector-V2][OSS-Jindo] Optimize jindo oss connector (#4964)|https://github.com/apache/seatunnel/commit/5fbfd05061|2.3.3| |[Feature][E2E][FtpFile] add ftp file e2e test case (#4647)|https://github.com/apache/seatunnel/commit/b1b1f5e7e0|2.3.3| |[Bugfix] [Connector-V2] [File] Fix read temp file (#4876)|https://github.com/apache/seatunnel/commit/5e03d22d6c|2.3.2| |[Bug Fix] [seatunnel-connectors-v2][SFTP] Fix incorrect exception handling logic (#4720)|https://github.com/apache/seatunnel/commit/dc350e67c3|2.3.2| |[Fix][Connector-V2] Fix file-oss config check bug and amend file-oss-jindo factoryIdentifier (#4581)|https://github.com/apache/seatunnel/commit/5c4f17df20|2.3.2| |[chore] delete unavailable S3 & Kafka Catalogs (#4477)|https://github.com/apache/seatunnel/commit/e0aec5ecec|2.3.2| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |[Chore] Upgrade guava to 27.0-jre (#4238)|https://github.com/apache/seatunnel/commit/4851bee575|2.3.1| |Add redshift datatype convertor (#4245)|https://github.com/apache/seatunnel/commit/b19011517f|2.3.1| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[chore] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/291214ad6f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] 
Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Imprve][Connector-V2][Hive] Support read text table & Column projection (#4105)|https://github.com/apache/seatunnel/commit/717620f542|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |Add S3Catalog (#4121)|https://github.com/apache/seatunnel/commit/7d7f506547|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Hive] Support assign partitions (#3842)|https://github.com/apache/seatunnel/commit/6a4a850b4c|2.3.1| |[Bug][Connectors] Text And Json WriteStrategy lost the sinkColumnsIndexInRow (#3863)|https://github.com/apache/seatunnel/commit/7b5f6f1bc2|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector-V2][File] Allow the user to set the row delimiter as an empty string (#3854)|https://github.com/apache/seatunnel/commit/84508fcb65|2.3.1| |[Feature][Connector-V2] Support kerberos in hive and hdfs file connector (#3840)|https://github.com/apache/seatunnel/commit/055ad9d836|2.3.1| |[Feature][Connector-V2][File] Support skip number when reading text csv files (#3900)|https://github.com/apache/seatunnel/commit/243b6a6b23|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Improve][Connector-V2][File] File Connector add lzo compression way. 
(#3782)|https://github.com/apache/seatunnel/commit/8875d02589|2.3.1| |[Improve][Connector-V2] The log outputs detailed exception stack information (#3805)|https://github.com/apache/seatunnel/commit/d0c6217f27|2.3.1| |fix file source connector option rule bug (#3804)|https://github.com/apache/seatunnel/commit/cab42f6eb1|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Improve][Connector-V2][HDFS] Support setting hdfs-site.xml (#3778)|https://github.com/apache/seatunnel/commit/c8d59ecac1|2.3.0| |[Feature][Connector-V2][File] Optimize filesystem utils (#3749)|https://github.com/apache/seatunnel/commit/ac4e880fb5|2.3.0| |[Improve] [Connector-V2] Fix Kafka sink can't run EXACTLY_ONCE semantics (#3724)|https://github.com/apache/seatunnel/commit/5e3f196e29|2.3.0| |[Connector-V2] [File] Fix bug data file name will duplicate when use SeaTunnel Engine (#3717)|https://github.com/apache/seatunnel/commit/c96c53004f|2.3.0| |[Engine][Checkpoint]Unified naming style (#3714)|https://github.com/apache/seatunnel/commit/bc0bd3bec3|2.3.0| |[Connector][File-S3]Set AK is not required (#3713)|https://github.com/apache/seatunnel/commit/da3c526172|2.3.0| |[Hotfix][Connector-V2][File] Fix file sink connector npe (#3706)|https://github.com/apache/seatunnel/commit/a662a88fdc|2.3.0| |[Connector&Engine]Set S3 AK to optional (#3688)|https://github.com/apache/seatunnel/commit/4710918b02|2.3.0| |[Hotfix][OssFile Connector]fix ossfile bug (#3684)|https://github.com/apache/seatunnel/commit/ba6259274d|2.3.0| |[Feature][Connector-V2][Oss jindo] Add oss jindo source & sink connector (#3456)|https://github.com/apache/seatunnel/commit/2507372311|2.3.0| |[Improve][Connector-V2][File] Support split file based on batch size (#3625)|https://github.com/apache/seatunnel/commit/f39e3a531d|2.3.0| |[Connector][S3]Support s3a protocol (#3632)|https://github.com/apache/seatunnel/commit/ae4cc9c1ec|2.3.0| |[Hotfix][OptionRule] Fix option rule 
about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][File] Unified excetion for file source & sink connectors (#3525)|https://github.com/apache/seatunnel/commit/031e8e263c|2.3.0| |[Hotfix][Connector-V2][Hive] Fix npe of getting file system (#3506)|https://github.com/apache/seatunnel/commit/e1fc3d1b01|2.3.0| |[Improve][core-v1][seatunnel-core-base] remove seatunnel-core-base (#3480)|https://github.com/apache/seatunnel/commit/d6e6a02a36|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector-V2][File] Improve code structure (#3238)|https://github.com/apache/seatunnel/commit/dd5c353881|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Hotfix][Connector-V2][Hive] Fix the bug that when write data to hive throws NullPointerException (#3258)|https://github.com/apache/seatunnel/commit/777bf6b42e|2.3.0| |[Core] [Improve] Fix some sonar check error (#3240)|https://github.com/apache/seatunnel/commit/8664bb53a5|2.3.0| |[Bug]add 3node worker done test and fix some bug (#3115)|https://github.com/apache/seatunnel/commit/bc852a4dff|2.3.0| |[Feature][Connector-V2][SFTP] Add SFTP file source & sink connector (#3006)|https://github.com/apache/seatunnel/commit/9e496383b8|2.3.0| |[Feature][Connector-V2][S3] Add S3 file source & sink connector (#3119)|https://github.com/apache/seatunnel/commit/f27d68ca9c|2.3.0-beta| |[Feature][Connector-V2][File] Fix filesystem get error (#3117)|https://github.com/apache/seatunnel/commit/7404c180de|2.3.0-beta| |[Improve][Connector-v2][file] Reuse array type container when read row data (#3123)|https://github.com/apache/seatunnel/commit/da0646ac6d|2.3.0-beta| |[Hotfix][Connector-V2][File] Fix ParquetReadStrategy get NPE 
(#3122)|https://github.com/apache/seatunnel/commit/ba99de08c8|2.3.0-beta| |[hotfix][engine] Add master node switch test and fix bug (#3082)|https://github.com/apache/seatunnel/commit/608be51bc4|2.3.0-beta| |[Improve][Connector-V2][File] Support parse field from file path (#2985)|https://github.com/apache/seatunnel/commit/0bc12085c2|2.3.0-beta| |[hotfix][connector][file] Solved the bug of can not parse '\t' as delimiter from config file (#3083)|https://github.com/apache/seatunnel/commit/bfde596754|2.3.0-beta| |unify `flatten-maven-plugin` version (#3078)|https://github.com/apache/seatunnel/commit/ed743fddcc|2.3.0-beta| |[Improve][Connector-V2] Improve text write (#2971)|https://github.com/apache/seatunnel/commit/0ecd7906c2|2.3.0-beta| |[Improve][connector][file] Support user-defined schema for reading text file (#2976)|https://github.com/apache/seatunnel/commit/1c05ee0d7e|2.3.0-beta| |[Bug][Connector-V2][File] Fix the bug of incorrect path in windows environment (#2980)|https://github.com/apache/seatunnel/commit/2e16161865|2.3.0-beta| |[Improve][Connector] Improve write parquet (#2943)|https://github.com/apache/seatunnel/commit/8fd966394b|2.3.0-beta| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Bug][connector-file-base] Fix source split assigning reader to negative number (#2921)|https://github.com/apache/seatunnel/commit/0b5a2852fb|2.3.0-beta| |[Improve][Connector-V2] Improve orc write strategy to support all data types (#2860)|https://github.com/apache/seatunnel/commit/4d048cc23e|2.3.0-beta| |[Fix] [Connector-V2-File] Fix file connector bug (#2858)|https://github.com/apache/seatunnel/commit/e0459bbab6|2.2.0-beta| |[Fix][Connector-V2] Fix HiveSource Connector read orc table error (#2845)|https://github.com/apache/seatunnel/commit/61720306e7|2.2.0-beta| |[Improve][Connector-V2] Improve read parquet (#2841)|https://github.com/apache/seatunnel/commit/e19bc82f9b|2.2.0-beta| |[Imporve][Connector-V2] 
Refactor ftp sink & Add ftp file source (#2774)|https://github.com/apache/seatunnel/commit/4aacbcdd1f|2.2.0-beta| |[Bug] [Connector-V2] Fix hive source connector parallelism not work (#2823)|https://github.com/apache/seatunnel/commit/9f21d4c769|2.2.0-beta| |[Improve][Connector-V2] Imporve orc read strategy (#2747)|https://github.com/apache/seatunnel/commit/af34beda37|2.2.0-beta| |[Bug][Connector-V2] Fix error option (#2775)|https://github.com/apache/seatunnel/commit/488e561eef|2.2.0-beta| |[Improve][Connector-V2] Refactor hdfs file sink connector code structure (#2701)|https://github.com/apache/seatunnel/commit/6129c02567|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Improve][build] Improved scope of maven-shade-plugin (#2665)|https://github.com/apache/seatunnel/commit/93bc8bd116|2.2.0-beta| |[Improve][Connector-V2] Refactor local file sink connector code structure (#2655)|https://github.com/apache/seatunnel/commit/6befd599a1|2.2.0-beta| |[Feature][Connector-V2] Add oss sink (#2629)|https://github.com/apache/seatunnel/commit/bb2ad40487|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Improve][Connector-V2] Refactor the structure of file sink to reduce redundant codes (#2555)|https://github.com/apache/seatunnel/commit/6315092930|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Feature][Connector-V2] Add oss source connector (#2467)|https://github.com/apache/seatunnel/commit/712b77744e|2.2.0-beta| |[Feature][File connector] Support ftp file sink (#2483)|https://github.com/apache/seatunnel/commit/a87e5de80a|2.2.0-beta| |[Feature][Connector-V2] Local file json support (#2465)|https://github.com/apache/seatunnel/commit/65a92f2496|2.2.0-beta| |[Feature][Connector-V2] 
Add hdfs file json support (#2451)|https://github.com/apache/seatunnel/commit/84f6b17c15|2.2.0-beta| |[Improve][Connector-V2] Refactor the package of hdfs file connector (#2402)|https://github.com/apache/seatunnel/commit/87d0624c5b|2.2.0-beta| |[Feature][Connector-V2] Add hdfs file source connector (#2420)|https://github.com/apache/seatunnel/commit/4fb6f2a216|2.2.0-beta| |[Feature][Connector-V2] Add local file connector source (#2419)|https://github.com/apache/seatunnel/commit/eff595c452|2.2.0-beta| |[Feature][Connector-V2] Add base source connector code for connector-file-base (#2399)|https://github.com/apache/seatunnel/commit/1829ddc662|2.2.0-beta| |[Improve][Connector-V2] Refactor the package of local file connector (#2403)|https://github.com/apache/seatunnel/commit/a538daed5c|2.2.0-beta| |[Feature][Connector-V2] Add json file sink & json format (#2385)|https://github.com/apache/seatunnel/commit/dd68c06b0a|2.2.0-beta| |[Bug][Connector-V2] Fix the bug that file connector release resources multi times (#2379)|https://github.com/apache/seatunnel/commit/58c64aab2a|2.2.0-beta| |[Improve][Connector-V2] Optimize the code structure (#2380)|https://github.com/apache/seatunnel/commit/7376ec7ab1|2.2.0-beta| |[Imporve][Connector-V2] Remove redundant type judge logic because of pr #2315 (#2370)|https://github.com/apache/seatunnel/commit/42e8c25e50|2.2.0-beta| |[Feature][Connector-V2] Support orc file format in file connector (#2369)|https://github.com/apache/seatunnel/commit/f44fe1e033|2.2.0-beta| |[improve][UT] Upgrade junit to 5.+ (#2305)|https://github.com/apache/seatunnel/commit/362319ff3e|2.2.0-beta| |Replace plain string with constants (#2308)|https://github.com/apache/seatunnel/commit/3c0415e56e|2.2.0-beta| |[Connector-V2] Add parquet writer in file connector (#2273)|https://github.com/apache/seatunnel/commit/c95cc72cfa|2.2.0-beta| |[checkstyle] Improved validation scope of MagicNumber (#2194)|https://github.com/apache/seatunnel/commit/6d08b5f369|2.2.0-beta| 
|[Connector-V2] Add Hive sink connector v2 (#2158)|https://github.com/apache/seatunnel/commit/23ad4ee735|2.2.0-beta| |[Connector-V2] Add File Sink Connector (#2117)|https://github.com/apache/seatunnel/commit/e2283da64f|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-fluss.md ================================================
Change Log | Change | Commit | Version | |--------|--------|---------|
================================================ FILE: docs/en/connectors/changelog/connector-google-firestore.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve] filestore options (#8921)|https://github.com/apache/seatunnel/commit/b60ef97c95|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Feature][Connector][GoogleFirestore-Sink] Support GoogleFirestore Sink (#4304)|https://github.com/apache/seatunnel/commit/f13c2614d2|2.3.2|
================================================ FILE: docs/en/connectors/changelog/connector-google-sheets.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] google sheets options (#8922)|https://github.com/apache/seatunnel/commit/48ede612dc|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][API] Make sure the table name in TablePath not be null (#7252)|https://github.com/apache/seatunnel/commit/764d8b0bc8|2.3.7| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Improve][Connector-V2] Replace CommonErrorCodeDeprecated.JSON_OPERATION_FAILED (#5978)|https://github.com/apache/seatunnel/commit/456cd17714|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][shade][Jackson] Add seatunnel-jackson module (#3947)|https://github.com/apache/seatunnel/commit/5d8862ec9c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][GoogleSheets] Unified exception for GoogleSheets source connector (#3524)|https://github.com/apache/seatunnel/commit/eb42d629ad|2.3.0| |[Feature][Connector-V2][Google Sheets] Add Google Sheets option rules (#3364)|https://github.com/apache/seatunnel/commit/da33f730ca|2.3.0| |fix: schema get error (#3361)|https://github.com/apache/seatunnel/commit/fdaa85ed24|2.3.0| |[Feature][Connector-V2][GoogleSheets] Support GoogleSheets Source (#3185)|https://github.com/apache/seatunnel/commit/60ecc6428b|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-graphql.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][connector-http] Parameters support placeholder replacement (#9184)|https://github.com/apache/seatunnel/commit/8617014edc|2.3.11| |[Feature][Connector-V2] Support GraphQL Connector (#8557) (#9021)|https://github.com/apache/seatunnel/commit/9eec2520c0|2.3.11|
================================================ FILE: docs/en/connectors/changelog/connector-hbase.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve] hbase options (#8923)|https://github.com/apache/seatunnel/commit/b6a702b58f|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connector-V2] Fix known directory create and delete ignore issues (#7700)|https://github.com/apache/seatunnel/commit/e2fb679577|2.3.8| |[Feature][Connector-V2][Hbase] implement hbase catalog (#7516)|https://github.com/apache/seatunnel/commit/b978792cb1|2.3.8| |[Feature][Connector-V2] Support multi-table sink feature for HBase (#7169)|https://github.com/apache/seatunnel/commit/025fa3bb88|2.3.8| |[hotfix][connector-v2-hbase]fix and optimize hbase source problem (#7148)|https://github.com/apache/seatunnel/commit/34a6b8e9f6|2.3.7| |[Improve][hbase] The specified column is written to the specified column family (#5234)|https://github.com/apache/seatunnel/commit/49d397c61d|2.3.6| |[feature][connector-v2-hbase-sink] Support Connector v2 HBase sink TTL data writing (#7116)|https://github.com/apache/seatunnel/commit/adafd80255|2.3.6| |[E2E][HBase]Refactor hbase e2e (#6859)|https://github.com/apache/seatunnel/commit/1da9bd6ce4|2.3.6| |[Connector]Add hbase source connector (#6348)|https://github.com/apache/seatunnel/commit/f108a5e658|2.3.6| |[Feature][HbaseSink]support array data. 
(#6100)|https://github.com/apache/seatunnel/commit/b592014766|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Hotfix][Connector-v2][HbaseSink]Fix default timestamp (#4958)|https://github.com/apache/seatunnel/commit/3d8f3bf902|2.3.3| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][Hbase] Introduce hbase sink connector (#4049)|https://github.com/apache/seatunnel/commit/68bda94a4c|2.3.1|
================================================ FILE: docs/en/connectors/changelog/connector-hive.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Feature][connector-hive] hive sink connector support overwrite mode #7843 (#7891)|https://github.com/apache/seatunnel/commit/6fafe6f4d3|2.3.12| |[Fix][Connector-V2] Fix hive client thread unsafe (#9282)|https://github.com/apache/seatunnel/commit/5dc25897a9|2.3.11| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve] Refactor file enumerator to prevent duplicate put split (#8989)|https://github.com/apache/seatunnel/commit/fdf1beae9c|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Improve][connector-hive] Improved hive file allocation algorithm for subtasks (#8876)|https://github.com/apache/seatunnel/commit/89d1878ade|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Fix][Hive] Writing parquet files supports the optional timestamp int96 (#8509)|https://github.com/apache/seatunnel/commit/856aea1952|2.3.10| |[Fix] Set all snappy dependency use one version (#8423)|https://github.com/apache/seatunnel/commit/3ac977c8d3|2.3.9| |[Fix][Connector-V2] Fix hive krb5 path not work (#8228)|https://github.com/apache/seatunnel/commit/e18a4d07b4|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Improve][API] Unified 
tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Feature][Core] Rename `result_table_name`/`source_table_name` to `plugin_input/plugin_output` (#8072)|https://github.com/apache/seatunnel/commit/c7bbd322db|2.3.9| |[Feature][E2E] Add hive3 e2e test case (#8003)|https://github.com/apache/seatunnel/commit/9a24fac2c4|2.3.9| |[Improve][Connector-V2] Change File Read/WriteStrategy `setSeaTunnelRowTypeInfo` to `setCatalogTable` (#7829)|https://github.com/apache/seatunnel/commit/6b5f74e524|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Zeta] Split the classloader of task group (#7580)|https://github.com/apache/seatunnel/commit/3be0d1cc61|2.3.8| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Improve][Hive] Close resources when exception occurs (#7205)|https://github.com/apache/seatunnel/commit/561171528b|2.3.6| |[Hotfix][Hive Connector] Fix Hive hdfs-site.xml and hive-site.xml not be load error (#7069)|https://github.com/apache/seatunnel/commit/c23a577f34|2.3.6| |Fix hive load hive_site_path and hdfs_site_path too late (#7017)|https://github.com/apache/seatunnel/commit/e2578a5b4d|2.3.6| |[Bug] [connector-hive] Eanble login with kerberos for hive (#6893)|https://github.com/apache/seatunnel/commit/26e433e472|2.3.6| |[Feature][S3 File] Make S3 File Connector support multiple table write (#6698)|https://github.com/apache/seatunnel/commit/8f2049b2f1|2.3.6| |[Feature] Hive Source/Sink support multiple table (#5929)|https://github.com/apache/seatunnel/commit/4d9287fce4|2.3.6| |[Improve][Hive] udpate hive3 version (#6699)|https://github.com/apache/seatunnel/commit/1184c05c29|2.3.6| |[HiveSink]Fix the risk of resource leakage. 
(#6721)|https://github.com/apache/seatunnel/commit/c23804f13b|2.3.6| |[Improve][Connector-v2] The hive connector support multiple filesystem (#6648)|https://github.com/apache/seatunnel/commit/8a4c01fe35|2.3.6| |[Fix][Connector-V2] Fix add hive partition error when partition already existed (#6577)|https://github.com/apache/seatunnel/commit/2a0a0b9d19|2.3.5| |Fix HiveMetaStoreProxy#enableKerberos will return true if doesn't enable kerberos (#6307)|https://github.com/apache/seatunnel/commit/1dad6f7061|2.3.4| |[Feature][Engine] Unify job env parameters (#6003)|https://github.com/apache/seatunnel/commit/2410ab38f0|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Hotfix][Connector-V2][Hive] fix the bug that hive-site.xml can not be injected in HiveConf (#5261)|https://github.com/apache/seatunnel/commit/04ce22ac1e|2.3.4| |[Improve][Connector-v2][HiveSink]remove drop partition when abort. 
(#4940)|https://github.com/apache/seatunnel/commit/edef87b523|2.3.3| |[feature][web] hive add option because web need (#5154)|https://github.com/apache/seatunnel/commit/5e1511ff0d|2.3.3| |[Hotfix][Connector-V2][Hive] Support user-defined hive-site.xml (#4965)|https://github.com/apache/seatunnel/commit/2a064bcdb0|2.3.3| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |[hotfix] fixed schema options import error|https://github.com/apache/seatunnel/commit/656805f2df|2.3.1| |[chore] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/291214ad6f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Imprve][Connector-V2][Hive] Support read text table & Column projection (#4105)|https://github.com/apache/seatunnel/commit/717620f542|2.3.1| |[Hotfix][Connector-V2][Hive] Fix hive unknownhost (#4141)|https://github.com/apache/seatunnel/commit/f1a1dfe4af|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Hive] Support assign partitions (#3842)|https://github.com/apache/seatunnel/commit/6a4a850b4c|2.3.1| |[Improve][Connector-V2][Hive] Improve config check logic (#3886)|https://github.com/apache/seatunnel/commit/b4348f6f44|2.3.1| |[Feature][Connector-V2] Support kerberos in hive and hdfs file connector (#3840)|https://github.com/apache/seatunnel/commit/055ad9d836|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2] The log outputs detailed exception stack information (#3805)|https://github.com/apache/seatunnel/commit/d0c6217f27|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Feature][Connector-V2][File] Optimize filesystem utils (#3749)|https://github.com/apache/seatunnel/commit/ac4e880fb5|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Hotfix][Connector-V2][Hive] Fix npe of getting file system (#3506)|https://github.com/apache/seatunnel/commit/e1fc3d1b01|2.3.0| |[Improve][Connector-V2][Hive] Unified exceptions for hive source & sink connector (#3541)|https://github.com/apache/seatunnel/commit/12c0fb91d2|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Hotfix][Connector-V2][Hive] Fix the bug that when write data to hive throws NullPointerException (#3258)|https://github.com/apache/seatunnel/commit/777bf6b42e|2.3.0| |[Improve][Connector-V2][Hive] Hive Sink Support msck partitions (#3133)|https://github.com/apache/seatunnel/commit/a8738ef3c4|2.3.0-beta| |unify `flatten-maven-plugin` version (#3078)|https://github.com/apache/seatunnel/commit/ed743fddcc|2.3.0-beta| |[Engine][Merge] fix merge 
problem|https://github.com/apache/seatunnel/commit/0e9ceeefc9|2.3.0-beta| |Merge remote-tracking branch 'upstream/dev' into st-engine|https://github.com/apache/seatunnel/commit/ca80df779a|2.3.0-beta| |update hive.metastore.version to hive.exec.version (#2879)|https://github.com/apache/seatunnel/commit/018ee0a3db|2.2.0-beta| |[Bug][Connector-V2] Fix hive sink bug (#2870)|https://github.com/apache/seatunnel/commit/d661fa011e|2.2.0-beta| |[Fix][Connector-V2] Fix HiveSource Connector read orc table error (#2845)|https://github.com/apache/seatunnel/commit/61720306e7|2.2.0-beta| |[Bug][Connector-V2] Fix hive source text table name (#2797)|https://github.com/apache/seatunnel/commit/563637ebd1|2.2.0-beta| |[Improve][Connector-V2] Refactor hive source & sink connector (#2708)|https://github.com/apache/seatunnel/commit/a357dca365|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706) (#2731)|https://github.com/apache/seatunnel/commit/e8929ab605|2.3.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Improve][Connector-V2] Refactor the package of hdfs file connector (#2402)|https://github.com/apache/seatunnel/commit/87d0624c5b|2.2.0-beta| |[Feature][Connector-V2] Add orc file support in connector hive sink (#2311) (#2374)|https://github.com/apache/seatunnel/commit/81cb80c050|2.2.0-beta| |[improve][UT] Upgrade junit to 5.+ (#2305)|https://github.com/apache/seatunnel/commit/362319ff3e|2.2.0-beta| |Decide table format using outputFormat in HiveSinkConfig #2303|https://github.com/apache/seatunnel/commit/3a2586f6dc|2.2.0-beta| |[Feature][Connector-V2-Hive] Add parquet file format support to Hive Sink (#2310)|https://github.com/apache/seatunnel/commit/4ab3c21b8d|2.2.0-beta| |Add BaseHiveCommitInfo for common hive 
commit info (#2306)|https://github.com/apache/seatunnel/commit/0d2f6f4d7c|2.2.0-beta| |Remove same code to independent method in HiveSinkWriter (#2307)|https://github.com/apache/seatunnel/commit/e99e6ee726|2.2.0-beta| |Avoid potential null pointer risk in HiveSinkWriter#snapshotState (#2302)|https://github.com/apache/seatunnel/commit/e7d817f7d2|2.2.0-beta| |[Connector-V2] Add file type check logic in hive connector (#2275)|https://github.com/apache/seatunnel/commit/5488337c67|2.2.0-beta| |[Connector-V2] Add parquet file reader for Hive Source Connector (#2199) (#2237)|https://github.com/apache/seatunnel/commit/59db97ed34|2.2.0-beta| |Merge from dev to st-engine (#2243)|https://github.com/apache/seatunnel/commit/41e530afd5|2.3.0-beta| |StateT of SeaTunnelSource should extend `Serializable` (#2214)|https://github.com/apache/seatunnel/commit/8c426ef850|2.2.0-beta| |[Bug][connector-hive] filter '_SUCCESS' file in file list (#2235) (#2236)|https://github.com/apache/seatunnel/commit/db04651523|2.2.0-beta| |[Bug][hive-connector-v2] Resolve the schema inconsistency bug (#2229) (#2230)|https://github.com/apache/seatunnel/commit/62ca075915|2.2.0-beta| |[Bug][spark-connector-v2-example] fix the bug of no class found. (#2191) (#2192)|https://github.com/apache/seatunnel/commit/5dbc2df17e|2.2.0-beta| |[Connector-V2] Add Hive sink connector v2 (#2158)|https://github.com/apache/seatunnel/commit/23ad4ee735|2.2.0-beta| |[Connector-V2] Add File Sink Connector (#2117)|https://github.com/apache/seatunnel/commit/e2283da64f|2.2.0-beta| |[Connector-V2]Hive Source (#2123)|https://github.com/apache/seatunnel/commit/ffcf3f59e2|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-http-airtable.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- |
================================================ FILE: docs/en/connectors/changelog/connector-http-base.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Fix][Connectors-v2] Fix UT for connector-http (#9821)|https://github.com/apache/seatunnel/commit/2653f6798e| dev | |[Fix][connector-http] fix parsing httpjson, the number of two fields is inconsistent with the import failure (#9103)|https://github.com/apache/seatunnel/commit/c8ade098ee|2.3.12| |[Fix][Connector-HTTP] Add default content-type when user not set (#9497)|https://github.com/apache/seatunnel/commit/8da0a78c1d|2.3.12| |[Bug][connector-http] Fix paging request running infinitely (#9504)|https://github.com/apache/seatunnel/commit/1844e04c97|2.3.12| |[Bug] [seatunnel-connector-http-base] An NPE (NullPointerException) will occur when the pageField is null (#9498)|https://github.com/apache/seatunnel/commit/b898a3225c|2.3.12| |[Fix][Connector-Http] fix Invalid mime type (#9363)|https://github.com/apache/seatunnel/commit/4d7d765a26|2.3.12| |[Feature][http-Sink] Implementing http batch writes (#9292)|https://github.com/apache/seatunnel/commit/04ee8aca04|2.3.11| |[Feature][connector-http] Parameters support placeholder replacement (#9184)|https://github.com/apache/seatunnel/commit/8617014edc|2.3.11| |[Improve][Connector-V2][Http] Supports Cursor-based Pagination (#9109) (#9138)|https://github.com/apache/seatunnel/commit/879b1e2d5b|2.3.11| |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Fix][connector-http] fix when post have param (#8434)|https://github.com/apache/seatunnel/commit/c1b2675ab0|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Connector-V2] Add prometheus source and sink (#7265)|https://github.com/apache/seatunnel/commit/dde6f9fcbd|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes 
(#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connector-V2] Fix http source can not read streaming (#7703)|https://github.com/apache/seatunnel/commit/a0ffa7ba02|2.3.8| |[Feature][Connector-V2] Suport choose the start page in http paging (#7180)|https://github.com/apache/seatunnel/commit/ed15f0dcf9|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Improve][API] Make sure the table name in TablePath not be null (#7252)|https://github.com/apache/seatunnel/commit/764d8b0bc8|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Improve][CDC] Close idle subtasks gorup(reader/writer) in increment phase (#6526)|https://github.com/apache/seatunnel/commit/454c339b9c|2.3.6| |Fix HttpSource bug (#6824)|https://github.com/apache/seatunnel/commit/c3ab84caa4|2.3.6| |[Hotfix] fix http source can not read yyyy-MM-dd HH:mm:ss format bug & Improve DateTime Utils (#6601)|https://github.com/apache/seatunnel/commit/19888e7969|2.3.5| |[Improve][Connector-V2]Support multi-table sink feature for httpsink (#6316)|https://github.com/apache/seatunnel/commit/e6c51a95c7|2.3.5| |[Improve][HttpConnector]Increase custom configuration timeout. 
(#6223)|https://github.com/apache/seatunnel/commit/fa5b7d3d83|2.3.4| |[Feature][Core] Upgrade flink source translation (#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[BUG][Connector-V2][Http] fix bug http config no schema option and improve e2e test add case (#5939)|https://github.com/apache/seatunnel/commit/8a71b9e072|2.3.4| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on redis (#5901)|https://github.com/apache/seatunnel/commit/e84dcb8c10|2.3.4| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Feature][Transform] add JsonPath transform (#5632)|https://github.com/apache/seatunnel/commit/d908f0af40|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Feature][Connector-V2] HTTP supports page increase #5477 (#5561)|https://github.com/apache/seatunnel/commit/bb180b2988|2.3.4| |[improve][Connector-V2][http] improve http e2e test (#5655)|https://github.com/apache/seatunnel/commit/f5867adcaa|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[BUG][Connector-V2][http] fix httpheader cover (#5446)|https://github.com/apache/seatunnel/commit/cdd8e0a65e|2.3.4| |[Feature][Connector][Http] Support multi-line text splits (#4698)|https://github.com/apache/seatunnel/commit/6a524981cb|2.3.2| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] 
Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Hotfix] [seatunnel-connectors-v2] [connector-http] fix http json request error (#3629)|https://github.com/apache/seatunnel/commit/54f594d6ca|2.3.0| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Improve][Connector-V2][Http]Unified exception for http source & sink… (#3594)|https://github.com/apache/seatunnel/commit/d798cd8670|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2][Lemlist]Add Lemlist source connector (#3346)|https://github.com/apache/seatunnel/commit/12d66b4247|2.3.0| |[Improve][Connector-V2][My Hours]Add http method enum && Improve My Hours connector option rule (#3390)|https://github.com/apache/seatunnel/commit/a86c9d90f7|2.3.0| |[Feature][Connector-V2][Http] Add option rules && Improve Myhours sink connector (#3351)|https://github.com/apache/seatunnel/commit/cc8bb60c83|2.3.0| |[Feature][Connector-V2][My Hours] Add My Hours Source Connector (#3228)|https://github.com/apache/seatunnel/commit/4104a3e30e|2.3.0| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Bug][format][json] Fix jackson package conflict with spark (#2934)|https://github.com/apache/seatunnel/commit/1a92b8369b|2.3.0-beta| |[Bug][Connector-V2] Fix wechat sink data serialization 
(#2856)|https://github.com/apache/seatunnel/commit/3aee11fc16|2.3.0-beta| |[Improve][Connector-V2] Improve http connector (#2833)|https://github.com/apache/seatunnel/commit/5b3957bc52|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Improve][build] Improved scope of maven-shade-plugin (#2665)|https://github.com/apache/seatunnel/commit/93bc8bd116|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Bug][Connector-V2] Fix the bug that set params by mistake (#2511) (#2513)|https://github.com/apache/seatunnel/commit/ead3d68b0e|2.2.0-beta| |[Improve][Connector-V2] Http source support user-defined schema (#2439)|https://github.com/apache/seatunnel/commit/793933b6b8|2.2.0-beta| |[Improve][Connector-V2] Format SeaTunnelRow use seatunnel-format-json (#2435)|https://github.com/apache/seatunnel/commit/e4e8f7fbff|2.2.0-beta| |[Improve][Connector-V2] Make the attribute of http-connector from private to protected (#2418)|https://github.com/apache/seatunnel/commit/f3b00ef696|2.2.0-beta| |[Feature][Connector-V2] Add feishu sink (#2381)|https://github.com/apache/seatunnel/commit/0fec8ca438|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-http-feishu.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][Http] Add option rules && Improve Myhours sink connector (#3351)|https://github.com/apache/seatunnel/commit/cc8bb60c83|2.3.0| |[Feature][Connector-V2] Add feishu sink (#2381)|https://github.com/apache/seatunnel/commit/0fec8ca438|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-http-github.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Feature][Connector-V2][Github] Adding Github Source Connector (#4155)|https://github.com/apache/seatunnel/commit/49d9172b10|2.3.1|
================================================ FILE: docs/en/connectors/changelog/connector-http-gitlab.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Gitlab] Unified excetion for Gitlab connector and improve optione rule (#3533)|https://github.com/apache/seatunnel/commit/77f68f1eef|2.3.0| |[Feature][Connector V2] add gitlab source connector (#3408)|https://github.com/apache/seatunnel/commit/545595c6d2|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-http-jira.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2][Jira]Add Jira source connector (#3473)|https://github.com/apache/seatunnel/commit/fb40162c07|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-http-klaviyo.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Klaviyo]Unified exception for Klaviyo connector (#3555)|https://github.com/apache/seatunnel/commit/08f8615078|2.3.0| |[Feature][Connector-V2][Klaviyo]Add Klaviyo source connector (#3443)|https://github.com/apache/seatunnel/commit/fc00a2866b|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-http-lemlist.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Lemlist] Unified exception for lemlist connector (#3534)|https://github.com/apache/seatunnel/commit/705728ebbb|2.3.0| |[Feature][Connector-V2][Lemlist]Add Lemlist source connector (#3346)|https://github.com/apache/seatunnel/commit/12d66b4247|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-http-myhours.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][connector-http] Parameters support placeholder replacement (#9184)|https://github.com/apache/seatunnel/commit/8617014edc|2.3.11| |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Fix][connector-http] fix when post have param (#8434)|https://github.com/apache/seatunnel/commit/c1b2675ab0|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][MyHours]Unified exception for MyHours connector (#3538)|https://github.com/apache/seatunnel/commit/48ab7c97d5|2.3.0| |[HotFix][Core][API] Fix OptionValidation error code (#3439)|https://github.com/apache/seatunnel/commit/ace219f376|2.3.0| |[Improve][Connector-V2][My Hours]Add http method enum && Improve My Hours connector option rule (#3390)|https://github.com/apache/seatunnel/commit/a86c9d90f7|2.3.0| |[Feature][Connector-V2][Http] Add option rules && Improve Myhours sink connector (#3351)|https://github.com/apache/seatunnel/commit/cc8bb60c83|2.3.0| |[Feature][Connector-V2][My Hours] Add My Hours Source Connector 
(#3228)|https://github.com/apache/seatunnel/commit/4104a3e30e|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-http-notion.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][Notion] Add Notion source connector (#3470)|https://github.com/apache/seatunnel/commit/46abc6d943|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-http-onesignal.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Improve][Connector-V2][OneSignal]Unified exception for OneSignal connector (#3609)|https://github.com/apache/seatunnel/commit/97cce8c255|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2][OneSignal]Add OneSignal source conector (#3454)|https://github.com/apache/seatunnel/commit/b318b3166f|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-http-persistiq.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][Persistiq]Add Persistiq source connector (#3460)|https://github.com/apache/seatunnel/commit/aec3912edf|2.3.1|
================================================ FILE: docs/en/connectors/changelog/connector-http-wechat.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2][Http] Add option rules && Improve Myhours sink connector (#3351)|https://github.com/apache/seatunnel/commit/cc8bb60c83|2.3.0| |[Bug][Connector-V2] Fix wechat sink data serialization (#2856)|https://github.com/apache/seatunnel/commit/3aee11fc16|2.3.0-beta| | [Feature][Connector-V2] Add Enterprise Wechat sink connector (#2412)|https://github.com/apache/seatunnel/commit/3e200e0a38|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-http.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Fix][Connectors-v2] Fix UT for connector-http (#9821)|https://github.com/apache/seatunnel/commit/2653f6798e| dev | |[Fix][connector-http] fix parsing httpjson, the number of two fields is inconsistent with the import failure (#9103)|https://github.com/apache/seatunnel/commit/c8ade098ee|2.3.12| |[Fix][Connector-HTTP] Add default content-type when user not set (#9497)|https://github.com/apache/seatunnel/commit/8da0a78c1d|2.3.12| |[Bug][connector-http] Fix paging request running infinitely (#9504)|https://github.com/apache/seatunnel/commit/1844e04c97|2.3.12| |[Bug] [seatunnel-connector-http-base] An NPE (NullPointerException) will occur when the pageField is null (#9498)|https://github.com/apache/seatunnel/commit/b898a3225c|2.3.12| |[Fix][Connector-Http] fix Invalid mime type (#9363)|https://github.com/apache/seatunnel/commit/4d7d765a26|2.3.12| |[Feature][http-Sink] Implementing http batch writes (#9292)|https://github.com/apache/seatunnel/commit/04ee8aca04|2.3.11| |[Feature][connector-http] Parameters support placeholder replacement (#9184)|https://github.com/apache/seatunnel/commit/8617014edc|2.3.11| |[Improve][Connector-V2][Http] Supports Cursor-based Pagination (#9109) (#9138)|https://github.com/apache/seatunnel/commit/879b1e2d5b|2.3.11| |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Fix][connector-http] fix when post have param (#8434)|https://github.com/apache/seatunnel/commit/c1b2675ab0|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Connector-V2] Add prometheus source and sink (#7265)|https://github.com/apache/seatunnel/commit/dde6f9fcbd|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes 
(#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connector-V2] Fix http source can not read streaming (#7703)|https://github.com/apache/seatunnel/commit/a0ffa7ba02|2.3.8| |[Feature][Connector-V2] Suport choose the start page in http paging (#7180)|https://github.com/apache/seatunnel/commit/ed15f0dcf9|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Improve][API] Make sure the table name in TablePath not be null (#7252)|https://github.com/apache/seatunnel/commit/764d8b0bc8|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Improve][CDC] Close idle subtasks gorup(reader/writer) in increment phase (#6526)|https://github.com/apache/seatunnel/commit/454c339b9c|2.3.6| |Fix HttpSource bug (#6824)|https://github.com/apache/seatunnel/commit/c3ab84caa4|2.3.6| |[Hotfix] fix http source can not read yyyy-MM-dd HH:mm:ss format bug & Improve DateTime Utils (#6601)|https://github.com/apache/seatunnel/commit/19888e7969|2.3.5| |[Improve][Connector-V2]Support multi-table sink feature for httpsink (#6316)|https://github.com/apache/seatunnel/commit/e6c51a95c7|2.3.5| |[Improve][HttpConnector]Increase custom configuration timeout. 
(#6223)|https://github.com/apache/seatunnel/commit/fa5b7d3d83|2.3.4| |[Feature][Core] Upgrade flink source translation (#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[BUG][Connector-V2][Http] fix bug http config no schema option and improve e2e test add case (#5939)|https://github.com/apache/seatunnel/commit/8a71b9e072|2.3.4| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on redis (#5901)|https://github.com/apache/seatunnel/commit/e84dcb8c10|2.3.4| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Feature][Transform] add JsonPath transform (#5632)|https://github.com/apache/seatunnel/commit/d908f0af40|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Feature][Connector-V2] HTTP supports page increase #5477 (#5561)|https://github.com/apache/seatunnel/commit/bb180b2988|2.3.4| |[improve][Connector-V2][http] improve http e2e test (#5655)|https://github.com/apache/seatunnel/commit/f5867adcaa|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[BUG][Connector-V2][http] fix httpheader cover (#5446)|https://github.com/apache/seatunnel/commit/cdd8e0a65e|2.3.4| |[Feature][Connector][Http] Support multi-line text splits (#4698)|https://github.com/apache/seatunnel/commit/6a524981cb|2.3.2| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Feature][Connector-V2][Github] Adding Github Source Connector 
(#4155)|https://github.com/apache/seatunnel/commit/49d9172b10|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][Persistiq]Add Persistiq source connector (#3460)|https://github.com/apache/seatunnel/commit/aec3912edf|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][Connector-V2][Notion] Add Notion source connector (#3470)|https://github.com/apache/seatunnel/commit/46abc6d943|2.3.0| |[Hotfix] [seatunnel-connectors-v2] [connector-http] fix http json request error (#3629)|https://github.com/apache/seatunnel/commit/54f594d6ca|2.3.0| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Improve][Connector-V2][OneSignal]Unified exception for OneSignal connector (#3609)|https://github.com/apache/seatunnel/commit/97cce8c255|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Improve][Connector-V2][Http]Unified exception for http source & sink… (#3594)|https://github.com/apache/seatunnel/commit/d798cd8670|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][MyHours]Unified exception for MyHours connector (#3538)|https://github.com/apache/seatunnel/commit/48ab7c97d5|2.3.0| |[Improve][Connector-V2][Gitlab] Unified excetion for Gitlab connector and improve optione rule (#3533)|https://github.com/apache/seatunnel/commit/77f68f1eef|2.3.0| 
|[Improve][Connector-V2][Klaviyo]Unified exception for Klaviyo connector (#3555)|https://github.com/apache/seatunnel/commit/08f8615078|2.3.0| |[Feature][Connector-V2][Jira]Add Jira source connector (#3473)|https://github.com/apache/seatunnel/commit/fb40162c07|2.3.0| |[Improve][Connector-V2][Lemlist] Unified exception for lemlist connector (#3534)|https://github.com/apache/seatunnel/commit/705728ebbb|2.3.0| |[Feature][Connector V2] add gitlab source connector (#3408)|https://github.com/apache/seatunnel/commit/545595c6d2|2.3.0| |[Feature][Connector-V2][OneSignal]Add OneSignal source conector (#3454)|https://github.com/apache/seatunnel/commit/b318b3166f|2.3.0| |[Feature][Connector-V2][Klaviyo]Add Klaviyo source connector (#3443)|https://github.com/apache/seatunnel/commit/fc00a2866b|2.3.0| |[Feature][Connector-V2][Lemlist]Add Lemlist source connector (#3346)|https://github.com/apache/seatunnel/commit/12d66b4247|2.3.0| |[HotFix][Core][API] Fix OptionValidation error code (#3439)|https://github.com/apache/seatunnel/commit/ace219f376|2.3.0| |[Improve][Connector-V2][My Hours]Add http method enum && Improve My Hours connector option rule (#3390)|https://github.com/apache/seatunnel/commit/a86c9d90f7|2.3.0| |[Feature][Connector-V2][Http] Add option rules && Improve Myhours sink connector (#3351)|https://github.com/apache/seatunnel/commit/cc8bb60c83|2.3.0| |[Feature][Connector-V2][My Hours] Add My Hours Source Connector (#3228)|https://github.com/apache/seatunnel/commit/4104a3e30e|2.3.0| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Bug][format][json] Fix jackson package conflict with spark (#2934)|https://github.com/apache/seatunnel/commit/1a92b8369b|2.3.0-beta| |[Bug][Connector-V2] Fix wechat sink data serialization (#2856)|https://github.com/apache/seatunnel/commit/3aee11fc16|2.3.0-beta| |[Improve][Connector-V2] Improve http connector (#2833)|https://github.com/apache/seatunnel/commit/5b3957bc52|2.2.0-beta| 
|[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Improve][build] Improved scope of maven-shade-plugin (#2665)|https://github.com/apache/seatunnel/commit/93bc8bd116|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Bug][Connector-V2] Fix the bug that set params by mistake (#2511) (#2513)|https://github.com/apache/seatunnel/commit/ead3d68b0e|2.2.0-beta| |[Improve][Connector-V2] Http source support user-defined schema (#2439)|https://github.com/apache/seatunnel/commit/793933b6b8|2.2.0-beta| | [Feature][Connector-V2] Add Enterprise Wechat sink connector (#2412)|https://github.com/apache/seatunnel/commit/3e200e0a38|2.2.0-beta| |[Improve][Connector-V2] Format SeaTunnelRow use seatunnel-format-json (#2435)|https://github.com/apache/seatunnel/commit/e4e8f7fbff|2.2.0-beta| |[Improve][Connector-V2] Make the attribute of http-connector from private to protected (#2418)|https://github.com/apache/seatunnel/commit/f3b00ef696|2.2.0-beta| |[Feature][Connector-V2] Add feishu sink (#2381)|https://github.com/apache/seatunnel/commit/0fec8ca438|2.2.0-beta| |[Feature][Connector-V2] Add http sink(Webhook) (#2348)|https://github.com/apache/seatunnel/commit/4b7207490a|2.2.0-beta| |[Improve][Http Connector-V2-Source] Refactor the code and make code more clearly (#2322)|https://github.com/apache/seatunnel/commit/a9a797ad85|2.2.0-beta| |[Improve][Connector-V2] Fix the log information (#2317)|https://github.com/apache/seatunnel/commit/736983a708|2.2.0-beta| |[Improve][Connector-V2] Http client provider improve (#2312)|https://github.com/apache/seatunnel/commit/cc950007c8|2.2.0-beta| |[Improve][Connector-V2] Fix 'Singleton' word error 
(#2309)|https://github.com/apache/seatunnel/commit/12ebcb4a0d|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-hudi.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Fix][Core]fix kotlin jar conflict (#9683)|https://github.com/apache/seatunnel/commit/c4ec5c0be5|2.3.12| |[Improve][Connector-Hudi] Add pre-combine field option for hudi sink (#9496)|https://github.com/apache/seatunnel/commit/f134d7e129|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[improve] hudi options (#8952)|https://github.com/apache/seatunnel/commit/b24d0e7f86|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][CI]skip ui module, improve module dependent (#8225)|https://github.com/apache/seatunnel/commit/81de0a69cc|2.3.9| |[Feature][Connector-V2] Support write cdc changelog event into hudi sink (#7845)|https://github.com/apache/seatunnel/commit/934434cc75|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Connector-V2] Optimize hudi sink (#7662)|https://github.com/apache/seatunnel/commit/0d12520f91|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |Bump org.xerial.snappy:snappy-java (#7144)|https://github.com/apache/seatunnel/commit/aa26471fb7|2.3.6| |[Feature][Connector-V2] [Hudi]Add hudi sink connector (#4405)|https://github.com/apache/seatunnel/commit/dc271dcfb4|2.3.6| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Improve][Common] Adapt `FILE_OPERATION_FAILED` to `CommonError` 
(#5928)|https://github.com/apache/seatunnel/commit/b3dc0bbc21|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Hotfix][Zeta] Fix conflict dependency of hadoop-hdfs (#4509)|https://github.com/apache/seatunnel/commit/66923fbdbd|2.3.2| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Feature][Connector V2] expose configurable options in Hudi (#3383)|https://github.com/apache/seatunnel/commit/fd4cec3a95|2.3.0| |fix hudi connector v2 compile error. 
(#3728)|https://github.com/apache/seatunnel/commit/4fba0aa024|2.3.0| |[Improve][Connector-V2][Hudi] Unified exception for hudi source connector (#3581)|https://github.com/apache/seatunnel/commit/b2fda11ddc|2.3.0| |[bug][Connector-V2][Hudi] HashCode may be negative (#3184)|https://github.com/apache/seatunnel/commit/8beffbb603|2.3.0| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[improve][UT] Upgrade junit to 5.+ (#2305)|https://github.com/apache/seatunnel/commit/362319ff3e|2.2.0-beta| |StateT of SeaTunnelSource should extend `Serializable` (#2214)|https://github.com/apache/seatunnel/commit/8c426ef850|2.2.0-beta| |[Connector-V2] Add Hive sink connector v2 (#2158)|https://github.com/apache/seatunnel/commit/23ad4ee735|2.2.0-beta| |[Connector-V2]Add Hudi Source (#2147)|https://github.com/apache/seatunnel/commit/eaedc0a3c7|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-hugegraph.md ================================================
Change Log | Change | Commit | Version | | --- | --- |---------| |[Feature][Connector-V2] Support sink connector for Apache HugeGraph|https://github.com/apache/seatunnel/pull/10002/commits/002a653d11f48c3f76b47db23f5f2a68bc9d690c| 2.3.12 |
================================================ FILE: docs/en/connectors/changelog/connector-iceberg.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Chore] fix typos filed -> field (#9757)|https://github.com/apache/seatunnel/commit/e3e1c67d29|2.3.12| |[Improve][Core] Unify the aws-sdk-v2 version to 2.31.30 (#9698)|https://github.com/apache/seatunnel/commit/41c251cc8a|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Bug] [Connector-V2] Fix the issue of writing the ORC format Iceberg report "Illegal provider-class name" (#6754) (#9588)|https://github.com/apache/seatunnel/commit/74b193dd5a|2.3.12| |[Bug] [Connector-V2] Updates Iceberg version to 1.6.1 (#9387) (#9451)|https://github.com/apache/seatunnel/commit/7b92a6c5c1|2.3.12| |[Fix][Connector-Iceberg] Fix Time Zone Issue for Iceberg Timestamp Type (#9460)|https://github.com/apache/seatunnel/commit/60cd497610|2.3.12| |[Feature][Connector-V2] Iceberg add glue catalog support (#9247)|https://github.com/apache/seatunnel/commit/ecff2e8618|2.3.11| |[Improve] Remove useless iceberg sink config `iceberg.table.config` (#9307)|https://github.com/apache/seatunnel/commit/fbdf39ebf2|2.3.11| |[Improve][connector-iceberg] fix schema change event (#9217)|https://github.com/apache/seatunnel/commit/56669095b7|2.3.11| |[Feature][Transform] Support define sink column type (#9114)|https://github.com/apache/seatunnel/commit/ab7119e507|2.3.11| |[Feat][Connector-v2][Iceberg]support filter conditions in iceberg source (#9095)|https://github.com/apache/seatunnel/commit/0eb72780ee|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][API] Fixed not invoke the `SinkAggregatedCommitter`'s init method (#9070)|https://github.com/apache/seatunnel/commit/df0d11d632|2.3.11| |[Improve] iceberg options (#8967)|https://github.com/apache/seatunnel/commit/82a374ec87|2.3.10| |[Improve] 
restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Iceberg] Support read multi-table (#8524)|https://github.com/apache/seatunnel/commit/2bfb97e502|2.3.10| |[Improve][Iceberg] Filter catalog table primaryKey is empty (#8413)|https://github.com/apache/seatunnel/commit/857aab5e83|2.3.9| |[Improve][Connector-V2] Reduce the create times of iceberg sink writer (#8155)|https://github.com/apache/seatunnel/commit/45a7a715a2|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Iceberg] Support custom delete sql for sink savemode (#8094)|https://github.com/apache/seatunnel/commit/29ca928c36|2.3.9| |[Improve][Connector-V2] Reduce the request times of iceberg load table (#8149)|https://github.com/apache/seatunnel/commit/555f5eb404|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Improve][Iceberg] Support table comment for catalog (#7936)|https://github.com/apache/seatunnel/commit/72ab38f317|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connector-V2] Fix iceberg throw java: package sun.security.krb5 does not exist when use jdk 11 (#7734)|https://github.com/apache/seatunnel/commit/116af4febc|2.3.8| |[Hotfix][Connector-V2] Release resources when task is closed for iceberg sinkwriter (#7729)|https://github.com/apache/seatunnel/commit/ff281183bd|2.3.8| |[Fix][Connector-V2] Fixed iceberg sink can not handle uppercase fields (#7660)|https://github.com/apache/seatunnel/commit/b7be0cb4a1|2.3.8| |[Hotfix][CDC] Fix ddl duplicate execution error when config multi_table_sink_replica (#7634)|https://github.com/apache/seatunnel/commit/23ab3edbbb|2.3.8| |[Improve][Iceberg] Add savemode create table primaryKey testcase 
(#7641)|https://github.com/apache/seatunnel/commit/6b36f90f4d|2.3.8| |[Hotfix] Fix iceberg missing column comment when savemode create table (#7608)|https://github.com/apache/seatunnel/commit/b35bd94bfb|2.3.8| |[Improve][Connector-V2] Remove hard code iceberg table format version (#7500)|https://github.com/apache/seatunnel/commit/f49b263e65|2.3.8| |[Improve][API] Move catalog open to SaveModeHandler (#7439)|https://github.com/apache/seatunnel/commit/8c2c5c79a1|2.3.8| |[Feature][Connector-V2][Iceberg] Support Iceberg Kerberos (#7246)|https://github.com/apache/seatunnel/commit/e3001207c8|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Bug][Connector-Iceberg]fix create iceberg v2 table with pks (#6895)|https://github.com/apache/seatunnel/commit/40d2c1b213|2.3.6| |[Feature][Connector-V2] Iceberg-sink supports writing data to branches (#6697)|https://github.com/apache/seatunnel/commit/e3103535cc|2.3.6| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Improve] Add SaveMode log of process detail (#6375)|https://github.com/apache/seatunnel/commit/b0d70ce224|2.3.5| |[Improve][Zeta] Add classloader cache mode to fix metaspace leak (#6355)|https://github.com/apache/seatunnel/commit/9c3c2f183d|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Feature] Supports iceberg sink #6198 (#6265)|https://github.com/apache/seatunnel/commit/18d3e86194|2.3.5| |[Test][E2E] Add thread leak check for connector (#5773)|https://github.com/apache/seatunnel/commit/1f2f3fc5f0|2.3.4| |[Improve][Common] Introduce new error define rule 
(#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[BUG][Connector-V2] Iceberg source lost data with parallelism option (#5732)|https://github.com/apache/seatunnel/commit/7f3b4be075|2.3.4| |[Dependency]Bump org.apache.avro:avro in /seatunnel-connectors-v2/connector-iceberg (#5582)|https://github.com/apache/seatunnel/commit/13753a927b|2.3.4| |[Improve][Pom] Add junit4 to the root pom (#5611)|https://github.com/apache/seatunnel/commit/7b4f7db2a2|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Doc][Iceberg] Improved iceberg documentation (#5335)|https://github.com/apache/seatunnel/commit/659a68a0be|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Hotfix][Connector][Iceberg] Fix iceberg source stream mode init error (#4638)|https://github.com/apache/seatunnel/commit/64760eed4d|2.3.2| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Improve][SourceConnector] Unifie Iceberg source fields to schema (#3959)|https://github.com/apache/seatunnel/commit/20e1255fab|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Iceberg] Unified exception for iceberg source connector (#3677)|https://github.com/apache/seatunnel/commit/e24843515f|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2][Iceberg] Modify the scope of flink-shaded-hadoop-2 to provided to be compatible with hadoop3.x (#3046)|https://github.com/apache/seatunnel/commit/b38c50789f|2.3.0| |[Feature][Connector V2] expose configurable options in Iceberg (#3394)|https://github.com/apache/seatunnel/commit/bd9a313ded|2.3.0| |[Improve][Connector][Iceberg] Improve code. (#3065)|https://github.com/apache/seatunnel/commit/9f38e3da74|2.3.0-beta| |[Code-Improve][Iceberg] Use automatic resource management to replace 'try - finally' code block. (#2909)|https://github.com/apache/seatunnel/commit/b7f640724b|2.3.0-beta| |[Feature][Connector-V2] Add iceberg source connector (#2615)|https://github.com/apache/seatunnel/commit/ffc6088a79|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-influxdb.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve] influxdb options (#8966)|https://github.com/apache/seatunnel/commit/9f498b8133|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Improve some connectors prepare check error message (#7465)|https://github.com/apache/seatunnel/commit/6930a25edd|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |Support multi-table sink feature for influxdb (#6278)|https://github.com/apache/seatunnel/commit/56f13e920d|2.3.5| |[Improve][Zeta] Add classloader cache mode to fix metaspace leak (#6355)|https://github.com/apache/seatunnel/commit/9c3c2f183d|2.3.5| |[Test][E2E] Add thread leak check for connector (#5773)|https://github.com/apache/seatunnel/commit/1f2f3fc5f0|2.3.4| |[BugFix] [InfluxDBSource] Resolve invalid SQL in initColumnsIndex method caused by direct QUERY_LIMIT appendage with 'tz' function. 
(#4829)|https://github.com/apache/seatunnel/commit/deed9c62c3|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve] [Connector-V2] Remove scheduler in InfluxDB sink (#5271)|https://github.com/apache/seatunnel/commit/f459f500cb|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][SourceConnector] Unify InfluxDB source fields to schema (#3897)|https://github.com/apache/seatunnel/commit/85a984a64f|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Influxdb] Unified exception for influxdb source & sink connector (#3558)|https://github.com/apache/seatunnel/commit/4686f35d68|2.3.0| |[Feature][Connector][influx] Expose configurable options in influx db (#3392)|https://github.com/apache/seatunnel/commit/b247ff0aef|2.3.0| |[Feature][Connector-V2] influxdb sink connector (#3174)|https://github.com/apache/seatunnel/commit/630e884791|2.3.0| |[Feature][Connector-V2] Add influxDB connector source (#2697)|https://github.com/apache/seatunnel/commit/1d70ea3084|2.3.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-iotdb.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[improve] iotdb options (#8965)|https://github.com/apache/seatunnel/commit/6e073935f4|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Doc] update iotdb document (#5404)|https://github.com/apache/seatunnel/commit/856aedb3c9|2.3.4| |[Improve] [Connector-V2] Remove scheduler in IoTDB sink (#5270)|https://github.com/apache/seatunnel/commit/299637868c|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][SourceConnector] Unified schema parameter, update IoTDB sou… (#3896)|https://github.com/apache/seatunnel/commit/a0959c5fd1|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Iotdb] Unified exception for iotdb source & sink connector (#3557)|https://github.com/apache/seatunnel/commit/7353fed6d6|2.3.0| |[Feature][Connector V2] expose configurable options in IoTDB (#3387)|https://github.com/apache/seatunnel/commit/06359ea76a|2.3.0| |[Improve][Connector-V2][IotDB]Add IotDB sink parameter check (#3412)|https://github.com/apache/seatunnel/commit/91240a3dcb|2.3.0| |[Bug][Connector-v2] Fix IoTDB connector sink NPE (#3080)|https://github.com/apache/seatunnel/commit/e5edf02433|2.3.0-beta| |[Improve][Connector-V2] Improve iotdb connector (#2917)|https://github.com/apache/seatunnel/commit/3da11ce19b|2.3.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Connectors-V2]Support IoTDB Source (#2431)|https://github.com/apache/seatunnel/commit/7b78d6c922|2.2.0-beta| |[Feature][Connector-V2] Support IoTDB sink (#2407)|https://github.com/apache/seatunnel/commit/c1bbbd59d5|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-jdbc.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Fix][Connector-xugu] Fix several bugs in the xugu connector (#9820)|https://github.com/apache/seatunnel/commit/75c9adb280| dev | |[Feature][Transform-V2] Support `AT TIME ZONE` statement for sql transform (#9784)|https://github.com/apache/seatunnel/commit/ad5278c5bb| dev | |[Feature][Transform-V2] Support vector series sql function (#9765)|https://github.com/apache/seatunnel/commit/a40114cf7a|2.3.12| |[Chore] fix typos filed -> field (#9757)|https://github.com/apache/seatunnel/commit/e3e1c67d29|2.3.12| |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[Improve][Core] Update apache common to apache common lang3 (#9694)|https://github.com/apache/seatunnel/commit/6e5737c1ec|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Fix] [connector-jdbc] prevent precision loss in Float to BigDecimal conversion (#9670)|https://github.com/apache/seatunnel/commit/6e11285bf6|2.3.12| |[Fix][Connector-Jdbc] Supports reading and writing Postgres network dress types (#9618)|https://github.com/apache/seatunnel/commit/3dc79c1ddf|2.3.12| |[improve] jdbc options (#9541)|https://github.com/apache/seatunnel/commit/d041e5fb32|2.3.12| |[Fix][Connector-Jdbc]Fixed Vertica data source cannot upsert data. 
(#9607)|https://github.com/apache/seatunnel/commit/7b4d05171b|2.3.12| |[Fix][Connectors-Jdbc] Postgres supports streaming and batch reading and writing of the `interval` data type (#9590)|https://github.com/apache/seatunnel/commit/58ab917024|2.3.12| |[Feature][Connectors-v2] Optimize the size of CDC JAR Files (#9546)|https://github.com/apache/seatunnel/commit/1dd19c6823|2.3.12| |[improve][Connector-jdbc] add comments when schema not include all columns (#9559)|https://github.com/apache/seatunnel/commit/02d2b69d85|2.3.12| |[Hotfix][Connector-Jdbc] Write MySQL to support set collection data type (#9553)|https://github.com/apache/seatunnel/commit/3836c97a62|2.3.12| |[Feature][Jdbc] Support read multiple tables by regular expressions (#9380)|https://github.com/apache/seatunnel/commit/670a52a918|2.3.12| |[bugfix][Connector-V2] Fixed the load driver inaccurate situation (#9468)|https://github.com/apache/seatunnel/commit/c6639e81fe|2.3.12| |[Fix][Connector-V2] Fix OceanBase Oracle create unsupported data type (#9383)|https://github.com/apache/seatunnel/commit/f4178c72f1|2.3.12| |[improve][Connector-V2] delete jdbc param support_upsert_by_query_primary_key_exist (#9408)|https://github.com/apache/seatunnel/commit/d247fe1d8d|2.3.12| |[Feature][Connector-V2] Jdbc mysql support read tinyint(1) to byte(tinyint) (#9373)|https://github.com/apache/seatunnel/commit/7b87aa6f12|2.3.12| |[Improve] JdbcInputFormat nextRecord Exception throw TableId (#9374)|https://github.com/apache/seatunnel/commit/484aef593d|2.3.12| |[Feature][Connector-V2][JDBC] Add presto/trino dialect (#9388)|https://github.com/apache/seatunnel/commit/3cac2bd126|2.3.12| |[Feature][Connector-JDBC] Supprot read Oracle BLOB data as string instead of bytes (#9305)|https://github.com/apache/seatunnel/commit/454a88f81a|2.3.11| |[Fix][Connector-jdbc] Fix postgresql sink trying to update unique key (#9293) (#9298)|https://github.com/apache/seatunnel/commit/d0c1de8357|2.3.11| |[Fix][Connector-V2] Fix oceanbase mysql jdbc 
sink create statement error (#9267)|https://github.com/apache/seatunnel/commit/79f8125ea6|2.3.11| |[Feature][Transform] Support define sink column type (#9114)|https://github.com/apache/seatunnel/commit/ab7119e507|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][API] Fixed not invoke the `SinkAggregatedCommitter`'s init method (#9070)|https://github.com/apache/seatunnel/commit/df0d11d632|2.3.11| |[Fix][Connector-V2] Fix SqlServer create table when database with dot (#9007)|https://github.com/apache/seatunnel/commit/e09445c789|2.3.11| |[Fix][Connector-V2][OceanBase] oceanbase vector support simple vector index (#9072)|https://github.com/apache/seatunnel/commit/4140cd1d8f|2.3.11| |[Improve][Connector-V2] Optimize dialect selection in jdbc (#8820)|https://github.com/apache/seatunnel/commit/92c62c5e63|2.3.11| |[Fix][JDBC] fix jdbc default connection parameter invalid (#8185)|https://github.com/apache/seatunnel/commit/f85eb78b37|2.3.11| |[Hotfix][Jdbc] Fix mysql tinyint(1) type mapping for TypeMapper (#9012)|https://github.com/apache/seatunnel/commit/5f85d7668a|2.3.11| |[Feature][Jdbc] Add String type column split Support by charset-based splitting algorithm (#9002)|https://github.com/apache/seatunnel/commit/dbe41e74cd|2.3.11| |[Fix][Paimon] nullable and comment attribute was lost during automatic table creation (#9020)|https://github.com/apache/seatunnel/commit/eb54fdd52c|2.3.11| |[Fix][Connector-JDBC] Fix JDBC driver selection for data source connections (#8986)|https://github.com/apache/seatunnel/commit/a5aafa7301|2.3.11| |[Improve][Jdbc] Upgrade sap-hana driver from 2.14.7 to 2.23.10 (#9013)|https://github.com/apache/seatunnel/commit/9ba9f169be|2.3.11| |[Feature][Jdbc] Support sink ddl for sqlserver #8114 (#8936)|https://github.com/apache/seatunnel/commit/30aa485b38|2.3.10| |[Fix][Connector-V2] Fix parse SqlServer JDBC Url error 
(#8784)|https://github.com/apache/seatunnel/commit/373d2162d3|2.3.10| |[Improve][Jdbc] Support upsert for opengauss (#8627)|https://github.com/apache/seatunnel/commit/56110bf392|2.3.10| |[Improve][Jdbc] Remove useless utils. (#8793)|https://github.com/apache/seatunnel/commit/36a7533e85|2.3.10| |[Improve][Jdbc] Improve catalog connection cache (#8626)|https://github.com/apache/seatunnel/commit/6205065b25|2.3.10| |[Fix][Connector-V2] Fix jdbc sink statement buffer wrong time to clear (#8653)|https://github.com/apache/seatunnel/commit/cf35eecdfc|2.3.10| |[Feature][Jdbc] Support sink ddl for dameng (#8380)|https://github.com/apache/seatunnel/commit/5ff3427428|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][Jdbc] Remove oracle 'v$database' query (#8571)|https://github.com/apache/seatunnel/commit/3cf09f61ca|2.3.10| |[Fix] [Connector-V2] Postgres support for multiple primary keys (#8526)|https://github.com/apache/seatunnel/commit/04db40d973|2.3.10| |[Feature][JDBC source] pg support char types (#8420)|https://github.com/apache/seatunnel/commit/776ac94478|2.3.9| |[Feature][Jdbc] Support sink ddl for postgresql (#8276)|https://github.com/apache/seatunnel/commit/353bbd21a1|2.3.9| |[Feature][Connector-V2] Support the jdbc connector for highgo db (#8282)|https://github.com/apache/seatunnel/commit/aa381cbfb4|2.3.9| |[Improve][Jdbc] Support nvarchar in dm (#8270)|https://github.com/apache/seatunnel/commit/2f1c54ee2e|2.3.9| |[Improve][Connector-v2] Use regex to match filedName placeholders in jdbc sink (#8222)|https://github.com/apache/seatunnel/commit/c02d4fed36|2.3.9| |[Improve][Connector-V2] Support read comment when jdbc dialect without catalog (#8196)|https://github.com/apache/seatunnel/commit/567cd54de5|2.3.9| |[Improve][Connector-V2] The interface supports jdbc respects the target database field type (#8031)|https://github.com/apache/seatunnel/commit/1de056a9a4|2.3.9| |[Improve][dist]add 
shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][Jdbc] Improve ddl write validate (#8158)|https://github.com/apache/seatunnel/commit/9cdaacddd9|2.3.9| |[Feature][Jdbc] Add Jdbc default dialect for all jdbc series database without dialect (#8132)|https://github.com/apache/seatunnel/commit/399eabcd3f|2.3.9| |[Improve][Jdbc] Refactor ddl change (#8134)|https://github.com/apache/seatunnel/commit/e1f0a238f7|2.3.9| |[Feature][Core] Rename `result_table_name`/`source_table_name` to `plugin_input/plugin_output` (#8072)|https://github.com/apache/seatunnel/commit/c7bbd322db|2.3.9| |[Improve][Connector-V2] Improve schema evolution on column insert after for mysql-jdbc (#8017)|https://github.com/apache/seatunnel/commit/3fb05da365|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Feature][transform] transform support explode (#7928)|https://github.com/apache/seatunnel/commit/132278c06a|2.3.9| |[Feature][Connector-v2] Support schema evolution for Oracle connector (#7908)|https://github.com/apache/seatunnel/commit/79406bcc2f|2.3.9| |[Improve][Connector-V2] Improve jdbc merge table from path and query when type is decimal (#7917)|https://github.com/apache/seatunnel/commit/8baa012ced|2.3.9| |[Fix][Connector-V2] Fix hana type loss of precision (#7912)|https://github.com/apache/seatunnel/commit/18dcca36cd|2.3.9| |[Feature][Connector-V2] Jdbc DB2 support upsert SQL (#7879)|https://github.com/apache/seatunnel/commit/139919334d|2.3.9| |[Improve][Jdbc] Optimize index name conflicts when create table for postgresql (#7875)|https://github.com/apache/seatunnel/commit/312ee866fb|2.3.9| |[Improve][Jdbc] Support postgresql inet type. 
(#7820)|https://github.com/apache/seatunnel/commit/25b68b3623|2.3.9| |[Fix][Connector-V2]Oceanbase vector database is added as the source server (#7832)|https://github.com/apache/seatunnel/commit/258f931765|2.3.9| |[Feature][connector-v2]Support opengauss jdbc connnector using opengauss driver. (#7622)|https://github.com/apache/seatunnel/commit/bbf643772e|2.3.9| |[Improve][Jdbc] Support save mode for the sink of jdbc-dm (#7814)|https://github.com/apache/seatunnel/commit/b87d732c81|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Connector-V2] SqlServer support user-defined type (#7706)|https://github.com/apache/seatunnel/commit/fb89033273|2.3.8| |[Hotfix][CDC] Fix ddl duplicate execution error when config multi_table_sink_replica (#7634)|https://github.com/apache/seatunnel/commit/23ab3edbbb|2.3.8| |[Feature][Connector-Paimon] Support dynamic bucket splitting improves Paimon writing efficiency (#7335)|https://github.com/apache/seatunnel/commit/bc0326cba8|2.3.8| |[Fix][Connector-V2] Fix jdbc test case failed (#7690)|https://github.com/apache/seatunnel/commit/4f5d27f625|2.3.8| |[Improve][Jdbc] Jdbc truncate table should check table not database (#7654)|https://github.com/apache/seatunnel/commit/0c0eb7e41b|2.3.8| |[Feature][Connector-V2] jdbc saphana source tablepath support view and synonym (#7670)|https://github.com/apache/seatunnel/commit/7e0c20a488|2.3.8| |[Fix][Connector-v2] Throw Exception in sql query for JdbcCatalog in table or db exists query (#7651)|https://github.com/apache/seatunnel/commit/70ec59ce0e|2.3.8| |[Fix][JDBC] Fix starrocks jdbc dialect catalog conflict with starrocks connector (#7578)|https://github.com/apache/seatunnel/commit/020aab422e|2.3.8| |[Feature] Support tidb cdc connector source #7199 (#7477)|https://github.com/apache/seatunnel/commit/87ec786bd6|2.3.8| |[bugfix] fix oracle query table length 
(#7627)|https://github.com/apache/seatunnel/commit/2e002ce09b|2.3.8| |[Hotfix][Connector-v2] Fix the NullPointerException for jdbc oracle which used the table_list (#7544)|https://github.com/apache/seatunnel/commit/555028217a|2.3.8| |[Improve][Connector-v2] Support mysql 8.1/8.2/8.3 for jdbc (#7530)|https://github.com/apache/seatunnel/commit/657fe69b26|2.3.8| |[Improve][Connector-v2] Release resource in closeStatements even exception occurred in executeBatch (#7533)|https://github.com/apache/seatunnel/commit/590f7d110d|2.3.8| |[Fix][Connector-V2] Fix jdbc query sql can not get table path (#7484)|https://github.com/apache/seatunnel/commit/8e0ca8f725|2.3.8| |[Feature][Connector-V2] Add `decimal_type_narrowing` option in jdbc (#7461)|https://github.com/apache/seatunnel/commit/696f2948fa|2.3.8| |[Improve][Connector-V2] update vectorType (#7446)|https://github.com/apache/seatunnel/commit/1bba72385b|2.3.8| |[Improve][API] Move catalog open to SaveModeHandler (#7439)|https://github.com/apache/seatunnel/commit/8c2c5c79a1|2.3.8| |[FIX][E2E]Modify the OceanBase test case to the latest imageChange image (#7452)|https://github.com/apache/seatunnel/commit/6abb83deab|2.3.8| |[Feature][Connector-V2][OceanBase] Support vector types on OceanBase (#7375)|https://github.com/apache/seatunnel/commit/a6b188d552|2.3.8| |[Improve][Connector-V2] Remove system table limit (#7391)|https://github.com/apache/seatunnel/commit/adf888e008|2.3.8| |[Fix] Fix oracle sample data from column error (#7340)|https://github.com/apache/seatunnel/commit/2130e0d5ad|2.3.8| |[Improve][Connector-V2] Close all ResultSet after used (#7389)|https://github.com/apache/seatunnel/commit/853e973212|2.3.8| |[Hotifx][Jdbc] Fix MySQL unsupport 'ZEROFILL' column type (#7407)|https://github.com/apache/seatunnel/commit/7130382123|2.3.8| |[Improvement] add starrocks jdbc dialect (#7294)|https://github.com/apache/seatunnel/commit/b5140f598e|2.3.8| |[Hotfix][Connector] Fix jdbc compile error 
(#7359)|https://github.com/apache/seatunnel/commit/2769ed5029|2.3.7| |[Fix][Connector-V2][OceanBase] Remove OceanBase catalog's dependency on mysql driver (#7311)|https://github.com/apache/seatunnel/commit/3130ae089e|2.3.7| |[Improve][Jdbc] Skip all index when auto create table to improve performance of write (#7288)|https://github.com/apache/seatunnel/commit/dc3c23981b|2.3.7| |[Improve][Jdbc] Remove MysqlType references in JdbcDialect (#7333)|https://github.com/apache/seatunnel/commit/16eeb1c123|2.3.7| |[Improve][Jdbc] Merge user config primary key when create table (#7313)|https://github.com/apache/seatunnel/commit/819c685651|2.3.7| |[Improve][Connector-v2] Optimize the way of databases and tables are checked for existence (#7261)|https://github.com/apache/seatunnel/commit/f012b2a6f0|2.3.7| |[Feature][Jdbc] Support hive compatibleMode add inceptor dialect (#7262)|https://github.com/apache/seatunnel/commit/31e59cdf82|2.3.6| |[Improve][Connector-v2] Optimize the count table rows for jdbc-oracle and oracle-cdc (#7248)|https://github.com/apache/seatunnel/commit/0d08b20061|2.3.6| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Fix] Fix Hana type converter decimal scale is 0 convert to int error (#7167)|https://github.com/apache/seatunnel/commit/6e33a97c86|2.3.6| |[Improve][Jdbc] Support write unicode text into sqlserver (#7159)|https://github.com/apache/seatunnel/commit/e44e8b93bc|2.3.6| |[Improve][Jdbc] Remove user info in catalog-table options (#7178)|https://github.com/apache/seatunnel/commit/4e001be25c|2.3.6| |[Improve][connector-v2-jdbc-mysql] Add support for MySQL 8.4 (#7151)|https://github.com/apache/seatunnel/commit/dbdbdf015b|2.3.6| |[Feature][Connector-V2] Support jdbc hana catalog and type convertor (#6950)|https://github.com/apache/seatunnel/commit/d663398739|2.3.6| |[Improve] Change catalog table log to debug level 
(#7136)|https://github.com/apache/seatunnel/commit/b111d2f843|2.3.6| |[Improve][Connector-V2] Support schema evolution for mysql-cdc and mysql-jdbc (#6929)|https://github.com/apache/seatunnel/commit/cf91e51fc7|2.3.6| |[connector-jdbc][bugfix] fix sqlServer create table comment special string bug (#7024)|https://github.com/apache/seatunnel/commit/403564db13|2.3.6| |[bugfix] fix pgsql create table comment special string bug (#7022)|https://github.com/apache/seatunnel/commit/9fe844f62a|2.3.6| |[connector-jdbc][bugfix] fix oracle create table comment special string bug (#7012)|https://github.com/apache/seatunnel/commit/a9e0f67873|2.3.6| |[bugfix] fix mysql create table comment special string bug (#6998)|https://github.com/apache/seatunnel/commit/904e9cf785|2.3.6| |[Improve][[Jdbc]sink sql support custom field.(#6515) (#6525)|https://github.com/apache/seatunnel/commit/ef3e61dbc4|2.3.6| |[Feature][Jdbc] Support redshift catalog (#6992)|https://github.com/apache/seatunnel/commit/8d5cbcee74|2.3.6| |[Improve][Connector-V2] Clean key name in catalog table (#6942)|https://github.com/apache/seatunnel/commit/a399ef48c6|2.3.6| |[Improve][Zeta] Move SaveMode behavior to master (#6843)|https://github.com/apache/seatunnel/commit/80cf91318d|2.3.6| |[Improve][Jdbc] Quotes the identifier for table path (#6951)|https://github.com/apache/seatunnel/commit/d70ec61f35|2.3.6| |[Hotfix][Jdbc] Fix oracle savemode create table (#6651)|https://github.com/apache/seatunnel/commit/4b6c13e8fc|2.3.6| |[Improve][JDBC Source] Fix Split can not be cancel (#6825)|https://github.com/apache/seatunnel/commit/ee3b7c3723|2.3.6| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Hotfix][Jdbc/CDC] Fix postgresql uuid type in jdbc read (#6684)|https://github.com/apache/seatunnel/commit/868ba4d7c7|2.3.6| |[Improve][Connector] Add some sqlserver IDENTITY type for catalog (#6822)|https://github.com/apache/seatunnel/commit/f698396555|2.3.6| 
|[Feature][Jdbc] Support the jdbc connector for InterSystems IRIS (#6797)|https://github.com/apache/seatunnel/commit/46600969bb|2.3.6| |[Fix][MySQL]: Fix MySqlTypeConverter could not be instantiated (#6781)|https://github.com/apache/seatunnel/commit/a5609d600e|2.3.6| |[Hotfix][Jdbc] Fix table/query columns order merge for jdbc catalog (#6771)|https://github.com/apache/seatunnel/commit/df1954d520|2.3.6| |[Fix] Fix Oracle type converter handle negative scale in number type (#6758)|https://github.com/apache/seatunnel/commit/6d710690c5|2.3.6| |[Improve][mysql-cdc] Support mysql 5.5 versions (#6710)|https://github.com/apache/seatunnel/commit/058f5594a3|2.3.6| |[Improve][Jdbc] Add quote identifier for sql (#6669)|https://github.com/apache/seatunnel/commit/849d748d3d|2.3.5| |[Improve][Jdbc] Increase tyepe converter when auto creating tables (#6617)|https://github.com/apache/seatunnel/commit/cc660206d8|2.3.5| |[feature][connector-v2] add xugudb connector (#6561)|https://github.com/apache/seatunnel/commit/80f392afbb|2.3.5| |[Hotfix] Fix DEFAULT TABLE problem (#6352)|https://github.com/apache/seatunnel/commit/cdb1856e84|2.3.5| |[Improve] Improve MultiTableSinkWriter prepare commit performance (#6495)|https://github.com/apache/seatunnel/commit/2086b0e8a6|2.3.5| |[Improve][JDBC] Optimized code style for getting jdbc field types (#6583)|https://github.com/apache/seatunnel/commit/ddca95f32c|2.3.5| |[Improve] Add SaveMode log of process detail (#6375)|https://github.com/apache/seatunnel/commit/b0d70ce224|2.3.5| |[Improve][Jdbc] Support custom case-sensitive config for dameng (#6510)|https://github.com/apache/seatunnel/commit/d6dcb03bf3|2.3.5| |feat: jdbc support copy in statement. 
(#6443)|https://github.com/apache/seatunnel/commit/ca4a65fc00|2.3.5| |[Improve][Jdbc] Using varchar2 datatype store string in oracle (#6392)|https://github.com/apache/seatunnel/commit/14405fa8d4|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |Fix Jdbc sink target table name error (#6269)|https://github.com/apache/seatunnel/commit/2f62235e38|2.3.4| |[Improve][JDBC] Use PreparedStatement to sample data from column (#6242)|https://github.com/apache/seatunnel/commit/bd0e66d533|2.3.4| |[Improve][JDBC-sink] Improve query Approximate Total Row Count of a Table (#5972)|https://github.com/apache/seatunnel/commit/8156036a2f|2.3.4| |[Feature][JDBC、CDC] Support Short and Byte Type in spliter (#6027)|https://github.com/apache/seatunnel/commit/6f8d0a5040|2.3.4| |[Improve] Support `int identity` type in sql server (#6186)|https://github.com/apache/seatunnel/commit/1a8da1c843|2.3.4| |[Bugfix][JDBC、CDC] Fix Spliter Error in Case of Extensive Duplicate Data (#6026)|https://github.com/apache/seatunnel/commit/635c24e8b2|2.3.4| | [Feature][Connector-V2][Postgres-cdc]Support for Postgres cdc (#5986)|https://github.com/apache/seatunnel/commit/97438b9402|2.3.4| |Add date type and float type column split support (#6160)|https://github.com/apache/seatunnel/commit/b9a62e5c3f|2.3.4| |[Improve] Extend `SupportResourceShare` to spark/flink (#5847)|https://github.com/apache/seatunnel/commit/c69da93b87|2.3.4| |[Feature] Support `uuid` in postgres jdbc (#6185)|https://github.com/apache/seatunnel/commit/f56855098b|2.3.4| |[Feature][Connector-V2][Oracle-cdc]Support for oracle cdc (#5196)|https://github.com/apache/seatunnel/commit/aaef22b31b|2.3.4| |[Feature][Connector] update pgsql catalog for save mode (#6080)|https://github.com/apache/seatunnel/commit/84ce516929|2.3.4| |[Hotfix][Jdbc] Fix dameng catalog query table sql (#6141)|https://github.com/apache/seatunnel/commit/413fa74500|2.3.4| |[improve][catalog-postgres] 
Improve get column sql compatibility (#5664)|https://github.com/apache/seatunnel/commit/23ce592ad2|2.3.4| |[Feature][Connector] update oracle catalog for save mode (#6092)|https://github.com/apache/seatunnel/commit/dfbf92769c|2.3.4| |[Feature][Connectors-V2][Jdbc] Supports Sqlserver Niche Data Types (#6122)|https://github.com/apache/seatunnel/commit/6673f6f771|2.3.4| |[Improve][Connector-V2][Jdbc] Shade hikari in jdbc connector (#6116)|https://github.com/apache/seatunnel/commit/dd698c95bf|2.3.4| |[Feature][Connector] update sqlserver catalog for save mode (#6086)|https://github.com/apache/seatunnel/commit/edcaacecb1|2.3.4| |[Feature][Connector-V2][PostgresSql] add JDBC source support string type as partition key (#6079)|https://github.com/apache/seatunnel/commit/3522eb157c|2.3.4| |[Hotfix][Jdbc] Fix jdbc setFetchSize error (#6005)|https://github.com/apache/seatunnel/commit/d41af8a6ed|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Feature] Add unsupported datatype check for all catalog (#5890)|https://github.com/apache/seatunnel/commit/b9791285a0|2.3.4| |[Hotfix][Split] Fix split key not support BigInteger type|https://github.com/apache/seatunnel/commit/5adf5d2b9a|2.3.4| |[Improve] Replace SeaTunnelRowType with TableSchema in the JdbcRowConverter|https://github.com/apache/seatunnel/commit/1cc1b1b8cd|2.3.4| |[Hotfix][Jdbc] Fix cdc updates were not filtering same primary key (#5923)|https://github.com/apache/seatunnel/commit/38d3b85814|2.3.4| |[Improve]Change System.out.println to log output. 
(#5912)|https://github.com/apache/seatunnel/commit/bbedb07a9c|2.3.4| |[Bug] Fix Hive-Jdbc use krb5 overwrite kerberosKeytabPath (#5891)|https://github.com/apache/seatunnel/commit/f0b6092c15|2.3.4| |Reduce the time cost of getCatalogTable in jdbc (#5908)|https://github.com/apache/seatunnel/commit/51a3737578|2.3.4| |[Improve] Improve Jdbc connector error message when datatype unsupported (#5864)|https://github.com/apache/seatunnel/commit/69f79af3a4|2.3.4| |[Improve] Rename `getCountSql` to `getExistDataSql` (#5838)|https://github.com/apache/seatunnel/commit/2233b3a381|2.3.4| |[Fix] Fix read from Oracle Date type value lose time (#5814)|https://github.com/apache/seatunnel/commit/2d704e36bd|2.3.4| |[Improve][JdbcSource] Optimize catalog-table metadata merge logic (#5828)|https://github.com/apache/seatunnel/commit/7d8028a60b|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Feature][Hive JDBC Source] Support Hive JDBC Source Connector (#5424)|https://github.com/apache/seatunnel/commit/a64e177d06|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][Connector] Add field name to `DataTypeConvertor` to improve error message (#5782)|https://github.com/apache/seatunnel/commit/ab60790f0d|2.3.4| |[Feature][Oracle] Support XMLTYPE data integration #5716 (#5723)|https://github.com/apache/seatunnel/commit/620f081adb|2.3.4| |[Fix] Fix Postgres create table test case failed (#5778)|https://github.com/apache/seatunnel/commit/b98b6bcee3|2.3.4| |[Improve][Jdbc] Fix database identifier (#5756)|https://github.com/apache/seatunnel/commit/dbfc8a670a|2.3.4| |[Fix] Fix PG will not create index when using auto create table #5721|https://github.com/apache/seatunnel/commit/e5fd88dbe7|2.3.4| |[Improve] Remove all useless `prepare`, `getProducedType` method 
(#5741)|https://github.com/apache/seatunnel/commit/ed94fffbb9|2.3.4| |[feature][connector-jdbc]Add Save Mode function and Connector-JDBC (MySQL) connector has been realized (#5663)|https://github.com/apache/seatunnel/commit/eff17ccbe5|2.3.4| |[Bug] [connector-jdbc] Nullable Column source have null data could be unexpected results. (#5560)|https://github.com/apache/seatunnel/commit/3f429e1f0a|2.3.4| |[Improve] Add default implement for `SeaTunnelSink::setTypeInfo` (#5682)|https://github.com/apache/seatunnel/commit/86cba87450|2.3.4| |[BUG][Connector-V2][Jdbc] support postgresql xml type (#5724)|https://github.com/apache/seatunnel/commit/5f5d4da13f|2.3.4| |[Improve][E2E][Jdbc] Enable IT case for Oceanbase Mysql mode (#5697)|https://github.com/apache/seatunnel/commit/879c2aa07c|2.3.4| |[Feature][Jdbc] Support read multiple tables (#5581)|https://github.com/apache/seatunnel/commit/33fa8ff248|2.3.4| |[Feature] Support multi-table sink (#5620)|https://github.com/apache/seatunnel/commit/81ac173189|2.3.4| |[Improve] Remove catalog tag for config file (#5645)|https://github.com/apache/seatunnel/commit/dc509aa080|2.3.4| |[Feature][Jdbc] Supporting more ways to configure connection parameters. 
(#5388)|https://github.com/apache/seatunnel/commit/d31e9478f7|2.3.4| |[Feature][Connector-V2][Jdbc] Add OceanBase catalog (#5439)|https://github.com/apache/seatunnel/commit/cd4b7ff7d2|2.3.4| |[BUGFIX][Catalog] oracle catalog create table repeat and oracle pg null point (#5517)|https://github.com/apache/seatunnel/commit/103da931f3|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve] Refactor CatalogTable and add `SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[Feature][Jdbc] Add Dameng catalog (#5451)|https://github.com/apache/seatunnel/commit/c23070919c|2.3.4| |[Feature] Add tidb datatype convertor (#5440)|https://github.com/apache/seatunnel/commit/61391bda9f|2.3.4| |[Feature][Connector-V2] jdbc connector supports Kingbase database (#4803)|https://github.com/apache/seatunnel/commit/9538567159|2.3.4| |[Feature][Catalog] Catalog add Case Conversion Definition (#5328)|https://github.com/apache/seatunnel/commit/7b5b28bdbe|2.3.4| |[Feature][Jdbc] Jdbc database support identifier (#5089)|https://github.com/apache/seatunnel/commit/38b6d6e4bb|2.3.4| |[Improve][Connector-v2][Jdbc] Refactor AbstractJdbcCatalog (#5096)|https://github.com/apache/seatunnel/commit/dde3104f76|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[bug][jdbc][oracle]Fix the Oracle number type mapping problem (#5209)|https://github.com/apache/seatunnel/commit/9d3c3de90d|2.3.3| |[BUG][Connector-V2][Jdbc] support postgresql json type (#5194)|https://github.com/apache/seatunnel/commit/7a862d14b7|2.3.3| |[Improve] [Connector-V2] Remove scheduler in JDBC sink #4736 (#5168)|https://github.com/apache/seatunnel/commit/3b0a393145|2.3.3| |[CI] Split updated modules integration test for part 5 (#5208)|https://github.com/apache/seatunnel/commit/18f14d6087|2.3.3| |[Bug] [connector-v2] PostgreSQL versions below 9.5 are compatible use cdc sync problem (#5120)|https://github.com/apache/seatunnel/commit/9af696a1dd|2.3.3| |[Improve][Connector-v2][Jdbc] check url not null throw friendly message (#5097)|https://github.com/apache/seatunnel/commit/b0815f2a95|2.3.3| |[Feature][Catalog] Add JDBC Catalog auto create table (#4917)|https://github.com/apache/seatunnel/commit/63eb137671|2.3.3| |[Feature][CDC] Support tables without primary keys (with unique keys) (#163) (#5150)|https://github.com/apache/seatunnel/commit/32b7f2b690|2.3.3| |[Hotfix][Connector][Jdbc] Fix the problem of JdbcOutputFormat database connection leak (#4802)|https://github.com/apache/seatunnel/commit/4cc10e83e7|2.3.3| |[Feature][JDBC Sink] Add DM upsert support (#5073)|https://github.com/apache/seatunnel/commit/5e8d982e25|2.3.3| |[Improve] Improve savemode api (#4767)|https://github.com/apache/seatunnel/commit/4acd370d48|2.3.3| |[Feature][Connector-V2] JDBC source support string type as partition key (#4947)|https://github.com/apache/seatunnel/commit/d1d2677658|2.3.3| |[Feature][Connector-V2][Jdbc] Add oceanbase dialect factory (#4989)|https://github.com/apache/seatunnel/commit/7ba11cecdf|2.3.3| |Fix XA Transaction bug 
(#5020)|https://github.com/apache/seatunnel/commit/852fe104bc|2.3.3| |[Improve][CDC]Remove driver for cdc connector (#4952)|https://github.com/apache/seatunnel/commit/b65f40c3c9|2.3.3| |[Improve] Documentation and partial word optimization. (#4936)|https://github.com/apache/seatunnel/commit/6e8de0e2a6|2.3.3| |[Improve][Connector-V2][Jdbc-Source] Support for Decimal types as splict keys (#4634)|https://github.com/apache/seatunnel/commit/d56bb1ba1c|2.3.3| |[Bugfix][zeta] Fix the deadlock issue with JDBC driver loading (#4878)|https://github.com/apache/seatunnel/commit/c30a2a1b1c|2.3.2| |[Hotfix][Jdbc] Fix XA DataSource crash(Oracle/Dameng/SqlServer) (#4866)|https://github.com/apache/seatunnel/commit/bde19b6377|2.3.2| |[Feature][Connector-v2] Add Snowflake Source&Sink connector (#4470)|https://github.com/apache/seatunnel/commit/06c59a25f3|2.3.2| |[Hotfix][Connector-V2][Jdbc] Fix the error of extracting primary key column in sink (#4815)|https://github.com/apache/seatunnel/commit/0eff3aeed0|2.3.2| |[Hotfix][Connector][Jdbc] Fix reconnect throw close statement exception (#4801)|https://github.com/apache/seatunnel/commit/ea3bc1a673|2.3.2| |[Hotfix][Connector][Jdbc] Fix sqlserver system table case sensitivity (#4806)|https://github.com/apache/seatunnel/commit/2ca7426d22|2.3.2| |[Hotfix][Jdbc][Oracle] Fix oracle sql table identifier (#4754)|https://github.com/apache/seatunnel/commit/84cb51ff83|2.3.2| |[Improve][Jdbc] Populate primary key when jdbc sink is created using CatalogTable (#4755)|https://github.com/apache/seatunnel/commit/4af3bf9015|2.3.2| |[Feature][PostgreSQL-jdbc] Supports GEOMETRY data type for PostgreSQL… (#4673)|https://github.com/apache/seatunnel/commit/a5af4d9b6e|2.3.2| |[Improve][Core] Add check of sink and source config to avoid null pointer exception. 
(#4734)|https://github.com/apache/seatunnel/commit/8f66ce96cb|2.3.2| |[Hotfix][JDBC-SINK] Fix TiDBCatalog without open (#4718)|https://github.com/apache/seatunnel/commit/34a7f3eaa4|2.3.2| |[Feature][E2E] Add mysql-cdc e2e testcase (#4639)|https://github.com/apache/seatunnel/commit/87001dfd16|2.3.2| |[Hotfix][JDBC Sink] Fix JDBC Sink oom bug (#4690)|https://github.com/apache/seatunnel/commit/08b6f992aa|2.3.2| |Improve the option rule for jdbc sink (#4694)|https://github.com/apache/seatunnel/commit/a6b3704414|2.3.2| |[feature][catalog] Support for multiplexing connections (#4550)|https://github.com/apache/seatunnel/commit/41277d7f78|2.3.2| |[Bugfix][Jdbc-Mysql Mysql-CDC] Fix MySQL BIT type incorrectly converted to Boolean type (#4671)|https://github.com/apache/seatunnel/commit/89b0099ff4|2.3.2| |[Hotfix][Jdbc[SqlServer] Fix sqlserver jdbc url parse (#4697)|https://github.com/apache/seatunnel/commit/b24c3226ec|2.3.2| |Revert "[Improve][Catalog] refactor catalog (#4540)" (#4628)|https://github.com/apache/seatunnel/commit/2d1933195d|2.3.2| |[Feature][Connector][Jdbc] Add DataTypeConvertor for JDBC-Postgres (#4575)|https://github.com/apache/seatunnel/commit/91f5125976|2.3.2| |[Improve][Catalog] refactor catalog (#4540)|https://github.com/apache/seatunnel/commit/b0a701cb83|2.3.2| |[Bug] [JDBC Source] fix split exception when source table is empty (#4570)|https://github.com/apache/seatunnel/commit/c73b9331ce|2.3.2| |[Feature][Connector][Jdbc] Add vertica connector. 
(#4303)|https://github.com/apache/seatunnel/commit/e6b4f98721|2.3.2| |[Hotfix][Catalog] Filter out unavailable constrain keys (#4557)|https://github.com/apache/seatunnel/commit/5e5859546a|2.3.2| |[Hotfix][Connector-V2][Jdbc] Simple sql has the highest priority (#4548)|https://github.com/apache/seatunnel/commit/74d4d24858|2.3.2| |[Improve][Connector-V2][Jdbc] Jdbc source supports factory SPI (#4264)|https://github.com/apache/seatunnel/commit/a97f33797d|2.3.2| |[Jdbc][Chore] improve the exception message when primary key not found in row (#4474)|https://github.com/apache/seatunnel/commit/06fa850da9|2.3.2| |[hotfix][JDBC] Fix the table name is not automatically obtained when multiple tables (#4514)|https://github.com/apache/seatunnel/commit/c84d6f8d11|2.3.2| |[Chore][Jdbc] add the log for sql and update some style (#4475)|https://github.com/apache/seatunnel/commit/a9e6503045|2.3.2| |[Hotfix][Connector-V2][Jdbc] Set default value to false of JdbcOption: generate_sink_sql (#4471)|https://github.com/apache/seatunnel/commit/7da11c2f44|2.3.2| |[feature][jdbc][TiDB] add TiDB catalog (#4438)|https://github.com/apache/seatunnel/commit/9a32db6fc0|2.3.2| |[Hotfix][Connector] Fix sqlserver catalog (#4441)|https://github.com/apache/seatunnel/commit/8540c7f9f3|2.3.2| |[Feature][CDC][SqlServer] Support multi-table read (#4377)|https://github.com/apache/seatunnel/commit/c4e3f2dc03|2.3.2| |[Improve][JdbcSink]Fix connection failure caused by connection timeout. (#4322)|https://github.com/apache/seatunnel/commit/e1f6d3b3fd|2.3.2| |[Hotfix][Connector-V2][Jdbc] Field aliases are not supported in the query of jdbc source. 
(#4158) (#4210)|https://github.com/apache/seatunnel/commit/3d7ff831f9|2.3.1| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |Change redshift type to lowercase (#4248)|https://github.com/apache/seatunnel/commit/10447ae103|2.3.1| |Add redshift datatype convertor (#4245)|https://github.com/apache/seatunnel/commit/b19011517f|2.3.1| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[Improve] Support MySqlCatalog Use JDBC URL With Custom Suffix|https://github.com/apache/seatunnel/commit/210d0ff1f8|2.3.1| |[hotfix] fixed jdbc IT error|https://github.com/apache/seatunnel/commit/dd20af0a9e|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][jdbc] use ReadonlyConfig instead of Config (#4236)|https://github.com/apache/seatunnel/commit/c90c58e243|2.3.1| |[Improve][Jdbc-sink] add database field to sink config (#4199)|https://github.com/apache/seatunnel/commit/ec368902f4|2.3.1| |[improve][jdbc] Reduce jdbc options configuration (#4218)|https://github.com/apache/seatunnel/commit/ddd8f808b5|2.3.1| |Fix mysql get default value (#4204)|https://github.com/apache/seatunnel/commit/6848434f2d|2.3.1| |[hotfix][zeta] fix zeta multi-table parser error (#4193)|https://github.com/apache/seatunnel/commit/98f2ad0c19|2.3.1| |[Improve] Remove AUTO_COMMIT To Optional In JDBC OptionRule (#4194)|https://github.com/apache/seatunnel/commit/9d088017a3|2.3.1| |[Improve] [Connector-V2] [StarRocks] Starrocks Support Auto Create Table (#4177)|https://github.com/apache/seatunnel/commit/7e0008e6fb|2.3.1| |[improve][catalog][jdbc] Add MySQL catalog factory (#4168)|https://github.com/apache/seatunnel/commit/95e3cbf875|2.3.1| |[Improve][build] Give the maven module a human readable name 
(#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |Add convertor factory (#4119)|https://github.com/apache/seatunnel/commit/cbdea45d95|2.3.1| |Add ElasticSearch catalog (#4108)|https://github.com/apache/seatunnel/commit/9ee4d8394c|2.3.1| |Add Kafka catalog (#4106)|https://github.com/apache/seatunnel/commit/34f1f21e48|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |Add DataTypeConvertor in Catalog (#4094)|https://github.com/apache/seatunnel/commit/840c3e5eb4|2.3.1| |[Feature] [Catalog] Support create/drop table, create/drop database in catalog (#4075)|https://github.com/apache/seatunnel/commit/d8a0be84ca|2.3.1| | [Bug][Connector-V2][Jdbc] Fixed no exception throwing problem (#3957)|https://github.com/apache/seatunnel/commit/6ab266e594|2.3.1| |[Bug][CDC] Fix jdbc sink generate update sql (#3940)|https://github.com/apache/seatunnel/commit/233465d4e4|2.3.1| |[Improve][JDBC] improve jdbc sink option (#3864)|https://github.com/apache/seatunnel/commit/768a9300e8|2.3.1| |Fix Source Class Support Parallelism judge & Add UT for it (#3878)|https://github.com/apache/seatunnel/commit/ce85a8c68b|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][Connector-V2] Jdbc connector support SAP HANA. 
(#3017)|https://github.com/apache/seatunnel/commit/fe0180fab2|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Improve][JDBC Connector]improve option rule (#3802)|https://github.com/apache/seatunnel/commit/139256741a|2.3.1| |[Hotfix][Jdbc Sink] fix xa transaction commit failure on pipeline restore (#3809)|https://github.com/apache/seatunnel/commit/39dae4cfd9|2.3.1| |[Improve][Connector-V2][JDBC] Add exactly-once for JDBC source connector (#3750)|https://github.com/apache/seatunnel/commit/5328e9d847|2.3.1| |[Improve][Connector-v2] Remove unused options for jdbc source factory (#3794)|https://github.com/apache/seatunnel/commit/861004d309|2.3.1| |[Feature][Connector-jdbc] Fix JDBC Connector Throw Exception Error. (#3796)|https://github.com/apache/seatunnel/commit/38646b11b8|2.3.1| |[hotfix][ST-Engine] fix jdbc connector exactly-once null pointer (#3730)|https://github.com/apache/seatunnel/commit/0c5986fbec|2.3.0| |[Improve][connector-jdbc] Add config item enable upsert by query (#3708)|https://github.com/apache/seatunnel/commit/e1f951f782|2.3.0| |[Hotfix][connector-v2] fix SemanticXidGenerator#generateXid indexOutOfBounds #3701 (#3705)|https://github.com/apache/seatunnel/commit/f351ceaf4b|2.3.0| |[Hotfix][Connector-V2][jdbc] fix jdbc connection reset bug (#3670)|https://github.com/apache/seatunnel/commit/6fe0e6aece|2.3.0| |[Improve][Connector-V2][JDBC] Unified exception for JDBC source & sink (#3598)|https://github.com/apache/seatunnel/commit/865ca2bba9|2.3.0| |[Connector][JDBC]Support Redshift sink and source (#3615)|https://github.com/apache/seatunnel/commit/8d9d8638d2|2.3.0| |[Improve][Connectors-V2][jdbc] Adapts to multiple versions of Flink #3589|https://github.com/apache/seatunnel/commit/e77fdbbef7|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| 
|[Feature][Connector-V2][Doris]Add Doris Source & Sink connector (#3586)|https://github.com/apache/seatunnel/commit/3d46b79614|2.3.0| |[Feature][Connector-V2][Teradata] Add Teradata Source And Sink Connector|https://github.com/apache/seatunnel/commit/3a095d30fd|2.3.0| |[Feature][Connector-V2][JDBC] support sqlite Source & Sink (#3089)|https://github.com/apache/seatunnel/commit/a73bb3e714|2.3.0| |Bump postgresql in /seatunnel-connectors-v2/connector-jdbc (#3559)|https://github.com/apache/seatunnel/commit/c8dfdf3e46|2.3.0| |[feature][connector][cdc] add SeaTunnelRowDebeziumDeserializeSchema (#3499)|https://github.com/apache/seatunnel/commit/ff44db116e|2.3.0| |[JDBC] [ORACLE] Improve Oracle Type to SeaTunnel Type Mapping (#3486)|https://github.com/apache/seatunnel/commit/8fe0dda6e2|2.3.0| |[JDBC] [Config] Add JDBC Fetch Size Config And Custom Postgres PrepareStatement (#3478)|https://github.com/apache/seatunnel/commit/d60a705f5d|2.3.0| |[feature][connector][jdbc] expose configurable options in JDBC (#3410)|https://github.com/apache/seatunnel/commit/72b8a73cab|2.3.0| |[feature][connector][jdbc] Support write cdc changelog event in jdbc sink (#3444)|https://github.com/apache/seatunnel/commit/b12a908f01|2.3.0| |[Improve][Connector-v2][Jdbc] Add AutoCommit to jdbcConfig (#3453)|https://github.com/apache/seatunnel/commit/cfb1e97853|2.3.0| |[Improve][Connector-v2] Unset AutoCommit default to true (#3451)|https://github.com/apache/seatunnel/commit/439f686d92|2.3.0| |[Feature][connector-v2] add tablestore source and sink (#3309)|https://github.com/apache/seatunnel/commit/ebebf0b633|2.3.0| |Close jdbc connection after use. (#3358)|https://github.com/apache/seatunnel/commit/219fea517c|2.3.0| |[Improve] [Engine] Improve Engine performance. 
(#3216)|https://github.com/apache/seatunnel/commit/7393c47327|2.3.0| |[Bug][Connector-V2][JDBC]fix jdbc split bug (#3220)|https://github.com/apache/seatunnel/commit/40d67ab902|2.3.0| |[Feature][Connector-V2][JDBC] Support DB2 Source & Sink (#2410)|https://github.com/apache/seatunnel/commit/bf1ef69e84|2.3.0| |update org.postgresql:postgresql 42.3.3 to 42.4.1 (#3097)|https://github.com/apache/seatunnel/commit/2852516490|2.3.0| |[Feature][Connector-V2][Jdbc] support gbase 8a (#3026)|https://github.com/apache/seatunnel/commit/dc6e85d06f|2.3.0-beta| |[Bug] [sqlserver] timestamp convert exception (#3024)|https://github.com/apache/seatunnel/commit/99ac1a655e|2.3.0-beta| |[Feature][Connector-V2] oracle connector (#2550)|https://github.com/apache/seatunnel/commit/384ece1913|2.3.0-beta| |[Improve][Connector-v2][jdbc] Support for specify number of partitions when parallel reading (#2950)|https://github.com/apache/seatunnel/commit/fc284ac32e|2.3.0-beta| |[Feature][Connector-V2] add sqlserver connector (#2646)|https://github.com/apache/seatunnel/commit/05d105dea3|2.3.0-beta| |[Improve][e2e] Unified e2e IT for DaMengDB (#2946)|https://github.com/apache/seatunnel/commit/15636bdea1|2.3.0-beta| |[Improve][e2e] modify DM-driver by downLoad and add the value comparison of all columns (#2772)|https://github.com/apache/seatunnel/commit/f3ff39bdfe|2.3.0-beta| |[Improve][e2e] Improve jdbc driver management (#2770)|https://github.com/apache/seatunnel/commit/f907927a35|2.3.0-beta| |[hotfix][connector][jdbc] fix JDBC split exception (#2904)|https://github.com/apache/seatunnel/commit/57342c6545|2.3.0-beta| |[Improve][connector-jdbc] Calculate splits only once in JdbcSourceSplitEnumerator (#2900)|https://github.com/apache/seatunnel/commit/7622f28999|2.3.0-beta| |[Feature] [Connector-V2 E2E] Add mysql and postgres e2e test and bug fix (#2838)|https://github.com/apache/seatunnel/commit/db434adc15|2.2.0-beta| |fix XAConnection being wrongly submitted 
(#2805)|https://github.com/apache/seatunnel/commit/d9a6039fd3|2.2.0-beta| |fix spark execute exception is not thrown (#2791)|https://github.com/apache/seatunnel/commit/b1711c984e|2.2.0-beta| |[Improve][e2e] Add driver-jar to lib (#2719)|https://github.com/apache/seatunnel/commit/d64d452c86|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Connector-V2][JDBC-connector] support Jdbc dm (#2377)|https://github.com/apache/seatunnel/commit/7278209ca2|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Bug] [connector-jdbc-v2] Fix transaction force commit when autoCommit is enabled (#2636)|https://github.com/apache/seatunnel/commit/8cd8cf7aa2|2.2.0-beta| | [Feature][Connector-V2] Add phoenix connector sink (#2499)|https://github.com/apache/seatunnel/commit/05ccf9d68c|2.2.0-beta| |[Connector-V2][JDBC] Support database: greenplum (#2429)|https://github.com/apache/seatunnel/commit/3561d3878f|2.2.0-beta| |Add jdbc connector e2e test (#2321)|https://github.com/apache/seatunnel/commit/5fbcb811c6|2.2.0-beta| |StateT of SeaTunnelSource should extend `Serializable` (#2214)|https://github.com/apache/seatunnel/commit/8c426ef850|2.2.0-beta| |update the condition to 1 = 0 about get table operation (#2186)|https://github.com/apache/seatunnel/commit/7c56d7143b|2.2.0-beta| |[SeaTunnel API] [Sink] remove useless context field (#2124)|https://github.com/apache/seatunnel/commit/a31fdeedcc|2.2.0-beta| |[bugfix] Check isOpen before closing (#2107)|https://github.com/apache/seatunnel/commit/7ec0ada2b9|2.2.0-beta| |[API-DRAFT] [MERGE] fix merge error|https://github.com/apache/seatunnel/commit/3c0e984648|2.2.0-beta| |merge dev to api-draft|https://github.com/apache/seatunnel/commit/d265597c64|2.2.0-beta| |[api-draft][Optimize] Optimize module name 
(#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-kafka.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Fix][Connector-V2] Optimize start mode of kafka recovery job (#9736)|https://github.com/apache/seatunnel/commit/bbde7f6339|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Fix][Connector-V2] Add Filter for Partitions to Prevent Blocking in KafkaConsumer StreamMode (#9598)|https://github.com/apache/seatunnel/commit/bd24fa77cb|2.3.12| |[Fix][Connecotr-kafka] Fix kafka IllegalArgumentException when offset is -1 (#9376)|https://github.com/apache/seatunnel/commit/142aca7b70|2.3.12| |[Feature][Connectors-V2] Add end_timestamp for timstamp start mode (#9318)|https://github.com/apache/seatunnel/commit/68b0504da9|2.3.11| |[Bugifx][kafka] Fix kafka enumerator assign split NPE (#9220)|https://github.com/apache/seatunnel/commit/7ca0c0c7e4|2.3.11| | [Fix][Connector-V2] Fix kafka database name (#9201)|https://github.com/apache/seatunnel/commit/79d9a937ee|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][Connector-V2] assign size for KafkaSource reader cache queue (#9041)|https://github.com/apache/seatunnel/commit/8a9db476bd|2.3.11| |[Feature][Kafka] Support native format read/write kafka record (#8724)|https://github.com/apache/seatunnel/commit/86e2d6fcfa|2.3.10| |[improve] update kafka source default schema from content<ROW<content STRING>> to content<STRING> (#8642)|https://github.com/apache/seatunnel/commit/db6e2994d4|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[improve] kafka connector options (#8616)|https://github.com/apache/seatunnel/commit/aadfe99f88|2.3.10| |[Fix] [Kafka Source] kafka source use topic as table name instead of fullName 
(#8401)|https://github.com/apache/seatunnel/commit/3d4f4bb33a|2.3.10| |[Feature][Kafka] Add `debezium_record_table_filter` and fix error (#8391)|https://github.com/apache/seatunnel/commit/b27a30a5aa|2.3.9| |[Bug][Kafka] kafka reads repeatedly (#8465)|https://github.com/apache/seatunnel/commit/f67f27279a|2.3.9| |[Hotfix][Connector-V2][kafka] fix kafka sink config exactly-once exception (#7857)|https://github.com/apache/seatunnel/commit/92b3253a5b|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][Kafka] Support custom topic for debezium compatible format (#8145)|https://github.com/apache/seatunnel/commit/deefe8762a|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Fix][Kafka] Fix in kafka streaming mode can not read incremental data (#7871)|https://github.com/apache/seatunnel/commit/a0eeeb9b62|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Fix][Connector-V2] Fix kafka `format_error_handle_way` not work (#7838)|https://github.com/apache/seatunnel/commit/63c7b4e9cc|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][kafka] Add arg poll.timeout for interval poll messages (#7606)|https://github.com/apache/seatunnel/commit/09d12fc40e|2.3.8| |[Improve][Kafka] kafka source refactored some reader read logic (#6408)|https://github.com/apache/seatunnel/commit/10598b6aec|2.3.8| |[Feature][connector-v2]Add Kafka Protobuf Data Parsing Support (#7361)|https://github.com/apache/seatunnel/commit/51c8e1a834|2.3.8| |[Hotfix][Connector] Fix kafka consumer log next startup offset (#7312)|https://github.com/apache/seatunnel/commit/891652399e|2.3.7| |[Fix][Connector kafka]Fix Kafka consumer stop fetching after TM node restarted 
(#7233)|https://github.com/apache/seatunnel/commit/7dc3fa8a13|2.3.6| |[Fix][Connector-V2] Fix kafka batch mode can not read all message (#7135)|https://github.com/apache/seatunnel/commit/1784c01a35|2.3.6| |[Feature][connector][kafka] Support read Maxwell format message from kafka #4415 (#4428)|https://github.com/apache/seatunnel/commit/4281b867ac|2.3.6| |[Hotfix][Connector-V2][kafka]Kafka consumer group automatically commits offset logic error fix (#6961)|https://github.com/apache/seatunnel/commit/181f01ee52|2.3.6| |[Improve][CDC] Bump the version of debezium to 1.9.8.Final (#6740)|https://github.com/apache/seatunnel/commit/c3ac953524|2.3.6| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Fix][Kafka-Sink] fix kafka sink factory option rule (#6657)|https://github.com/apache/seatunnel/commit/37578e103f|2.3.5| |[Feature][Connector-V2] Remove useless code for kafka connector (#6157)|https://github.com/apache/seatunnel/commit/0f286d1627|2.3.4| |[Feature] support avro format (#5084)|https://github.com/apache/seatunnel/commit/93a006156d|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Feature][formats][ogg] Support read ogg format message #4201 (#4225)|https://github.com/apache/seatunnel/commit/7728e241e8|2.3.4| |[Improve] Remove all useless `prepare`, `getProducedType` method (#5741)|https://github.com/apache/seatunnel/commit/ed94fffbb9|2.3.4| |[Improve] Add default implement for `SeaTunnelSink::setTypeInfo` (#5682)|https://github.com/apache/seatunnel/commit/86cba87450|2.3.4| |KafkaSource use Factory to create source (#5635)|https://github.com/apache/seatunnel/commit/1c6176e518|2.3.4| |[Improve] Refactor CatalogTable and add 
`SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Feature][Connector-V2] connector-kafka source support data conversion extracted by kafka connect source (#4516)|https://github.com/apache/seatunnel/commit/bd74989099|2.3.3| |[Feature][connector][kafka] Support read debezium format message from kafka (#5066)|https://github.com/apache/seatunnel/commit/53a1f0c6c1|2.3.3| |[hotfix][kafka] Fix the problem that the partition information cannot be obtained when kafka is restored (#4764)|https://github.com/apache/seatunnel/commit/c203ef5f8d|2.3.2| |Fix the processing bug of abnormal parsing method of kafkaSource format. (#4687)|https://github.com/apache/seatunnel/commit/228257b2e2|2.3.2| |[hotfix][e2e][kafka] Fix the job not stopping (#4600)|https://github.com/apache/seatunnel/commit/93471c9ade|2.3.2| |[Improve][connector][kafka] Set default value for partition option (#4524)|https://github.com/apache/seatunnel/commit/884f733c3d|2.3.2| |[chore] delete unavailable S3 & Kafka Catalogs (#4477)|https://github.com/apache/seatunnel/commit/e0aec5ecec|2.3.2| |[Feature][API] Add options check before create source and sink and transform in FactoryUtil (#4424)|https://github.com/apache/seatunnel/commit/38f1903be2|2.3.2| |[Feature][Connector-V2][Kafka] Kafka source supports data deserialization failure skipping (#4364)|https://github.com/apache/seatunnel/commit/e1ed22b153|2.3.2| |[Bug][Connector-v2][KafkaSource]Fix KafkaConsumerThread exit caused by commit offset error. (#4379)|https://github.com/apache/seatunnel/commit/71f4d0c784|2.3.2| |[Bug][Connector-v2][KafkaSink]Fix the permission problem caused by client.id. (#4246)|https://github.com/apache/seatunnel/commit/3cdb7cfa4d|2.3.2| |Fix KafkaProducer resources have never been released. 
(#4302)|https://github.com/apache/seatunnel/commit/f99f02caa2|2.3.2| |[Improve][CDC] Optimize options & add docs for compatible_debezium_json (#4351)|https://github.com/apache/seatunnel/commit/336f590498|2.3.1| |[Hotfix][Zeta] Fix TaskExecutionService Deploy Failed The Job Can't Stop (#4265)|https://github.com/apache/seatunnel/commit/cf55b070bb|2.3.1| |[Feature][CDC] Support export debezium-json format to kafka (#4339)|https://github.com/apache/seatunnel/commit/5817ec07bf|2.3.1| |[Improve]]Connector-V2\[Kafka] Set kafka consumer default group (#4271)|https://github.com/apache/seatunnel/commit/82c784a3ef|2.3.1| |[chore] Fix the words of `canal` & `kafka` (#4261)|https://github.com/apache/seatunnel/commit/077a8d27a7|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Improve] [Connector-V2] [StarRocks] Starrocks Support Auto Create Table (#4177)|https://github.com/apache/seatunnel/commit/7e0008e6fb|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Imprve][Connector-V2][Hive] Support read text table & Column projection (#4105)|https://github.com/apache/seatunnel/commit/717620f542|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |Add convertor factory (#4119)|https://github.com/apache/seatunnel/commit/cbdea45d95|2.3.1| |Add ElasticSearch catalog (#4108)|https://github.com/apache/seatunnel/commit/9ee4d8394c|2.3.1| |Add Kafka catalog (#4106)|https://github.com/apache/seatunnel/commit/34f1f21e48|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| | [Feature][Json-format][canal] Support read canal format message (#3950)|https://github.com/apache/seatunnel/commit/b80be72c85|2.3.1| |[Improve][Connector-V2][Kafka] Support extract topic from SeaTunnelRow field (#3742)|https://github.com/apache/seatunnel/commit/8aff807305|2.3.1| |[Feature][shade][Jackson] Add seatunnel-jackson module (#3947)|https://github.com/apache/seatunnel/commit/5d8862ec9c|2.3.1| |[Hotfix][Connector-V2][Kafka] Fix the bug that kafka consumer is not close. (#3836)|https://github.com/apache/seatunnel/commit/3447266427|2.3.1| |fix commit kafka offset bug. (#3933)|https://github.com/apache/seatunnel/commit/e60ad938be|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve] [Connector-V2] Change Connector Custom Config Prefix To Map (#3719)|https://github.com/apache/seatunnel/commit/ef1b8b1bb5|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Bug][KafkaSource]Fix the default value of commit_on_checkpoint. 
(#3831)|https://github.com/apache/seatunnel/commit/df969849f6|2.3.1| |[Bug][KafkaSource]Failed to parse offset format (#3810)|https://github.com/apache/seatunnel/commit/8e1196accf|2.3.1| |[Improve] [Connector-V2] Kafka client user configured clientid is preferred (#3783)|https://github.com/apache/seatunnel/commit/aacf0abc04|2.3.1| |[Improve] [Connector-V2] Fix Kafka sink can't run EXACTLY_ONCE semantics (#3724)|https://github.com/apache/seatunnel/commit/5e3f196e29|2.3.0| |[Improve] [Connector-V2] fix kafka admin client can't get property config (#3721)|https://github.com/apache/seatunnel/commit/74c3351700|2.3.0| |[Improve][Connector-V2][Kafka] Add text format for kafka sink connector (#3711)|https://github.com/apache/seatunnel/commit/74bbd76b65|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Kafka]Unified exception for Kafka source and sink connector (#3574)|https://github.com/apache/seatunnel/commit/3b573798db|2.3.0| |options in conditional need add to required or optional options (#3501)|https://github.com/apache/seatunnel/commit/51d5bcba10|2.3.0| |[Improve][Connector-V2-kafka] Support for dynamic discover topic & partition in streaming mode (#3125)|https://github.com/apache/seatunnel/commit/999cfd6069|2.3.0| |[Improve][Connector-V2][Kafka] Support to specify multiple partition keys (#3230)|https://github.com/apache/seatunnel/commit/f65f44f44c|2.3.0| |[Feature][Connector-V2][Kafka] Add Kafka option rules (#3388)|https://github.com/apache/seatunnel/commit/cc0cb8cdb8|2.3.0| |[Improve][Connector-V2][Kafka]Improve kafka metadata code format (#3397)|https://github.com/apache/seatunnel/commit/379da3097f|2.3.0| |[Improve][Connector-V2-kafka] Support setting read starting offset or time at startup config (#3157)|https://github.com/apache/seatunnel/commit/3da19d4444|2.3.0| |update (#3150)|https://github.com/apache/seatunnel/commit/2b44992750|2.3.0-beta| 
|[Feature][connectors-v2][kafka] Kafka supports custom schema #2371 (#2783)|https://github.com/apache/seatunnel/commit/6506e306eb|2.3.0-beta| |[feature][connector][kafka] Support extract partition from SeaTunnelRow fields (#3085)|https://github.com/apache/seatunnel/commit/385e1f42c0|2.3.0-beta| |[Improve][connector][kafka] sink support custom partition (#3041)|https://github.com/apache/seatunnel/commit/ebddc18c41|2.3.0-beta| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Imporve][Connector-V2]Parameter verification for connector V2 kafka sink (#2866)|https://github.com/apache/seatunnel/commit/254223fdb9|2.3.0-beta| |[Connector-V2] [Kafka] Fix Kafka Streaming problem (#2759)|https://github.com/apache/seatunnel/commit/e92e7b7283|2.2.0-beta| |[Improve][Connector-V2] Fix kafka connector (#2745)|https://github.com/apache/seatunnel/commit/90ce3851db|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |StateT of SeaTunnelSource should extend `Serializable` (#2214)|https://github.com/apache/seatunnel/commit/8c426ef850|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-kudu.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Chore] fix typos filed -> field (#9757)|https://github.com/apache/seatunnel/commit/e3e1c67d29|2.3.12| |[Improve][Core] Update apache common to apache common lang3 (#9694)|https://github.com/apache/seatunnel/commit/6e5737c1ec|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Feature][connector-kudu] implement the filter (#9405)|https://github.com/apache/seatunnel/commit/2714dd1105|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve] kudu options (#9162)|https://github.com/apache/seatunnel/commit/e7edafdbac|2.3.11| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][Transform] Rename sql transform table name from 'fake' to 'dual' (#8298)|https://github.com/apache/seatunnel/commit/e6169684fb|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Feature][Core] Rename `result_table_name`/`source_table_name` to `plugin_input/plugin_output` (#8072)|https://github.com/apache/seatunnel/commit/c7bbd322db|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |correct the typo of kudu kerberos config 
(#6905)|https://github.com/apache/seatunnel/commit/fcb8554972|2.3.6| |[Fix][KuduCatalogFactory]: Fix KuduCatalogFactory.optionRule() will throw an Exception (#6787)|https://github.com/apache/seatunnel/commit/45a4e1532d|2.3.6| |[Feature][Engine] Unify job env parameters (#6003)|https://github.com/apache/seatunnel/commit/2410ab38f0|2.3.4| |[Feature][Connector-V2] Support multi-table sink feature for kudu (#5951)|https://github.com/apache/seatunnel/commit/82460c0bf0|2.3.4| |[Feature] Add unsupported datatype check for all catalog (#5890)|https://github.com/apache/seatunnel/commit/b9791285a0|2.3.4| |[Feature][Kudu] Support multi-table source read (#5878)|https://github.com/apache/seatunnel/commit/8d9a0b7d11|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on kudu (#5789)|https://github.com/apache/seatunnel/commit/10e791d60a|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Feature][Kudu] Refactor Kudu functionality and Sink support CDC data. (#5437)|https://github.com/apache/seatunnel/commit/22110eb7b3|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][Connector-V2] Fix connector source snapshot state NPE (#4027)|https://github.com/apache/seatunnel/commit/e39c4988cc|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve] [Connector-V2] Bad smell ToArrayCallWithZeroLengthArrayArgument: (#3577)|https://github.com/apache/seatunnel/commit/cc448d98c4|2.3.0| |[Improve][Connector-V2][Kudu] Unified exception for kudu source & sink connector (#3564)|https://github.com/apache/seatunnel/commit/273418ddc9|2.3.0| |[Connector] [Dependency] Add Miss Dependency Cassandra And Change Kudu Plugin Name (#3432)|https://github.com/apache/seatunnel/commit/6ac6a0a0cd|2.3.0| |[Feature][Connector V2] expose configurable options in Kudu (#3365)|https://github.com/apache/seatunnel/commit/c422210e2c|2.3.0| |[Feature][Core][Connector-V2] Unified The way of setting JobName (#2908)|https://github.com/apache/seatunnel/commit/bf2c97484b|2.3.0-beta| |remove duplicate ExceptionUtil class (#3037)|https://github.com/apache/seatunnel/commit/c9dc7c50c2|2.3.0-beta| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Improve][Connector-V2]Kudu Sink Connector Support to upsert row|https://github.com/apache/seatunnel/commit/1ece805ab1|2.3.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Connector-V2] Add Kudu source and sink 
connector (#2254)|https://github.com/apache/seatunnel/commit/0483cbc2df|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-lance.md ================================================
Change Log

| Change | Commit | Version |
|--------|--------|---------|
================================================ FILE: docs/en/connectors/changelog/connector-maxcompute.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Bug][Connector-V2] NoSuchMethodError caused by Netty version conflict on Spark 3.3.0 (#9632)|https://github.com/apache/seatunnel/commit/4d2b55ce3c|2.3.12| |[Improve][Connector-V2] Replace deprecated createDownloadSession by buildDownloadSession (#9555)|https://github.com/apache/seatunnel/commit/6862945eef|2.3.12| |[Improve][Connector-V2] Add tunnel_endpoint option to MaxCompute source for emulator test (#9548)|https://github.com/apache/seatunnel/commit/b3f3c527ca|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer upsert/delete action with upsert session mode (#9462)|https://github.com/apache/seatunnel/commit/eb9c8704b9|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[Feature][Transform] Support define sink column type (#9114)|https://github.com/apache/seatunnel/commit/ab7119e507|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve] maxcompute options (#9163)|https://github.com/apache/seatunnel/commit/fdacbae1af|2.3.11| |[Fix][Connector-V2] Fix maxcompute write with multi parallelism (#9089)|https://github.com/apache/seatunnel/commit/9426b7ba2c|2.3.11| |[Fix][Connector-V2] Fix maxcompute sink write date less than actual date (#8999)|https://github.com/apache/seatunnel/commit/fc942a599b|2.3.11| |[Fix][Connector-V2] Fix maxcompute read with partition spec (#8896)|https://github.com/apache/seatunnel/commit/e62bf6c65c|2.3.10| |[Fix][Connector-V2] Fix MaxCompute cannot get project and tableName when use schema (#8865)|https://github.com/apache/seatunnel/commit/a24fa8fef6|2.3.10| |[Improve] 
restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Connector-V2] Support maxcompute source with multi-table (#8582)|https://github.com/apache/seatunnel/commit/0f78242923|2.3.10| |[Fix][Connector-V2] Fixed adding table comments (#8514)|https://github.com/apache/seatunnel/commit/edca75b0d6|2.3.10| |[Improve][Connector-V2] MaxComputeSink support create partition in savemode (#8474)|https://github.com/apache/seatunnel/commit/0b8f9de465|2.3.10| |[Improve][Transform] Rename sql transform table name from 'fake' to 'dual' (#8298)|https://github.com/apache/seatunnel/commit/e6169684fb|2.3.9| |[Feature][Connector-V2] Support MaxCompute save mode (#8277)|https://github.com/apache/seatunnel/commit/44ea675f1e|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Core] Rename `result_table_name`/`source_table_name` to `plugin_input/plugin_output` (#8072)|https://github.com/apache/seatunnel/commit/c7bbd322db|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix] Fix dead link on seatunnel connectors list url (#7453)|https://github.com/apache/seatunnel/commit/62b4f16f4e|2.3.8| |[BugFix][Connector-V2][Maxcompute]fix:Maxcompute sink can't map field(#7164) (#7168)|https://github.com/apache/seatunnel/commit/d5abf8f506|2.3.6| |[Feature] Add unsupported datatype check for all catalog (#5890)|https://github.com/apache/seatunnel/commit/b9791285a0|2.3.4| |FakeSource support generate different CatalogTable for MultipleTable (#5766)|https://github.com/apache/seatunnel/commit/a8b93805ea|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated 
(#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][Connector] Add field name to `DataTypeConvertor` to improve error message (#5782)|https://github.com/apache/seatunnel/commit/ab60790f0d|2.3.4| |[Improve][Test] Move MaxCompute test case file (#5786)|https://github.com/apache/seatunnel/commit/38132f5158|2.3.4| |[Fix] Fix MaxCompute use not exist SCHEMA option (#5708)|https://github.com/apache/seatunnel/commit/ba4782a67d|2.3.4| |[Feature] Support catalog in MaxCompute Source (#5283)|https://github.com/apache/seatunnel/commit/946d89cb95|2.3.4| |[Bugfix][Connector-V2][maxcompute] sink commit with Block not exsits on server (#4725)|https://github.com/apache/seatunnel/commit/2760cae73c|2.3.2| |[Bug] [Maxcompute] Fix failed to parse some maxcompute type (#3894)|https://github.com/apache/seatunnel/commit/642901f0a2|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Feature][Connector-V2][Maxcompute] Add Maxcompute source & sink connector (#3640)|https://github.com/apache/seatunnel/commit/80cf8f4e42|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-milvus.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Transform-V2] Support vector series sql function (#9765)|https://github.com/apache/seatunnel/commit/a40114cf7a|2.3.12| |[Improve][Connector-milvus]update milvus-sdk-java to 2.5.11 (#9710)|https://github.com/apache/seatunnel/commit/08ebbaa8bd|2.3.12| |[Chore] fix typos filed -> field (#9757)|https://github.com/apache/seatunnel/commit/e3e1c67d29|2.3.12| |[Improve][Connector-V2] Optimize Milvus doc and e2e test case (#9766)|https://github.com/apache/seatunnel/commit/e67466f73e|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Improve][API] Add metadata schema into catalog table (#9586)|https://github.com/apache/seatunnel/commit/385814e7f1|2.3.12| |[Feature][Transform] Support define sink column type (#9114)|https://github.com/apache/seatunnel/commit/ab7119e507|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[improve] milvus options (#9165)|https://github.com/apache/seatunnel/commit/5247e17640|2.3.11| |[Fix][Connector-V2] Fix load state check in MilvusSourceReader to consider partition-level status (#8937)|https://github.com/apache/seatunnel/commit/bde235090b|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][Core] Refactor common options of column/row (#7911)|https://github.com/apache/seatunnel/commit/d1582afee6|2.3.9| |[Feature] [connector-milvus] update milvus connector to support dynamic schema, failed retry, etc. 
(#7885)|https://github.com/apache/seatunnel/commit/6a31f91729|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connector-V2] Fix known directory create and delete ignore issues (#7700)|https://github.com/apache/seatunnel/commit/e2fb679577|2.3.8| |[Improve][Connector-V2] Optimize milvus code (#7691)|https://github.com/apache/seatunnel/commit/1eddb8e1b1|2.3.8| |[Improve] [Connector-V2] Optimize milvus-connector config code (#7658)|https://github.com/apache/seatunnel/commit/f831f7a5ec|2.3.8| |[Improve][Connector-V2] update vectorType (#7446)|https://github.com/apache/seatunnel/commit/1bba72385b|2.3.8| |[Improve][API] Move catalog open to SaveModeHandler (#7439)|https://github.com/apache/seatunnel/commit/8c2c5c79a1|2.3.8| |[Feature][Connector-V2] Fake Source support produce vector data (#7401)|https://github.com/apache/seatunnel/commit/6937d10ac3|2.3.8| |[Feature][Connector-V2][Milvus] Support Milvus source & sink (#7158)|https://github.com/apache/seatunnel/commit/0c69b9166e|2.3.6|
================================================ FILE: docs/en/connectors/changelog/connector-mongodb.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[fix][connector-mango] fix split with avgSize zero error (#9255)|https://github.com/apache/seatunnel/commit/564863b933|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][MongoDB] The Long type cannot handle string values in scientific notation (#8783)|https://github.com/apache/seatunnel/commit/00f550e3d0|2.3.11| |[Improve] sink mongodb schema is not required (#8887)|https://github.com/apache/seatunnel/commit/3cfe8c12b9|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Fix][Connector-Mongodb] close MongodbClient when close MongodbReader (#8592)|https://github.com/apache/seatunnel/commit/06b2fc0e06|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Bug][connectors-v2] fix mongodb bson convert exception (#8044)|https://github.com/apache/seatunnel/commit/b222c13f2f|2.3.9| |[Hotfix][Connector-v2] Fix the ClassCastException for connector-mongodb (#7586)|https://github.com/apache/seatunnel/commit/dc43370e8c|2.3.8| |[Improve][Test][Connector-V2][MongoDB] Add few test cases for BsonToRowDataConverters (#7579)|https://github.com/apache/seatunnel/commit/a797041e5d|2.3.8| |[Improve][Connector-V2][MongoDB] A BsonInt32 will be convert to a long type (#7567)|https://github.com/apache/seatunnel/commit/adf26c20c5|2.3.8| |[Improve][Connector-V2][MongoDB] Support to convert to double from any numeric type (#6997)|https://github.com/apache/seatunnel/commit/c5159a2760|2.3.6| |[bugfix][connector-mongodb] fix mongodb null value write 
(#6967)|https://github.com/apache/seatunnel/commit/c5ecda50f8|2.3.6| |[Improve][MongoDB] Implement TableSourceFactory to create mongodb source (#5813)|https://github.com/apache/seatunnel/commit/59cccb6097|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[bugfix][mongodb] Fixed unsupported exception caused by bsonNull (#5659)|https://github.com/apache/seatunnel/commit/cab864aa4d|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Improve][Connector-v2][Mongodb]sink support transaction update/writing (#5034)|https://github.com/apache/seatunnel/commit/b1203c905e|2.3.3| |[Hotfix][Connector-V2][Mongodb] Compatible with historical parameters (#4997)|https://github.com/apache/seatunnel/commit/31db35bee7|2.3.3| |[Improve][Connector-v2][Mongodb]Optimize reading logic (#5001)|https://github.com/apache/seatunnel/commit/830196d8b7|2.3.3| |[Hotfix][Connector-V2][Mongodb] Fix document error content and remove redundant code (#4982)|https://github.com/apache/seatunnel/commit/526197af67|2.3.3| |[Feature][connector-v2][mongodb] mongodb support cdc sink (#4833)|https://github.com/apache/seatunnel/commit/cb651cd7f3|2.3.3| |[Feature][Connector-v2][Mongodb]Refactor mongodb connector (#4620)|https://github.com/apache/seatunnel/commit/5b1a843e40|2.3.2| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse 
(#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Improve] mongodb connector v2 add source query capability (#3697)|https://github.com/apache/seatunnel/commit/8a7fe6fcb6|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][MongoDB] Unified exception for MongoDB source & sink connector (#3522)|https://github.com/apache/seatunnel/commit/5af632e32b|2.3.0| |[Feature][Connector V2] expose configurable options in MongoDB (#3347)|https://github.com/apache/seatunnel/commit/ffd5778efc|2.3.0| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Improve][Connector-V2] Improve mongodb connector (#2778)|https://github.com/apache/seatunnel/commit/efbf793fa5|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Feature][Connector-V2] Add mongodb connector sink (#2694)|https://github.com/apache/seatunnel/commit/51c28a3387|2.2.0-beta| |[Feature][Connector-V2] Add mongodb connector source (#2596)|https://github.com/apache/seatunnel/commit/3ee8a8a619|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-neo4j.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] neo4j options (#9164)|https://github.com/apache/seatunnel/commit/1eb81e7f88|2.3.11| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Feature][Core] Upgrade flink source translation (#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve] Documentation and partial word optimization. (#4936)|https://github.com/apache/seatunnel/commit/6e8de0e2a6|2.3.3| |[Improve][connector-V2-Neo4j]Supports neo4j sink batch write and update docs (#4841)|https://github.com/apache/seatunnel/commit/580276a8bd|2.3.3| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Neo4j] Unified exception for Neo4j source & sink connector (#3565)|https://github.com/apache/seatunnel/commit/58584eefb1|2.3.0| |[Feature][Connector][Neo4j] expose configurable options in Neo4j (#3342)|https://github.com/apache/seatunnel/commit/efa04b38fe|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Feature][Connector-v2] Neo4j source connector (#2777)|https://github.com/apache/seatunnel/commit/38b0daf8b7|2.3.0| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Feature][Connector-v2] Neo4j sink connector (#2434)|https://github.com/apache/seatunnel/commit/950b27d132|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-openmldb.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] openmldb options (#9166)|https://github.com/apache/seatunnel/commit/d324fc59a4|2.3.11| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2] Add openmldb source connector (#3313)|https://github.com/apache/seatunnel/commit/e68ecf7bef|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-paimon.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Connectors-v2] Clean up temporary files for paimon sink (#9819)|https://github.com/apache/seatunnel/commit/c43d57de31| dev | |[Feature][Connector-v2] Support multi paimon source (#9759)|https://github.com/apache/seatunnel/commit/0d52102241|2.3.12| |[Chore] fix typos filed -> field (#9757)|https://github.com/apache/seatunnel/commit/e3e1c67d29|2.3.12| |[Feature][connector-paimon] Paimon connector supports paimon privilege (#9722)|https://github.com/apache/seatunnel/commit/b2bb2f8d78|2.3.12| |[Improve][Core] Update apache common to apache common lang3 (#9694)|https://github.com/apache/seatunnel/commit/6e5737c1ec|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[fix][connectors-v2] repeated commit cause task exceptions (#9665)|https://github.com/apache/seatunnel/commit/085023ad0d|2.3.12| |[Improve][Connector-V2] Support like predicate pushdown in paimon (#9653)|https://github.com/apache/seatunnel/commit/9e01c84e76|2.3.12| |[Feature][Connectors-v2]Paimon version upgrade to 1.1.1 (#8074)|https://github.com/apache/seatunnel/commit/96b26a68dc|2.3.12| |[Fix][Connectors-v2] fix dynamic bucket for paimon sink (#9595)|https://github.com/apache/seatunnel/commit/d29a531a48|2.3.12| |[Feature][Connector-V2] Support like predicate pushdown in paimon (#9484)|https://github.com/apache/seatunnel/commit/a19720ccf6|2.3.12| |[Fix][Connector-V2] Update waitCompaction value for batch mode and writeonly (#9479)|https://github.com/apache/seatunnel/commit/63993a6197|2.3.12| |[Future][Connector-V2]Support the automatic creation of non-primary key table (#9219)|https://github.com/apache/seatunnel/commit/93e539cc9f|2.3.12| |[Fix][Connector-V2] Optimize Paimon DECIMAL type check to prevent precision loss (#9480)|https://github.com/apache/seatunnel/commit/c114682a6b|2.3.12| |[Bug][Connector-V2] fix 
NPE when decimal type precision is incompatible for Paimon (#9452)|https://github.com/apache/seatunnel/commit/37762c93f0|2.3.12| |[feature][connectors-v2] Support in predicate pushdown in paimon (#9379)|https://github.com/apache/seatunnel/commit/1ec43755d5|2.3.12| |[Improve][Connector-V2] Fix the word misspellings for paimon connector (#9332)|https://github.com/apache/seatunnel/commit/ba7f5c9e30|2.3.11| |[Feature][Transform] Support define sink column type (#9114)|https://github.com/apache/seatunnel/commit/ab7119e507|2.3.11| |[improve] paimon options (#9167)|https://github.com/apache/seatunnel/commit/b0889305c2|2.3.11| |[Fix][Paimon] nullable and comment attribute was lost during automatic table creation (#9020)|https://github.com/apache/seatunnel/commit/eb54fdd52c|2.3.11| |[Feature][Connector-V2] Support between predicate pushdown in paimon (#8962)|https://github.com/apache/seatunnel/commit/3b141cf621|2.3.10| |[Feature][Connector-V2] Suppor Time type in paimon connector (#8880)|https://github.com/apache/seatunnel/commit/9f1e590091|2.3.10| |[Feature][Paimon] Customize the hadoop user (#8888)|https://github.com/apache/seatunnel/commit/2657626f93|2.3.10| |[Improve][Connector-v2][Paimon]PaimonCatalog close error message update (#8640)|https://github.com/apache/seatunnel/commit/48253da8d6|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][Connector-v2] Support checkpoint in batch mode for paimon sink (#8333)|https://github.com/apache/seatunnel/commit/f22d4ebd4d|2.3.9| |[Feature][Connector-v2] Support schema evolution for paimon sink (#8211)|https://github.com/apache/seatunnel/commit/57190e2a3b|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Connector-v2] Support S3 filesystem of paimon connector (#8036)|https://github.com/apache/seatunnel/commit/e2a4772933|2.3.9| |[Feature][transform] transform support explode 
(#7928)|https://github.com/apache/seatunnel/commit/132278c06a|2.3.9| |[Feature][Connector-V2] Piamon Sink supports changelog-procuder is lookup and full-compaction mode (#7834)|https://github.com/apache/seatunnel/commit/c0f27c2f76|2.3.9| |[Fix][connector-v2]Fix Paimon table connector Error log information. (#7873)|https://github.com/apache/seatunnel/commit/a3b49e6354|2.3.9| |[Improve][Connector-v2] Use checkpointId as the commit's identifier instead of the hash for streaming write of paimon sink (#7835)|https://github.com/apache/seatunnel/commit/c7a384af2b|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connecotr-V2] Fix paimon dynamic bucket tale in primary key is not first (#7728)|https://github.com/apache/seatunnel/commit/dc7f695537|2.3.8| |[Improve][Connector-v2] Remove useless code and add changelog doc for paimon sink (#7748)|https://github.com/apache/seatunnel/commit/846d876dc2|2.3.8| |[Hotfix][Connector-V2] Release resources even the task is crashed for paimon sink (#7726)|https://github.com/apache/seatunnel/commit/5ddf8d461e|2.3.8| |[Fix][Connector-V2] Fix paimon e2e error (#7721)|https://github.com/apache/seatunnel/commit/61d1964361|2.3.8| |[Feature][Connector-Paimon] Support dynamic bucket splitting improves Paimon writing efficiency (#7335)|https://github.com/apache/seatunnel/commit/bc0326cba8|2.3.8| |[Feature][Connector-v2] Support streaming read for paimon (#7681)|https://github.com/apache/seatunnel/commit/4a2e27291c|2.3.8| |[Hotfix][Seatunnel-common] Fix the CommonError msg for paimon sink (#7591)|https://github.com/apache/seatunnel/commit/d1f5db9257|2.3.8| |[Feature][CONNECTORS-V2-Paimon] Paimon Sink supported truncate table (#7560)|https://github.com/apache/seatunnel/commit/4f3df22124|2.3.8| |[Improve][Connector-v2] Improve the exception msg in case-sensitive case for paimon sink 
(#7549)|https://github.com/apache/seatunnel/commit/7d31e5668c|2.3.8| |[Hotfix][Connector-V2] Fixed lost data precision for decimal data types (#7527)|https://github.com/apache/seatunnel/commit/df210ea73d|2.3.8| |[Improve][API] Move catalog open to SaveModeHandler (#7439)|https://github.com/apache/seatunnel/commit/8c2c5c79a1|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |The isNullable attribute is true when the primary key field in the Paimon table converts the Column object. #7231 (#7242)|https://github.com/apache/seatunnel/commit/b0fe432e99|2.3.6| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Paimon]support projection for paimon source (#6343)|https://github.com/apache/seatunnel/commit/6c1577267f|2.3.6| |[Improve][Paimon] Add check for the base type between source and sink before write. 
(#6953)|https://github.com/apache/seatunnel/commit/d56d64fc04|2.3.6| |[Improve][Connector-V2] Improve the paimon source (#6887)|https://github.com/apache/seatunnel/commit/658643ae53|2.3.6| |[Hotfix][Connector-V2] Close the tableWrite when task is close (#6897)|https://github.com/apache/seatunnel/commit/23a744b9b2|2.3.6| |[Fix][Connector-V2] Field information lost during Paimon DataType and SeaTunnel Column conversion (#6767)|https://github.com/apache/seatunnel/commit/6cf6e41da7|2.3.6| |[Improve][Connector-V2] Support hive catalog for paimon sink (#6833)|https://github.com/apache/seatunnel/commit/4969c91dc4|2.3.6| |[Hotfix][Connector-V2] Fix the batch write with paimon (#6865)|https://github.com/apache/seatunnel/commit/9ec971d942|2.3.6| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Improve][Connector-V2] Support hadoop ha and kerberos for paimon sink (#6585)|https://github.com/apache/seatunnel/commit/20b62f3bf3|2.3.5| |[Feature][Paimon] Support specify paimon table write properties, partition keys and primary keys (#6535)|https://github.com/apache/seatunnel/commit/2b1234c7ae|2.3.5| |[Feature][Connector-V2] Support multi-table sink feature for paimon #5652 (#6449)|https://github.com/apache/seatunnel/commit/b0abbd2d89|2.3.5| |[Feature][Connectors-v2-Paimon] Adaptation Paimon 0.6 Version (#6061)|https://github.com/apache/seatunnel/commit/b32df930e9|2.3.4| |[Fix] [Connectors-v2-Paimon] Flink table store failed to prepare commit (#6057)|https://github.com/apache/seatunnel/commit/c8dcefc3be|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Hotfix][Connector-V2][Paimon] Bump paimon-bundle version to 0.4.0-incubating 
(#5219)|https://github.com/apache/seatunnel/commit/2917542bfa|2.3.3| |[Improve] Documentation and partial word optimization. (#4936)|https://github.com/apache/seatunnel/commit/6e8de0e2a6|2.3.3| |[Connector-V2][Paimon] Introduce paimon connector (#4178)|https://github.com/apache/seatunnel/commit/da507bbe0e|2.3.2|
================================================ FILE: docs/en/connectors/changelog/connector-prometheus.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Fix][Connector-V2] Fix prometheus check time can not parse double value (#9311)|https://github.com/apache/seatunnel/commit/fbf78721ab|2.3.12| |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Fix][connector-http] fix when post have param (#8434)|https://github.com/apache/seatunnel/commit/c1b2675ab0|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Fix][Connector-V2] Fix cdc use default value when value is null (#7950)|https://github.com/apache/seatunnel/commit/3b432125ae|2.3.9| |[Feature][Connector-V2] Add prometheus source and sink (#7265)|https://github.com/apache/seatunnel/commit/dde6f9fcbd|2.3.9|
================================================ FILE: docs/en/connectors/changelog/connector-pulsar.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[improve] pulsar options (#9180)|https://github.com/apache/seatunnel/commit/26a2160c80|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][API] Make sure the table name in TablePath not be null (#7252)|https://github.com/apache/seatunnel/commit/764d8b0bc8|2.3.7| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[PulsarSource]Improve pulsar throughput performance. (#6234)|https://github.com/apache/seatunnel/commit/37461f4f3e|2.3.4| |[Feature][Connector-v2][PulsarSink]Add Pulsar Sink Connector. (#4382)|https://github.com/apache/seatunnel/commit/543d2c5086|2.3.4| |[Chore] Remove useless DeserializationFormatFactory and its implement (#5880)|https://github.com/apache/seatunnel/commit/f0511544ff|2.3.4| |fix: update IDENTIFIER = Pulsar for pulsar-datasource on project:seatunnel-web (#5852)|https://github.com/apache/seatunnel/commit/3b6de3743e|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Feature][Json-format] support read format for pulsar (#4111)|https://github.com/apache/seatunnel/commit/7d61ae93e7|2.3.2| |[hotfix][pulsar] Fix the bug that can't consume messages all the time. (#4125)|https://github.com/apache/seatunnel/commit/a6705cc5bf|2.3.2| |[Feature] add cdc multiple table support & fix zeta bug|https://github.com/apache/seatunnel/commit/533ff2c2fa|2.3.1| |[hotfix][pulsar] PulsarSource consumer ack exception. (#4237)|https://github.com/apache/seatunnel/commit/9725d675da|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Improve][Connector-v2][Pulsar] Set the name of the pulsar consumption thread. (#4182)|https://github.com/apache/seatunnel/commit/e567203f7d|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Bug][Connector-v2][PulsarSource]Fix pulsar option topic-pattern bug. 
(#3989)|https://github.com/apache/seatunnel/commit/aee2c580ea|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Improve][Connector-V2][Pulsar] Unified exception for Pulsar source &… (#3590)|https://github.com/apache/seatunnel/commit/4fe9323419|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Hotfix][Connector-V2][Pulsar] fix conditional options (#3504)|https://github.com/apache/seatunnel/commit/0066affacf|2.3.0| |[Feature][Connector][pulsar] expose configurable options in Pulsar (#3341)|https://github.com/apache/seatunnel/commit/200faa7c29|2.3.0| |[Connector] [Dependency] Add Miss Dependency Cassandra And Change Kudu Plugin Name (#3432)|https://github.com/apache/seatunnel/commit/6ac6a0a0cd|2.3.0| |[chore] fix pulsar consumer comment error (#3356)|https://github.com/apache/seatunnel/commit/91e632c526|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[hotfix][connector][pulsar] Fix not being able to mark #noMoreNewSplits when restoring (#2945)|https://github.com/apache/seatunnel/commit/5ad69076b3|2.3.0-beta| |Move Handover to common module (#2877)|https://github.com/apache/seatunnel/commit/d94a874bcb|2.3.0-beta| |[hotfix][connector-v2] fix pulsar source exceptions (#2820)|https://github.com/apache/seatunnel/commit/8ff0ba7015|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[SeaTunnel]Simply seatunnel package pipeline. 
(#2563)|https://github.com/apache/seatunnel/commit/9d88b6221a|2.2.0-beta| |[Improve][Connector-V2] Pulsar support user-defined schema (#2436)|https://github.com/apache/seatunnel/commit/16cabe6a35|2.2.0-beta| |[improve][UT] Upgrade junit to 5.+ (#2305)|https://github.com/apache/seatunnel/commit/362319ff3e|2.2.0-beta| |StateT of SeaTunnelSource should extend `Serializable` (#2214)|https://github.com/apache/seatunnel/commit/8c426ef850|2.2.0-beta| |[doc][connector-v2] pulsar source options doc (#2128)|https://github.com/apache/seatunnel/commit/59ce8a2b32|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-qdrant.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Transform-V2] Support vector series sql function (#9765)|https://github.com/apache/seatunnel/commit/a40114cf7a|2.3.12| |[improve] qdrant options (#9235)|https://github.com/apache/seatunnel/commit/f3a45cd131|2.3.11| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Connector-V2] Support Qdrant sink and source connector (#7299)|https://github.com/apache/seatunnel/commit/c8590716ae|2.3.8|
================================================ FILE: docs/en/connectors/changelog/connector-rabbitmq.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Fix][connector-rabbitmq] Set default value for durable, exclusive and auto-delete (#9631)|https://github.com/apache/seatunnel/commit/5f9492e62a|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve] rabbit mq options (#8740)|https://github.com/apache/seatunnel/commit/4eec9be012|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Rabbitmq] Allow configuration of queue durability and deletion policy (#7365)|https://github.com/apache/seatunnel/commit/aabfc8eb78|2.3.8| |[Hotfix][connector-v2-rabbit] fix rabbit checkpoint exception in Flink mode (#7108)|https://github.com/apache/seatunnel/commit/423a7b142b|2.3.6| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Bugfix][connector-v2][rabbitmq] Fix reduplicate ack msg bug and code style (#4842)|https://github.com/apache/seatunnel/commit/985fb6642a|2.3.2| |[Hotfix][E2E] Fix RabbitmqIT (#4593)|https://github.com/apache/seatunnel/commit/9bd5403d71|2.3.2| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless 
plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve] [Connector-V2] Change Connector Custom Config Prefix To Map (#3719)|https://github.com/apache/seatunnel/commit/ef1b8b1bb5|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| | [Feature][Connector-V2][RabbitMQ] Add RabbitMQ source & sink connector (#3312)|https://github.com/apache/seatunnel/commit/4b12691a8d|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-redis.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][Connector-V2] Use key_field_name option when reading Redis hash data (#9642)|https://github.com/apache/seatunnel/commit/5d214a7305|2.3.12| |[Feature][Redis] Add redis key into the result record (#9574)|https://github.com/apache/seatunnel/commit/6e8b7c5da5|2.3.12| |[Fix][Connector-Redis] Redis did not write successfully, but the task did not fail (#9055)|https://github.com/apache/seatunnel/commit/07510ed937|2.3.11| |[hotfix][redis] fix npe cause by null host parameter (#8881)|https://github.com/apache/seatunnel/commit/7bd5865165|2.3.10| |[Improve][Redis] Optimized Redis connection params (#8841)|https://github.com/apache/seatunnel/commit/e56f06cdf0|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[improve] update Redis connector config option (#8631)|https://github.com/apache/seatunnel/commit/f1c313eea6|2.3.10| |[Feature][Redis] Flush data when the time reaches checkpoint.interval and update test case (#8308)|https://github.com/apache/seatunnel/commit/e15757bcd7|2.3.9| |Revert "[Feature][Redis] Flush data when the time reaches checkpoint interval" and "[Feature][CDC] Add 'schema-changes.enabled' options" (#8278)|https://github.com/apache/seatunnel/commit/fcb2938286|2.3.9| |[Feature][Redis] Flush data when the time reaches checkpoint.interval (#8198)|https://github.com/apache/seatunnel/commit/2e24941e6a|2.3.9| |[Hotfix] Fix redis sink NPE (#8171)|https://github.com/apache/seatunnel/commit/6b9074e769|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature] [Connector-Redis] Redis connector support delete data (#7994)|https://github.com/apache/seatunnel/commit/02a35c3979|2.3.9| |[Improve][Connector-V2] Redis support custom key and value (#7888)|https://github.com/apache/seatunnel/commit/ef2c3c7283|2.3.9| |[Feature][Restapi] Allow metrics information to be 
associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[improve][Redis]Redis scan command supports versions 5, 6, 7 (#7666)|https://github.com/apache/seatunnel/commit/6e70cbe334|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Improve][Redis] Redis reader use scan cammnd instead of keys, single mode reader/writer support batch (#7087)|https://github.com/apache/seatunnel/commit/be37f05c07|2.3.6| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Improve][Connector-V2]Support multi-table sink feature for redis (#6314)|https://github.com/apache/seatunnel/commit/fed89ae3fc|2.3.5| |[Feature][Core] Upgrade flink source translation (#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on redis (#5901)|https://github.com/apache/seatunnel/commit/e84dcb8c10|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][Connector-v2][Redis] Redis support select db (#5570)|https://github.com/apache/seatunnel/commit/77fbbbd0ee|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature][Connector-v2][RedisSink]Support redis to set expiration time. 
(#4975)|https://github.com/apache/seatunnel/commit/b5321ff1d2|2.3.3| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Redis] Unified exception for redis source & sink exception (#3517)|https://github.com/apache/seatunnel/commit/205f782585|2.3.0| |options in conditional need add to required or optional options (#3501)|https://github.com/apache/seatunnel/commit/51d5bcba10|2.3.0| |[feature][api] add option validation for the ReadonlyConfig (#3417)|https://github.com/apache/seatunnel/commit/4f824fea36|2.3.0| |[Feature][Redis Connector V2] Add Redis Connector Option Rules & Improve Redis Connector doc (#3320)|https://github.com/apache/seatunnel/commit/1c10aacb30|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Improve][Connector-V2][Redis] Support redis cluster connection & user authentication (#3188)|https://github.com/apache/seatunnel/commit/c7275a49cc|2.3.0| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Feature][Connector-V2] Add redis sink connector 
(#2647)|https://github.com/apache/seatunnel/commit/71a9e4b019|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Feature][Connector-V2] Add redis source connector (#2569)|https://github.com/apache/seatunnel/commit/405f7d6f99|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-rocketmq.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[improve] rocketmq options (#9251)|https://github.com/apache/seatunnel/commit/4cbe3b9172|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve][Connector-V2] RocketMQ Source add message tag config (#8825)|https://github.com/apache/seatunnel/commit/5913e8c35f|2.3.10| |[Improve][Connector-V2] Add optional flag for rocketmq connector to skip parse errors instead of failing (#8737)|https://github.com/apache/seatunnel/commit/701f17b5d4|2.3.10| |[Improve][Connector-V2] RocketMQ Sink add message tag config (#7996)|https://github.com/apache/seatunnel/commit/97a1b00e48|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connector-V2] Fix some throwable error not be caught (#7657)|https://github.com/apache/seatunnel/commit/e19d73282e|2.3.8| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Fix][connector-rocketmq] commit a correct offset to broker & reduce ThreadInterruptedException log (#6668)|https://github.com/apache/seatunnel/commit/b7480e1a89|2.3.6| |[fix][connector-rocketmq]Fix a NPE problem when checkpoint.interval is set too small(#6624) (#6625)|https://github.com/apache/seatunnel/commit/6e0c81d492|2.3.5| |[Test][E2E] Add thread leak check for connector (#5773)|https://github.com/apache/seatunnel/commit/1f2f3fc5f0|2.3.4| |[Fix] [Connector] Rocketmq source startOffset greater than endOffset error (#6287)|https://github.com/apache/seatunnel/commit/cd44b5894e|2.3.4| |[Improve][Common] Introduce new error define rule 
(#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Improve][pom] Formatting pom (#4761)|https://github.com/apache/seatunnel/commit/1d6d3815ec|2.3.2| |[Hotfix][Connector-V2][RocketMQ] Fix rocketmq spark e2e test cases (#4583)|https://github.com/apache/seatunnel/commit/e711f6ef4c|2.3.2| |[Feature][Connector-V2] Add rocketmq source and sink (#4007)|https://github.com/apache/seatunnel/commit/e333897552|2.3.2|
================================================ FILE: docs/en/connectors/changelog/connector-s3-redshift.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Feature][S3 File] Make S3 File Connector support multiple table write (#6698)|https://github.com/apache/seatunnel/commit/8f2049b2f1|2.3.6| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][Connector][File] Optimize files commit order (#5045)|https://github.com/apache/seatunnel/commit/1e18a8c530|2.3.3| |[BugFix] Fix S3Redshift connector copy file to redshift but file not found bug (#4282)|https://github.com/apache/seatunnel/commit/bcac24ebfc|2.3.1| |[Fix] [Bug] Fix S3RedShift is not correct with S3 (#4291)|https://github.com/apache/seatunnel/commit/7b72dd95a2|2.3.1| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][File] Optimize filesystem utils (#3749)|https://github.com/apache/seatunnel/commit/ac4e880fb5|2.3.0| |[Connector][Sink]Support load data to S3 then Copy to Redshift (#3736)|https://github.com/apache/seatunnel/commit/8ef080f200|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-selectdb-cloud.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] selectdb options (#9252)|https://github.com/apache/seatunnel/commit/1b44b9b440|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Add disable 2pc in SelectDB cloud sink (#6266)|https://github.com/apache/seatunnel/commit/aa0b2119a7|2.3.5| |[Feature] Support nanosecond in SelectDB DateTimeV2 type (#6332)|https://github.com/apache/seatunnel/commit/a0ef5dac93|2.3.5| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[improve][SelectDB] Add a jobId to the selectDB label to distinguish between tasks (#4864)|https://github.com/apache/seatunnel/commit/84be0f9fd0|2.3.2| |[Improve][Connector-V2][SelectDB Cloud]Refactor some SelectDB Cloud Sink code as well as support copy into batch and async flush and cdc (#4312)|https://github.com/apache/seatunnel/commit/11e94b216f|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][SelectDB Cloud] Support SelectDB Cloud Sink Connector (#3958)|https://github.com/apache/seatunnel/commit/79a134a03b|2.3.1|
================================================ FILE: docs/en/connectors/changelog/connector-sensorsdata.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Chore] fix typos filed -> field (#9757)|https://github.com/apache/seatunnel/commit/e3e1c67d29|2.3.12| |[Feature][connector-v2] Add Sensorsdata Connector Support #9323 (#9432)|https://github.com/apache/seatunnel/commit/bb53f77264|2.3.12|
================================================ FILE: docs/en/connectors/changelog/connector-sentry.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] sentry options (#9261)|https://github.com/apache/seatunnel/commit/4a2f3fa915|2.3.11| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Sentry] Unified exception for sentry sink connector (#3513)|https://github.com/apache/seatunnel/commit/94b472b806|2.3.0| |[Connector] [Dependency] Add Miss Dependency Cassandra And Change Kudu Plugin Name (#3432)|https://github.com/apache/seatunnel/commit/6ac6a0a0cd|2.3.0| |[Feature][Sentry Sink V2] Add Sentry Sink Option Rules (#3318)|https://github.com/apache/seatunnel/commit/850f483816|2.3.0| |[Feature][Connector-V2] Add sentry sink connector #2244 (#2584)|https://github.com/apache/seatunnel/commit/9fd40390a7|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-slack.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] Slack connector options (#8738)|https://github.com/apache/seatunnel/commit/eb706743fe|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2][Slack] Add Slack sink connector (#3226)|https://github.com/apache/seatunnel/commit/7a836f2d44|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-sls.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Chore] fix typos filed -> field (#9757)|https://github.com/apache/seatunnel/commit/e3e1c67d29|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[improve] sls options (#9260)|https://github.com/apache/seatunnel/commit/126164508b|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature]Check Chinese comments in the code (#8319)|https://github.com/apache/seatunnel/commit/d58fce1caf|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][Sls] Add sls sink connector、e2e、doc (#7830)|https://github.com/apache/seatunnel/commit/048c47d966|2.3.9| |[Fix][Connector-V2] Fix some throwable error not be caught (#7657)|https://github.com/apache/seatunnel/commit/e19d73282e|2.3.8| |[Feature][Connector-V2] add Aliyun SLS connector #3733 (#7348)|https://github.com/apache/seatunnel/commit/527c7c7b5f|2.3.7|
================================================ FILE: docs/en/connectors/changelog/connector-socket.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] socket options (#9517)|https://github.com/apache/seatunnel/commit/af83a302cf|2.3.12| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Socket] Unified exception for socket source & sink connector (#3511)|https://github.com/apache/seatunnel/commit/581292f210|2.3.0| |[feature][connector][socket] Add Socket Connector Option Rules (#3317)|https://github.com/apache/seatunnel/commit/b85317bcbe|2.3.0| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Feature][Connector-V2] Socket Connector Sink (#2549)|https://github.com/apache/seatunnel/commit/94f4600a4e|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
================================================ FILE: docs/en/connectors/changelog/connector-starrocks.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[Fix][Doc] Update StarRocks doc change schema necessity to true (#9656)|https://github.com/apache/seatunnel/commit/45f8ac6d1d|2.3.12| |[improve] jdbc options (#9541)|https://github.com/apache/seatunnel/commit/d041e5fb32|2.3.12| |[Fix][Connector-V2] Fix starrocks decimal column definition generation(#9470) (#9471)|https://github.com/apache/seatunnel/commit/64b8f1752e|2.3.12| |[Bugfix][Starrocks] Fix starrocks batch data exceeds the maximum limit (#9256)|https://github.com/apache/seatunnel/commit/84634a4d1f|2.3.11| |[Improve][Starrocks] Catch lable already exception (#9222)|https://github.com/apache/seatunnel/commit/b6fc222c0a|2.3.11| |[Feature][Transform] Support define sink column type (#9114)|https://github.com/apache/seatunnel/commit/ab7119e507|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][Connector-V2] Fixed missing timestamp accuracy of starrocks connector (#9096)|https://github.com/apache/seatunnel/commit/02254b9c0e|2.3.11| |[Fix][Connector-V2] Fix StarRocksCatalogTest#testCatalog() NPE (#8987)|https://github.com/apache/seatunnel/commit/53f0a9eb52|2.3.10| |[Improve][Connector-V2] Random pick the starrocks fe address which can be connected (#8898)|https://github.com/apache/seatunnel/commit/bef76078f9|2.3.10| |[Feature][Connector-v2] Support multi starrocks source (#8789)|https://github.com/apache/seatunnel/commit/26b5529aaf|2.3.10| |[Fix][Connector-V2] Fix possible data loss in scenarios of request_tablet_size is less than the number of BUCKETS (#8768)|https://github.com/apache/seatunnel/commit/3c6f216135|2.3.10| |[Fix][Connector-V2]Fix Descriptions for CUSTOM_SQL in Connector 
(#8778)|https://github.com/apache/seatunnel/commit/96b610eb7e|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[improve] add StarRocks options (#8639)|https://github.com/apache/seatunnel/commit/da8d9cbd35|2.3.10| |[Fix][Connector-V2] fix starRocks automatically creates tables with comment (#8568)|https://github.com/apache/seatunnel/commit/c4cb1fc4a3|2.3.10| |[Fix][Connector-V2] Fixed adding table comments (#8514)|https://github.com/apache/seatunnel/commit/edca75b0d6|2.3.10| |[Feature][Connector-V2] Starrocks implements multi table sink (#8467)|https://github.com/apache/seatunnel/commit/55eebfa8af|2.3.9| |[Improve][Connector-V2] Add pre-check starrocks version before exeucte alter table field name (#8237)|https://github.com/apache/seatunnel/commit/c24e3b12ba|2.3.9| |[Fix][Connector-starrocks] Fix drop column bug for starrocks (#8216)|https://github.com/apache/seatunnel/commit/082814da1f|2.3.9| |[Feature][Core] Support read arrow data (#8137)|https://github.com/apache/seatunnel/commit/4710ea0f8d|2.3.9| |[Feature][Clickhouse] Support sink savemode (#8086)|https://github.com/apache/seatunnel/commit/e6f92fd79b|2.3.9| |[Feature][Connector-V2] StarRocks-sink support schema evolution (#8082)|https://github.com/apache/seatunnel/commit/d33b0da8ab|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][Connector-V2] Add doris/starrocks create table with comment (#7847)|https://github.com/apache/seatunnel/commit/207b8c16fd|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][API] Move catalog open to SaveModeHandler (#7439)|https://github.com/apache/seatunnel/commit/8c2c5c79a1|2.3.8| |[Improve][Connector-V2] Reuse connection in StarRocksCatalog (#7342)|https://github.com/apache/seatunnel/commit/8ee129d20f|2.3.8| 
|[Improve][Connector-V2] Remove system table limit (#7391)|https://github.com/apache/seatunnel/commit/adf888e008|2.3.8| |[Improve][Connector-V2] Close all ResultSet after used (#7389)|https://github.com/apache/seatunnel/commit/853e973212|2.3.8| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Fix][Connector-V2] Fix starrocks Content-Length header already present error (#7034)|https://github.com/apache/seatunnel/commit/a485a74eff|2.3.6| |[Feature][Connector-V2]Support StarRocks Fe Node HA|https://github.com/apache/seatunnel/commit/9c36c45819|2.3.6| |[Fix][Connector-v2] Fix the sql statement error of create table for doris and starrocks (#6679)|https://github.com/apache/seatunnel/commit/88263cd69f|2.3.6| |[Fix][StarRocks] Fix NPE when upstream catalogtable table path only have table name part (#6540)|https://github.com/apache/seatunnel/commit/5795b265cc|2.3.5| |[Fix][Connector-V2] Fixed doris/starrocks create table sql parse error (#6580)|https://github.com/apache/seatunnel/commit/f2ed1fbde0|2.3.5| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Improve] Add SaveMode log of process detail (#6375)|https://github.com/apache/seatunnel/commit/b0d70ce224|2.3.5| |[Improve][Connector-V2] Support TableSourceFactory on StarRocks (#6498)|https://github.com/apache/seatunnel/commit/aded56299c|2.3.5| |[Improve] StarRocksSourceReader use the existing client (#6480)|https://github.com/apache/seatunnel/commit/1a02c571a9|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Feature][Connector] add starrocks save_mode (#6029)|https://github.com/apache/seatunnel/commit/66b0f1e1d2|2.3.4| |[Feature] Add unsupported datatype check for all catalog 
(#5890)|https://github.com/apache/seatunnel/commit/b9791285a0|2.3.4| |[Improve] StarRocks support create table template with unique key (#5905)|https://github.com/apache/seatunnel/commit/25b01125e4|2.3.4| |[Improve][StarRocksSink] add http socket timeout. (#5918)|https://github.com/apache/seatunnel/commit/febdb262b6|2.3.4| |[Improve] Support create varchar field type in StarRocks (#5911)|https://github.com/apache/seatunnel/commit/6025895167|2.3.4| |[Improve]Change System.out.println to log output. (#5912)|https://github.com/apache/seatunnel/commit/bbedb07a9c|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][Connector] Add field name to `DataTypeConvertor` to improve error message (#5782)|https://github.com/apache/seatunnel/commit/ab60790f0d|2.3.4| |[feature][connector-jdbc]Add Save Mode function and Connector-JDBC (MySQL) connector has been realized (#5663)|https://github.com/apache/seatunnel/commit/eff17ccbe5|2.3.4| |[Improve] Add default implement for `SeaTunnelSink::setTypeInfo` (#5682)|https://github.com/apache/seatunnel/commit/86cba87450|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve] Refactor CatalogTable and add `SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[Hotfix][Connector-V2][StarRocks] fix starrocks template sql parser #5071 (#5332)|https://github.com/apache/seatunnel/commit/23d79b0d17|2.3.4| |[Improve] [Connector-V2] Remove scheduler in StarRocks sink (#5269)|https://github.com/apache/seatunnel/commit/cb7b794914|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |Fix StarRocksJsonSerializer will transform array/map/row to string (#5281)|https://github.com/apache/seatunnel/commit/f941953774|2.3.3| |[Improve] Improve savemode api (#4767)|https://github.com/apache/seatunnel/commit/4acd370d48|2.3.3| |[Improve] [Connector-V2] Improve StarRocks Auto Create Table To Support Use Primary Key Template In Field (#4487)|https://github.com/apache/seatunnel/commit/e601cd4c37|2.3.2| |Revert "[Improve][Catalog] refactor catalog (#4540)" (#4628)|https://github.com/apache/seatunnel/commit/2d1933195d|2.3.2| |[hotfix][starrocks] fix error on get starrocks source typeInfo (#4619)|https://github.com/apache/seatunnel/commit/f7b094f9eb|2.3.2| |[Improve][Catalog] refactor catalog (#4540)|https://github.com/apache/seatunnel/commit/b0a701cb83|2.3.2| |[Improve] [Connector-V2] Throw StarRocks Serialize Error To Client (#4484)|https://github.com/apache/seatunnel/commit/e2c107323b|2.3.2| |[Improve] [Connector-V2] Improve StarRocks Serialize Error Message (#4458)|https://github.com/apache/seatunnel/commit/465e75cbf5|2.3.2| |[Hotfix][Zeta] Adapt StarRocks With Multi-Table And Single-Table Mode (#4324)|https://github.com/apache/seatunnel/commit/c11c171d36|2.3.1| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[Improve] [Zeta] Improve Client Job Info Message|https://github.com/apache/seatunnel/commit/56febf0118|2.3.1| |[Fix] [Connector-V2] Fix StarRocksSink Without Format Field In Header|https://github.com/apache/seatunnel/commit/463ae6437e|2.3.1| |[Improve] Support StarRocksCatalog Use JDBC URL With Custom Suffix|https://github.com/apache/seatunnel/commit/d00ced6ecd|2.3.1| |[Improve] Support MySqlCatalog Use JDBC URL With Custom Suffix|https://github.com/apache/seatunnel/commit/210d0ff1f8|2.3.1| 
|[Improve] Change StarRocks Sink Default Format To Json|https://github.com/apache/seatunnel/commit/8703357830|2.3.1| |[Fix] Fix StarRocks Default Url Can't Use|https://github.com/apache/seatunnel/commit/67c45d353a|2.3.1| |[hotfix] fixed schema options import error|https://github.com/apache/seatunnel/commit/656805f2df|2.3.1| |[chore] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/291214ad6f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Fix] Fix StarRocks Default Url Can't Use (#4229)|https://github.com/apache/seatunnel/commit/ed74d11090|2.3.1| |[Bug] Remove StarRocks Auto Creat Table Default Value (#4220)|https://github.com/apache/seatunnel/commit/80b5cd40ae|2.3.1| |[Feature] Add SaveMode For StarRocks (#4217)|https://github.com/apache/seatunnel/commit/0674f10a53|2.3.1| |[Improve] Improve StarRocks Catalog Base Url (#4215)|https://github.com/apache/seatunnel/commit/6632a40473|2.3.1| |[Improve] Improve StarRocks Sink Config (#4212)|https://github.com/apache/seatunnel/commit/8d5712c1db|2.3.1| |[Hotfix][Zeta] keep deleteCheckpoint method synchronized (#4209)|https://github.com/apache/seatunnel/commit/061f9b5872|2.3.1| |[Improve] Improve StarRocks Auto Create Table (#4208)|https://github.com/apache/seatunnel/commit/bc9cd6bf69|2.3.1| |[hotfix][zeta] fix zeta multi-table parser error (#4193)|https://github.com/apache/seatunnel/commit/98f2ad0c19|2.3.1| |[feature][starrocks] add StarRocks factories (#4191)|https://github.com/apache/seatunnel/commit/c485d887ec|2.3.1| |[Feature] Change StarRocks CreatTable Template (#4184)|https://github.com/apache/seatunnel/commit/4cf07f3beb|2.3.1| |[Feature][Connector-V2] StarRocks source connector (#3679)|https://github.com/apache/seatunnel/commit/9681173b10|2.3.1| |[Improve] [Connector-V2] [StarRocks] Starrocks Support Auto Create Table 
(#4177)|https://github.com/apache/seatunnel/commit/7e0008e6fb|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-v2][StarRocks] Support write cdc changelog event(INSERT/UPDATE/DELETE) (#3865)|https://github.com/apache/seatunnel/commit/8e3d158c03|2.3.1| |[Improve] [Connector-V2] Change Connector Custom Config Prefix To Map (#3719)|https://github.com/apache/seatunnel/commit/ef1b8b1bb5|2.3.1| |[Improve][Connector-V2][StarRocks] Unified exception for StarRocks source and sink (#3593)|https://github.com/apache/seatunnel/commit/612d0297a0|2.3.0| |[Improve][Connector-V2][StarRocks] Delete the Mapper may not be used (#3579)|https://github.com/apache/seatunnel/commit/1e868ecf28|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][StarRocks]Add StarRocks connector option rules (#3402)|https://github.com/apache/seatunnel/commit/5d187f69b7|2.3.0| |[Bugfix][Connector-V2][StarRocks]Fix StarRocks StreamLoad retry bug and fix doc (#3406)|https://github.com/apache/seatunnel/commit/071f9aa055|2.3.0| |[Feature][Connector-V2] Starrocks sink connector (#3164)|https://github.com/apache/seatunnel/commit/3e6caf7053|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-tablestore.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve] table_store options (#9515)|https://github.com/apache/seatunnel/commit/145b68793f|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| | [Feature][Connector-V2][Tablestore] Support Source connector for Tablestore #7448 (#7467)|https://github.com/apache/seatunnel/commit/a7ca51b585|2.3.8| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve] [Connector-V2] Remove scheduler in Tablestore sink (#5272)|https://github.com/apache/seatunnel/commit/8d6b07e466|2.3.3| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][TableStore] Unified exception for TableStore sink connector (#3527)|https://github.com/apache/seatunnel/commit/7b264d7004|2.3.0| |[Feature][connector-v2] add tablestore source and sink (#3309)|https://github.com/apache/seatunnel/commit/ebebf0b633|2.3.0|
================================================ FILE: docs/en/connectors/changelog/connector-tdengine.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][connector-tdengine] Support subtable and fieldNames in tdengine source (#9593)|https://github.com/apache/seatunnel/commit/b136a0dc43|2.3.12| |[improve] tdengine options (#9399)|https://github.com/apache/seatunnel/commit/ff122fe405|2.3.12| |[Feature][Connector-V2] Support multi-table sink feature for TDengine (#9215)|https://github.com/apache/seatunnel/commit/98b593f095|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][Connector-V2] Fix NullPointerException when column or tag contains null value in TDengine sink (#9158)|https://github.com/apache/seatunnel/commit/a047cab546|2.3.11| |[Fix][Connector][TDEngine] TDEngine support NCHAR type (#8411)|https://github.com/apache/seatunnel/commit/88c92ae1b1|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector-V2] Close all ResultSet after used (#7389)|https://github.com/apache/seatunnel/commit/853e973212|2.3.8| |[Fix][Connector-tdengine] Fix sql exception and concurrentmodifyexception when connect to taos and read data|https://github.com/apache/seatunnel/commit/a18fca8006|2.3.7| |[Bugfix][TDengine] Fix the issue of losing the driver due to multiple calls to the submit job REST API #6581 (#6596)|https://github.com/apache/seatunnel/commit/470bb97434|2.3.5| |[improve][connector-tdengine] support read bool column from tdengine (#6025)|https://github.com/apache/seatunnel/commit/af39235ee3|2.3.4| |[Bugfix][TDengine] Fix the degree of multiple parallelism affects driver loading (#6020)|https://github.com/apache/seatunnel/commit/b6ebbd47b2|2.3.4| |[Improve][Common] Introduce new error define rule 
(#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix][Connector] Fixed TDengine connector using jdbc driver to cause loading error (#4598)|https://github.com/apache/seatunnel/commit/78f7989b81|2.3.2| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2] add tdengine source (#2832)|https://github.com/apache/seatunnel/commit/acf4d5b1b4|2.3.1|
================================================ FILE: docs/en/connectors/changelog/connector-typesense.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[improve] typesense options (#9398)|https://github.com/apache/seatunnel/commit/bf20a3e6a8|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix] Fix error log name for SourceSplitEnumerator implements class (#8817)|https://github.com/apache/seatunnel/commit/55ed90ecaf|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature]Check Chinese comments in the code (#8319)|https://github.com/apache/seatunnel/commit/d58fce1caf|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connector-V2] Fix known directory create and delete ignore issues (#7700)|https://github.com/apache/seatunnel/commit/e2fb679577|2.3.8| |[Feature][Connector-V2] Support typesense connector (#7450)|https://github.com/apache/seatunnel/commit/138d2a4eb2|2.3.8|
================================================ FILE: docs/en/connectors/changelog/connector-web3j.md ================================================
Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] update Web3j connector config option (#9005)|https://github.com/apache/seatunnel/commit/9204f289d8|2.3.10| |[Feature][Connector-V2] Add web3j source connector (#6598)|https://github.com/apache/seatunnel/commit/b7002bfaf4|2.3.6|
================================================ FILE: docs/en/connectors/common-options/sink-common-options.md ================================================ --- sidebar_position: 4 --- # Sink Common Options > Common parameters of sink connectors :::caution warn The old configuration name `source_table_name` is deprecated, please migrate to the new name `plugin_input` as soon as possible. ::: | Name | Type | Required | Default | Description | |--------------|--------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | plugin_input | String | No | - | When `plugin_input` is not specified, the current plug-in processes the data set `dataset` output by the previous plugin in the configuration file
When `plugin_input` is specified, the current plug-in processes the data set corresponding to this parameter. | # Important note When the job configuration specifies `plugin_input`, you must also set the `plugin_output` parameter ## Task Example ### Simple > This is the process of passing a data source through two transforms and returning two different pipelines to different sinks ```bash source { FakeSourceStream { parallelism = 2 plugin_output = "fake" field_name = "name,age" } } transform { Filter { plugin_input = "fake" fields = [name] plugin_output = "fake_name" } Filter { plugin_input = "fake" fields = [age] plugin_output = "fake_age" } } sink { Console { plugin_input = "fake_name" } Console { plugin_input = "fake_age" } } ``` > If the job only has one source, one (or zero) transform and one sink, you do not need to specify `plugin_input` and `plugin_output` for the connectors. > If the number of any operator in source, transform and sink is greater than 1, you must specify the `plugin_input` and `plugin_output` for each connector in the job. ================================================ FILE: docs/en/connectors/common-options/source-common-options.md ================================================ --- sidebar_position: 3 --- # Source Common Options > Common parameters of source connectors :::caution warn The old configuration name `result_table_name` is deprecated, please migrate to the new name `plugin_output` as soon as possible. 
::: | Name | Type | Required | Default | Description | |---------------|--------|----------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | plugin_output | String | No | - | When `plugin_output` is not specified, the data processed by this plugin will not be registered as a data set `(dataStream/dataset)` that can be directly accessed by other plugins, or called a temporary table `(table)`
When `plugin_output` is specified, the data processed by this plugin will be registered as a data set `(dataStream/dataset)` that can be directly accessed by other plugins, or called a temporary table `(table)` . The data set `(dataStream/dataset)` registered here can be directly accessed by other plugins by specifying `plugin_input` . | | parallelism | Int | No | - | When `parallelism` is not specified, the `parallelism` in env is used by default.
When parallelism is specified, it will override the parallelism in env. | # Important note When the job configuration `plugin_output` you must set the `plugin_input` parameter ## Task Example ### Simple > This registers a stream or batch data source and returns the table name `fake_table` at registration ```bash source { FakeSourceStream { plugin_output = "fake_table" } } ``` ### Multiple Pipeline Simple > This is to convert the data source fake and write it to two different sinks ```bash env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { id = "int" name = "string" age = "int" c_timestamp = "timestamp" c_date = "date" c_map = "map" c_array = "array" c_decimal = "decimal(30, 8)" c_row = { c_row = { c_int = int } } } } } } transform { Sql { plugin_input = "fake" plugin_output = "fake1" # the query table name must same as field 'plugin_input' query = "select id, regexp_replace(name, '.+', 'b') as name, age+1 as age, pi() as pi, c_timestamp, c_date, c_map, c_array, c_decimal, c_row from dual" } # The SQL transform support base function and criteria operation # But the complex SQL unsupported yet, include: multi source table/rows JOIN and AGGREGATE operation and the like } sink { Console { plugin_input = "fake1" } Console { plugin_input = "fake" } } ``` ================================================ FILE: docs/en/connectors/connector-isolated-dependency.md ================================================ # Connector Isolated Dependency Loading Mechanism SeaTunnel provides an isolated dependency loading mechanism for each connector, making it easier for users to manage individual dependencies for different connectors, while avoiding dependency conflicts and improving system extensibility. When loading a connector, SeaTunnel will search for and load the connector's own dependency jars from the `${SEATUNNEL_HOME}/plugins/connector-xxx` directory. 
This ensures that the dependencies required by different connectors do not interfere with each other, which is helpful for managing a large number of connectors in complex environments. ## Principle Each connector needs to place its own dependency jars in a dedicated subdirectory under `${SEATUNNEL_HOME}/plugins/connector-xxx` (manual creation required). The subdirectory name is specified by the value in the `plugin-mapping` file. When SeaTunnel starts and loads connectors, it will only load jars from the corresponding directory, thus achieving dependency isolation. Currently, the Zeta engine ensures that jars for different connectors in the same job are loaded separately. The other two engines still load all connector dependency jars together, so placing different versions of jars for the same job in Spark/Flink environments may cause dependency conflicts. ## Directory Structure Example - Use `${SEATUNNEL_HOME}/connectors/plugin-mapping.properties` to get the folder name for each connector. For example, for AmazonDynamodb, suppose the following configuration exists in the `plugin-mapping` file: ``` seatunnel.source.AmazonDynamodb = connector-amazondynamodb ``` The corresponding connector dependency directory is the value `connector-amazondynamodb`. The final directory structure is as follows: ``` SEATUNNEL_HOME/ plugins/ connector-amazondynamodb/ dependency1.jar dependency2.jar connector-xxx/ dependencyA.jar dependencyB.jar ``` ## Limitations - For the Zeta engine, please ensure that the `${SEATUNNEL_HOME}/plugins/connector-xxx` directory structure is consistent across all nodes. Each node must contain the same subdirectories and dependency jars. - Any directory or jar that does not start with `connector-` will be treated as a common dependency directory, and all engines and connectors will load such jars. - In the Zeta engine, you can achieve shared dependencies for all connectors by placing common jars in the `${SEATUNNEL_HOME}/lib/` directory. 
## Verification - By checking the job logs, you can confirm that each connector only loads its own dependency jars. ```log 2025-08-13T17:55:48.7732601Z [] 2025-08-13 17:55:47,270 INFO org.apache.seatunnel.plugin.discovery.AbstractPluginDiscovery - find connector jar and dependency for PluginIdentifier{engineType='seatunnel', pluginType='source', pluginName='Jdbc'}: [file:/tmp/seatunnel/plugins/Jdbc/lib/vertica-jdbc-12.0.3-0.jar, file:/tmp/seatunnel/connectors/connector-jdbc-3.0.0-SNAPSHOT-2.12.15.jar] ``` ================================================ FILE: docs/en/connectors/formats/avro.md ================================================ # Avro format Avro is very popular in streaming data pipeline. Now seatunnel supports Avro format in kafka connector. # How To Use ## Kafka uses example - This is an example to generate data from fake source and sink to kafka with avro format. ```bash env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { row.num = 90 schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp } } } plugin_output = "fake" } } sink { Kafka { bootstrap.servers = "kafkaCluster:9092" topic = "test_avro_topic_fake_source" format = avro } } ``` - This is an example read data from kafka with avro format and print to console. 
```bash env { parallelism = 1 job.mode = "BATCH" } source { Kafka { bootstrap.servers = "kafkaCluster:9092" topic = "test_avro_topic" plugin_output = "kafka_table" start_mode = "earliest" format = avro format_error_handle_way = skip schema = { fields { id = bigint c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(2, 1)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } sink { Console { plugin_input = "kafka_table" } } ``` ================================================ FILE: docs/en/connectors/formats/canal-json.md ================================================ # Canal Format Changelog-Data-Capture Format Format: Serialization Schema Format: Deserialization Schema Canal is a CDC (Changelog Data Capture) tool that can stream changes in real-time from MySQL into other systems. Canal provides a unified format schema for changelog and supports to serialize messages using JSON and protobuf (protobuf is the default format for Canal). SeaTunnel supports to interpret Canal JSON messages as INSERT/UPDATE/DELETE messages into seatunnel system. This is useful in many cases to leverage this feature, such as synchronizing incremental data from databases to other systems auditing logs real-time materialized views on databases temporal join changing history of a database table and so on. SeaTunnel also supports to encode the INSERT/UPDATE/DELETE messages in SeaTunnel as Canal JSON messages, and emit to storage like Kafka. However, currently SeaTunnel can’t combine UPDATE_BEFORE and UPDATE_AFTER into a single UPDATE message. Therefore, SeaTunnel encodes UPDATE_BEFORE and UPDATE_AFTER as DELETE and INSERT Canal messages. 
# Format Options | Option | Default | Required | Description | |--------------------------------|---------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | format | (none) | yes | Specify what format to use, here should be 'canal_json'. | | canal_json.ignore-parse-errors | false | no | Skip fields and rows with parse errors instead of failing. Fields are set to null in case of errors. | | canal_json.database.include | (none) | no | An optional regular expression to only read the specific databases changelog rows by regular matching the "database" meta field in the Canal record. The pattern string is compatible with Java's Pattern. | | canal_json.table.include | (none) | no | An optional regular expression to only read the specific tables changelog rows by regular matching the "table" meta field in the Canal record. The pattern string is compatible with Java's Pattern. | # How to use ## Kafka Uses Example Canal provides a unified format for changelog, here is a simple example for an update operation captured from a MySQL products table: ```bash { "data": [ { "id": "111", "name": "scooter", "description": "Big 2-wheel scooter", "weight": "5.18" } ], "database": "inventory", "es": 1589373560000, "id": 9, "isDdl": false, "mysqlType": { "id": "INTEGER", "name": "VARCHAR(255)", "description": "VARCHAR(512)", "weight": "FLOAT" }, "old": [ { "weight": "5.15" } ], "pkNames": [ "id" ], "sql": "", "sqlType": { "id": 4, "name": 12, "description": 12, "weight": 7 }, "table": "products", "ts": 1589373560798, "type": "UPDATE" } ``` Note: please refer to [Canal documentation](https://github.com/alibaba/canal/wiki) about the meaning of each fields. The MySQL products table has 4 columns (id, name, description and weight). 
The above JSON message is an update change event on the products table where the weight value of the row with id = 111 is changed from 5.15 to 5.18. Assuming the messages have been synchronized to Kafka topic products_binlog, then we can use the following SeaTunnel to consume this topic and interpret the change events. ```bash env { parallelism = 1 job.mode = "BATCH" } source { Kafka { bootstrap.servers = "kafkaCluster:9092" topic = "products_binlog" plugin_output = "kafka_name" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } }, format = canal_json } } transform { } sink { Kafka { bootstrap.servers = "localhost:9092" topic = "consume-binlog" format = canal_json } } ``` ================================================ FILE: docs/en/connectors/formats/cdc-compatible-debezium-json.md ================================================ # CDC Compatible Debezium-json SeaTunnel supports to interpret cdc record as Debezium-JSON messages publish to mq(kafka) system. This is useful in many cases to leverage this feature, such as compatible with the debezium ecosystem. 
# How To Use ## MySQL-CDC Sink Kafka ```bash env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 15000 } source { MySQL-CDC { plugin_output = "table1" url="jdbc:mysql://localhost:3306/test" "startup.mode"=INITIAL table-names=[ "database1.t1", "database1.t2", "database2.t1" ] # compatible_debezium_json options format = compatible_debezium_json debezium = { # include schema into kafka message key.converter.schemas.enable = false value.converter.schemas.enable = false # topic prefix database.server.name = "mysql_cdc_1" } } } sink { Kafka { plugin_input = "table1" bootstrap.servers = "localhost:9092" topic = "${topic}" # compatible_debezium_json options format = compatible_debezium_json } } ``` ================================================ FILE: docs/en/connectors/formats/debezium-json.md ================================================ # Debezium Format Changelog-Data-Capture Format: Serialization Schema Format: Deserialization Schema Debezium is a set of distributed services to capture changes in your databases so that your applications can see those changes and respond to them. Debezium records all row-level changes within each database table in a *change event stream*, and applications simply read these streams to see the change events in the same order in which they occurred. Seatunnel supports interpreting Debezium JSON messages as INSERT/UPDATE/DELETE messages into seatunnel system. This is useful in many cases to leverage this feature, such as synchronizing incremental data from databases to other systems auditing logs real-time materialized views on databases temporal join changing history of a database table and so on. Seatunnel also supports encoding the INSERT/UPDATE/DELETE messages in Seatunnel as Debezium JSON messages, and emitting them to storage like Kafka. 
# Format Options | Option | Default | Required | Description | |-----------------------------------|---------|----------|------------------------------------------------------------------------------------------------------| | format | (none) | yes | Specify what format to use, here should be 'debezium_json'. | | debezium-json.ignore-parse-errors | false | no | Skip fields and rows with parse errors instead of failing. Fields are set to null in case of errors. | # How To Use ## Kafka Uses example Debezium provides a unified format for changelog, here is a simple example for an update operation captured from a MySQL products table: ```bash { "before": { "id": 111, "name": "scooter", "description": "Big 2-wheel scooter ", "weight": 5.18 }, "after": { "id": 111, "name": "scooter", "description": "Big 2-wheel scooter ", "weight": 5.17 }, "source": { "version": "1.1.1.Final", "connector": "mysql", "name": "dbserver1", "ts_ms": 1589362330000, "snapshot": "false", "db": "inventory", "table": "products", "server_id": 223344, "gtid": null, "file": "mysql-bin.000003", "pos": 2090, "row": 0, "thread": 2, "query": null }, "op": "u", "ts_ms": 1589362330904, "transaction": null } ``` Note: please refer to [Debezium documentation](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/mysql.adoc#data-change-events) about the meaning of each fields. The MySQL products table has 4 columns (id, name, description and weight). The above JSON message is an update change event on the products table where the weight value of the row with id = 111 is changed from 5.18 to 5.17. Assuming the messages have been synchronized to Kafka topic products_binlog, then we can use the following Seatunnel conf to consume this topic and interpret the change events by Debezium format. 
**In this config, you must specify the `schema` and `debezium_record_include_schema` options** - `schema` should be the same as your table format - if your json data contains a `schema` field, `debezium_record_include_schema` should be true, and if your json data doesn't contain a `schema` field, `debezium_record_include_schema` should be false - `{"schema" : {}, "payload": { "before" : {}, "after": {} ... } }` --> `true` - `{"before" : {}, "after": {} ... }` --> `false` ```bash env { parallelism = 1 job.mode = "BATCH" } source { Kafka { bootstrap.servers = "kafkaCluster:9092" topic = "products_binlog" plugin_output = "kafka_name" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } } debezium_record_include_schema = false format = debezium_json } } transform { } sink { Kafka { bootstrap.servers = "kafkaCluster:9092" topic = "consume-binlog" format = debezium_json } } ``` ================================================ FILE: docs/en/connectors/formats/kafka-compatible-kafkaconnect-json.md ================================================ # Kafka source compatible kafka-connect-json Seatunnel connector kafka supports parsing data extracted through kafka connect source, especially data extracted from kafka connect jdbc and kafka connect debezium # How To Use ## Kafka Sink Mysql ```bash env { parallelism = 1 job.mode = "BATCH" } source { Kafka { bootstrap.servers = "localhost:9092" topic = "jdbc_source_record" plugin_output = "kafka_table" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } }, format = COMPATIBLE_KAFKA_CONNECT_JSON } } sink { Jdbc { driver = com.mysql.cj.jdbc.Driver url = "jdbc:mysql://localhost:3306/seatunnel" user = st_user password = seatunnel generate_sink_sql = true database = seatunnel table = jdbc_sink primary_keys = ["id"] } } ``` ================================================ FILE: docs/en/connectors/formats/maxwell-json.md 
================================================ # MaxWell Format [Maxwell](https://maxwells-daemon.io/) is a CDC (Changelog Data Capture) tool that can stream changes in real-time from MySQL into Kafka, Kinesis and other streaming connectors. Maxwell provides a unified format schema for changelog and supports to serialize messages using JSON. Seatunnel supports to interpret MaxWell JSON messages as INSERT/UPDATE/DELETE messages into seatunnel system. This is useful in many cases to leverage this feature, such as synchronizing incremental data from databases to other systems auditing logs real-time materialized views on databases temporal join changing history of a database table and so on. Seatunnel also supports to encode the INSERT/UPDATE/DELETE messages in Seatunnel as MaxWell JSON messages, and emit to storage like Kafka. However, currently Seatunnel can’t combine UPDATE_BEFORE and UPDATE_AFTER into a single UPDATE message. Therefore, Seatunnel encodes UPDATE_BEFORE and UPDATE_AFTER as DELETE and INSERT MaxWell messages. # Format Options | Option | Default | Required | Description | |----------------------------------|---------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | format | (none) | yes | Specify what format to use, here should be 'maxwell_json'. | | maxwell_json.ignore-parse-errors | false | no | Skip fields and rows with parse errors instead of failing. Fields are set to null in case of errors. | | maxwell_json.database.include | (none) | no | An optional regular expression to only read the specific databases changelog rows by regular matching the "database" meta field in the MaxWell record. The pattern string is compatible with Java's Pattern. 
| | maxwell_json.table.include | (none) | no | An optional regular expression to only read the specific tables changelog rows by regular matching the "table" meta field in the MaxWell record. The pattern string is compatible with Java's Pattern. | # How To Use MaxWell format ## Kafka Uses Example MaxWell provides a unified format for changelog, here is a simple example for an update operation captured from a MySQL products table: ```bash { "database":"test", "table":"product", "type":"insert", "ts":1596684904, "xid":7201, "commit":true, "data":{ "id":111, "name":"scooter", "description":"Big 2-wheel scooter ", "weight":5.18 }, "primary_key_columns":[ "id" ] } ``` Note: please refer to MaxWell documentation about the meaning of each fields. The MySQL products table has 4 columns (id, name, description and weight). The above JSON message is an update change event on the products table where the weight value of the row with id = 111 is changed from 5.18 to 5.15. Assuming the messages have been synchronized to Kafka topic products_binlog, then we can use the following Seatunnel to consume this topic and interpret the change events. ```bash env { execution.parallelism = 1 job.mode = "BATCH" } source { Kafka { bootstrap.servers = "kafkaCluster:9092" topic = "products_binlog" plugin_output = "kafka_name" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } }, format = maxwell_json } } transform { } sink { Kafka { bootstrap.servers = "localhost:9092" topic = "consume-binlog" format = maxwell_json } } ``` ================================================ FILE: docs/en/connectors/formats/ogg-json.md ================================================ # Ogg Format [Oracle GoldenGate](https://www.oracle.com/integration/goldengate/) (a.k.a ogg) is a managed service providing a real-time data mesh platform, which uses replication to keep data highly available, and enabling real-time analysis. 
Customers can design, execute, and monitor their data replication and stream data processing solutions without the need to allocate or manage compute environments. Ogg provides a format schema for changelog and supports to serialize messages using JSON. Seatunnel supports to interpret Ogg JSON messages as INSERT/UPDATE/DELETE messages into seatunnel system. This is useful in many cases to leverage this feature, such as synchronizing incremental data from databases to other systems auditing logs real-time materialized views on databases temporal join changing history of a database table and so on. Seatunnel also supports to encode the INSERT/UPDATE/DELETE messages in Seatunnel as Ogg JSON messages, and emit to storage like Kafka. However, currently Seatunnel can’t combine UPDATE_BEFORE and UPDATE_AFTER into a single UPDATE message. Therefore, Seatunnel encodes UPDATE_BEFORE and UPDATE_AFTER as DELETE and INSERT Ogg messages. # Format Options | Option | Default | Required | Description | |------------------------------|---------|----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | format | (none) | yes | Specify what format to use, here should be 'ogg_json'. | | ogg_json.ignore-parse-errors | false | no | Skip fields and rows with parse errors instead of failing. Fields are set to null in case of errors. | | ogg_json.database.include | (none) | no | An optional regular expression to only read the specific databases changelog rows by regular matching the "database" meta field in the Ogg record. The pattern string is compatible with Java's Pattern. | | ogg_json.table.include | (none) | no | An optional regular expression to only read the specific tables changelog rows by regular matching the "table" meta field in the Ogg record. The pattern string is compatible with Java's Pattern. 
| # How to Use Ogg format ## Kafka Uses Example Ogg provides a unified format for changelog, here is a simple example for an update operation captured from a Oracle products table: ```bash { "before": { "id": 111, "name": "scooter", "description": "Big 2-wheel scooter", "weight": 5.18 }, "after": { "id": 111, "name": "scooter", "description": "Big 2-wheel scooter", "weight": 5.15 }, "op_type": "U", "op_ts": "2020-05-13 15:40:06.000000", "current_ts": "2020-05-13 15:40:07.000000", "primary_keys": [ "id" ], "pos": "00000000000000000000143", "table": "PRODUCTS" } ``` Note: please refer to [Debezium documentation](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/oracle.adoc#data-change-events) about the meaning of each fields. The Oracle products table has 4 columns (id, name, description and weight). The above JSON message is an update change event on the products table where the weight value of the row with id = 111 is changed from 5.18 to 5.15. Assuming the messages have been synchronized to Kafka topic products_binlog, then we can use the following Seatunnel to consume this topic and interpret the change events. ```bash env { parallelism = 1 job.mode = "STREAMING" } source { Kafka { bootstrap.servers = "127.0.0.1:9092" topic = "ogg" plugin_output = "kafka_name" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "double" } }, format = ogg_json } } sink { jdbc { url = "jdbc:mysql://127.0.0.1/test" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "12345678" table = "ogg" primary_keys = ["id"] } } ``` ================================================ FILE: docs/en/connectors/formats/protobuf.md ================================================ # Protobuf Format Protobuf (Protocol Buffers) is a language-neutral, platform-independent data serialization format developed by Google. 
It provides an efficient way to encode structured data and supports multiple programming languages and platforms. Currently, Protobuf format can be used with Kafka. ## Kafka Usage Example - Example of simulating a randomly generated data source and writing it to Kafka in Protobuf format ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { c_int32 = int c_int64 = long c_float = float c_double = double c_bool = boolean c_string = string c_bytes = bytes Address { city = string state = string street = string } attributes = "map" phone_numbers = "array" } } } } sink { kafka { topic = "test_protobuf_topic_fake_source" bootstrap.servers = "kafkaCluster:9092" format = protobuf kafka.request.timeout.ms = 60000 kafka.config = { acks = "all" request.timeout.ms = 60000 buffer.memory = 33554432 } protobuf_message_name = Person protobuf_schema = """ syntax = "proto3"; package org.apache.seatunnel.format.protobuf; option java_outer_classname = "ProtobufE2E"; message Person { int32 c_int32 = 1; int64 c_int64 = 2; float c_float = 3; double c_double = 4; bool c_bool = 5; string c_string = 6; bytes c_bytes = 7; message Address { string street = 1; string city = 2; string state = 3; string zip = 4; } Address address = 8; map attributes = 9; repeated string phone_numbers = 10; } """ } } ``` - Example of reading data from Kafka in Protobuf format and printing it to the console ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Kafka { topic = "test_protobuf_topic_fake_source" format = protobuf protobuf_message_name = Person protobuf_schema = """ syntax = "proto3"; package org.apache.seatunnel.format.protobuf; option java_outer_classname = "ProtobufE2E"; message Person { int32 c_int32 = 1; int64 c_int64 = 2; float c_float = 3; double c_double = 4; bool c_bool = 5; string c_string = 6; bytes c_bytes = 7; message Address { string street = 1; string city = 2; string state = 3; string zip = 
4; } Address address = 8; map attributes = 9; repeated string phone_numbers = 10; } """ schema = { fields { c_int32 = int c_int64 = long c_float = float c_double = double c_bool = boolean c_string = string c_bytes = bytes Address { city = string state = string street = string } attributes = "map" phone_numbers = "array" } } bootstrap.servers = "kafkaCluster:9092" start_mode = "earliest" plugin_output = "kafka_table" } } sink { Console { plugin_input = "kafka_table" } } ``` ================================================ FILE: docs/en/connectors/sink/Activemq.md ================================================ import ChangeLog from '../changelog/connector-activemq.md'; # Activemq > Activemq sink connector ## Description Used to write data to Activemq. ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-------------------------------------|---------|----------|---------------| | host | string | no | - | | port | int | no | - | | virtual_host | string | no | - | | username | string | no | - | | password | string | no | - | | queue_name | string | yes | - | | uri | string | yes | - | | check_for_duplicate | boolean | no | - | | client_id | boolean | no | - | | copy_message_on_send | boolean | no | - | | disable_timeStamps_by_default | boolean | no | - | | use_compression | boolean | no | - | | always_session_async | boolean | no | - | | dispatch_async | boolean | no | - | | nested_map_and_list_enabled | boolean | no | - | | warnAboutUnstartedConnectionTimeout | boolean | no | - | | closeTimeout | int | no | - | ### host [string] the default host to use for connections ### port [int] the default port to use for connections ### username [string] the AMQP user name to use when connecting to the broker ### password [string] the password to use when connecting to the broker ### uri [string] convenience method for setting the fields in an AMQP URI: host, port, username, password 
and virtual host ### queue_name [string] the queue to write the message to ### check_for_duplicate [boolean] will check for duplicate messages ### client_id [string] client id ### copy_message_on_send [boolean] if true, the message is copied to a new JMS Message object as part of the send method ### disable_timeStamps_by_default [boolean] disables timestamp for slight performance boost ### use_compression [boolean] Enables the use of compression on the message’s body. ### always_session_async [boolean] When true a separate thread is used for dispatching messages for each Session in the Connection. ### always_sync_send [boolean] When true a MessageProducer will always use Sync sends when sending a Message ### close_timeout [int] Sets the timeout, in milliseconds, before a close is considered complete. ### dispatch_async [boolean] Should the broker dispatch messages asynchronously to the consumer ### nested_map_and_list_enabled [boolean] Controls whether Structured Message Properties and MapMessages are supported ### warn_about_unstarted_connection_timeout [int] The timeout, in milliseconds, from the time of connection creation to when a warning is generated ## Example simple: ```hocon sink { ActiveMQ { uri="tcp://localhost:61616" username = "admin" password = "admin" queue_name = "test1" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Aerospike.md ================================================ import ChangeLog from '../changelog/connector-aerospike.md'; # Aerospike > Aerospike sink connector ## Support Those Engines > Spark
> Flink
> Seatunnel Zeta
## License Compatibility Notice This connector depends on Aerospike Client Library which is licensed under AGPL 3.0. When using this connector, you need to comply with AGPL 3.0 license terms. ## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) ## Description Sink connector for Aerospike database. ## Supported DataSource Info | Datasource | Supported Versions | Maven | |------------|-----------------|----------------------------------------------------------------------------------------| | Aerospike | 4.4.17+ | [Download](https://mvnrepository.com/artifact/com.aerospike/aerospike-client) | ## Data Type Mapping | SeaTunnel Data Type | Aerospike Data Type | Storage Format | |---------------------|---------------------|--------------------------------------------------------------------------------| | STRING | STRING | Direct string storage | | INT | INTEGER | 32-bit integer | | BIGINT | LONG | 64-bit integer | | DOUBLE | DOUBLE | 64-bit floating point | | BOOLEAN | BOOLEAN | Stored as true/false values | | ARRAY | BYTEARRAY | Only support byte array type | | LIST | LIST | Support generic list types | | DATE | LONG | Converted to epoch milliseconds | | TIMESTAMP | LONG | Converted to epoch milliseconds | Note: - When using ARRAY type, SeaTunnel's array elements must be byte type - LIST type supports any element types that can be serialized - DATE/TIMESTAMP conversion uses system default time zone ## Options | Name | Type | Required | Default | Description | |----------------|--------|----------|---------|-----------------------------------------------------------------------------| | host | string | Yes | - | Aerospike server hostname or IP address | | port | int | No | 3000 | Aerospike server port | | namespace | string | Yes | - | Namespace in Aerospike | | set | string | Yes | - | Set name in Aerospike | | username | string | No | - | Username for authentication 
| | password | string | No | - | Password for authentication | | key | string | Yes | - | Field name to use as Aerospike primary key | | bin_name | string | No | - | Bin name for storing data | | data_format | string | No | string | Data storage format: map/string/kv | | write_timeout | int | No | 200 | Write operation timeout in milliseconds | | schema.field | map | No | {} | Field type mappings (e.g. {"name":"STRING","age":"INTEGER"}) | ### data_format Options - **map**: Store data as JSON map - **string**: Store data as JSON string - **kv**: Store each field as separate bin ## Task Example ### Simple Example ```hocon env { parallelism = 2 job.mode = "BATCH" } source { FakeSource { row.num = 10 schema = { fields { id = "int" name = "string" age = "int" address = "string" } } } } sink { Aerospike { host = "localhost" port = 3000 namespace = "test_namespace" set = "user_data" key = "id" data_format = "map" write_timeout = 300 schema.field = { id = "INTEGER" name = "STRING" age = "INTEGER" address = "STRING" } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Airtable.md ================================================ import ChangeLog from '../changelog/connector-http-airtable.md'; # Airtable > Airtable sink connector ## Description Used to write data to Airtable. 
## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) - [ ] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------------|---------|----------|---------------| | token | String | Yes | - | | base_id | String | Yes | - | | table | String | Yes | - | | api_base_url | String | No | https://api.airtable.com | | typecast | boolean | No | false | | batch_size | int | No | 10 | | request_interval_ms | int | No | 220 | | rate_limit_backoff_ms | int | No | 30000 | | rate_limit_max_retries | int | No | 3 | | common-options | | No | - | ### token [String] Airtable personal access token. You can create one at https://airtable.com/create/tokens. ### base_id [String] The ID of the Airtable base (starts with `app`). ### table [String] The table name or table ID to write to. ### api_base_url [String] Airtable API base URL. Default is `https://api.airtable.com`. ### typecast [boolean] If true, Airtable will automatically convert values to match the field type. Default false. ### batch_size [int] Number of records per API request. Maximum 10 per Airtable API limit. Default 10. ### request_interval_ms [int] Minimum interval in milliseconds between API requests. Default 220ms. ### rate_limit_backoff_ms [int] Base backoff time in milliseconds when receiving a 429 (rate limit) response. Default 30000ms. ### rate_limit_max_retries [int] Maximum number of retries after receiving a 429 response. Default 3. ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. 
## Example ```hocon sink { Airtable { token = "patXXXXXXXX.XXXXXXXX" base_id = "appXXXXXXXX" table = "Shipments" typecast = true batch_size = 10 } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/AmazonDynamoDB.md ================================================ import ChangeLog from '../changelog/connector-amazondynamodb.md'; # AmazonDynamoDB > Amazon DynamoDB sink connector ## Description Write data to Amazon DynamoDB ## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | Name | Type | Required | Default value | |-------------------|--------|----------|---------------| | url | string | yes | - | | region | string | yes | - | | access_key_id | string | yes | - | | secret_access_key | string | yes | - | | table | string | yes | - | | batch_size | string | no | 25 | | common-options | | no | - | ### url [string] The URL to write to Amazon DynamoDB. ### region [string] The region of Amazon DynamoDB. ### access_key_id [string] The access id of Amazon DynamoDB. ### secret_access_key [string] The access secret of Amazon DynamoDB. ### table [string] The table of Amazon DynamoDB. ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ## Example ```bash Amazondynamodb { url = "http://127.0.0.1:8000" region = "us-east-1" access_key_id = "dummy-key" secret_access_key = "dummy-secret" table = "TableName" } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/AmazonSqs.md ================================================ import ChangeLog from '../changelog/connector-amazonsqs.md'; # AmazonSqs > Amazon SQS sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Write data to Amazon SQS ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Sink Options | Name | Type | Required | Default | Description | |-------------------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The Queue URL to read from Amazon SQS. | | region | String | No | - | The AWS region for the SQS service | | format | String | No | json | Data format. The default format is json. Optional text format, canal-json and debezium-json.If you use json or text format. The default field separator is ", ". If you customize the delimiter, add the "field_delimiter" option.If you use canal format, please refer to [canal-json](../formats/canal-json.md) for details.If you use debezium format, please refer to [debezium-json](../formats/debezium-json.md) for details. | | format_error_handle_way | String | No | fail | The processing method of data format error. The default value is fail, and the optional value is (fail, skip). When fail is selected, data format error will block and an exception will be thrown. When skip is selected, data format error will skip this line data. 
| | field_delimiter | String | No | , | Customize the field delimiter for data format. | ## Task Example ```bash source { FakeSource { schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp } } } plugin_output = "fake" } } sink { AmazonSqs { url = "http://127.0.0.1:8000" region = "us-east-1" queue = "queueName" format = text field_delimiter = "|" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Assert.md ================================================ import ChangeLog from '../changelog/connector-assert.md'; # Assert > Assert sink connector ## Description A sink plugin which can assert illegal data by user defined rules ## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | Name | Type | Required | Default | |------------------------------------------------------------------------------------------------|-------------------------------------------------|----------|---------| | rules | ConfigMap | yes | - | | rules.field_rules | string | yes | - | | rules.field_rules.field_name | string\|ConfigMap | yes | - | | rules.field_rules.field_type | string | no | - | | rules.field_rules.field_value | ConfigList | no | - | | rules.field_rules.field_value.rule_type | string | no | - | | rules.field_rules.field_value.rule_value | numeric | no | - | | rules.field_rules.field_value.equals_to | boolean\|numeric\|string\|ConfigList\|ConfigMap | no | - | | rules.row_rules | string | yes | - | | 
rules.row_rules.rule_type | string | no | - | | rules.row_rules.rule_value | string | no | - | | rules.catalog_table_rule | ConfigMap | no | - | | rules.catalog_table_rule.primary_key_rule | ConfigMap | no | - | | rules.catalog_table_rule.primary_key_rule.primary_key_name | string | no | - | | rules.catalog_table_rule.primary_key_rule.primary_key_columns | ConfigList | no | - | | rules.catalog_table_rule.constraint_key_rule | ConfigList | no | - | | rules.catalog_table_rule.constraint_key_rule.constraint_key_name | string | no | - | | rules.catalog_table_rule.constraint_key_rule.constraint_key_type | string | no | - | | rules.catalog_table_rule.constraint_key_rule.constraint_key_columns | ConfigList | no | - | | rules.catalog_table_rule.constraint_key_rule.constraint_key_columns.constraint_key_column_name | string | no | - | | rules.catalog_table_rule.constraint_key_rule.constraint_key_columns.constraint_key_sort_type | string | no | - | | rules.catalog_table_rule.column_rule | ConfigList | no | - | | rules.catalog_table_rule.column_rule.name | string | no | - | | rules.catalog_table_rule.column_rule.type | string | no | - | | rules.catalog_table_rule.column_rule.column_length | int | no | - | | rules.catalog_table_rule.column_rule.nullable | boolean | no | - | | rules.catalog_table_rule.column_rule.default_value | string | no | - | | rules.catalog_table_rule.column_rule.comment | comment | no | - | | rules.table-names | ConfigList | no | - | | rules.tables_configs | ConfigList | no | - | | rules.tables_configs.table_path | String | no | - | | common-options | | no | - | ### rules [ConfigMap] Rule definition of user's available data. Each rule represents one field validation or row num validation. ### field_rules [ConfigList] field rules for field validation ### field_name [string] field name(string) ### field_type [string | ConfigMap] Field type declarations should adhere to this [guide](../../introduction/concepts/schema-feature.md#how-to-declare-type-supported). 
### field_value [ConfigList] A list of value rules that define the data value validation ### rule_type [string] The following rules are supported for now - NOT_NULL `value can't be null` - NULL `value can be null` - MIN `define the minimum value of data` - MAX `define the maximum value of data` - MIN_LENGTH `define the minimum string length of a string data` - MAX_LENGTH `define the maximum string length of a string data` - MIN_ROW `define the minimum number of rows` - MAX_ROW `define the maximum number of rows` ### rule_value [numeric] The value related to rule type. When the `rule_type` is `MIN`, `MAX`, `MIN_LENGTH`, `MAX_LENGTH`, `MIN_ROW` or `MAX_ROW`, users need to assign a value to the `rule_value`. ### equals_to [boolean | numeric | string | ConfigList | ConfigMap] `equals_to` is used to compare whether the field value is equal to the configured expected value. You can assign values of all types to `equals_to`. These types are detailed [here](../../introduction/concepts/schema-feature.md#what-type-supported-at-now). For instance, if one field is a row with three fields, and the declaration of row type is `{a = array, b = map, c={c_0 = int, b = string}}`, users can assign the value `[["a", "b"], { k0 = 9999.99, k1 = 111.11 }, [123, "abcd"]]` to `equals_to`. > The way of defining field values is consistent with [FakeSource](../source/FakeSource.md#customize-the-data-content-simple). > > `equals_to` cannot be applied to `null` type fields. However, users can use the rule type `NULL` for verification, such as `{rule_type = NULL}`. ### catalog_table_rule [ConfigMap] Used to assert that the catalog table is the same as the user-defined table. ### table-names [ConfigList] Used to assert that the listed tables are present in the data. ### tables_configs [ConfigList] Used to assert that multiple tables are present in the data, with rules configured per table. ### table_path [String] The path of the table. 
### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## Example ### Simple The whole config follows the `hocon` style ```hocon Assert { rules = { row_rules = [ { rule_type = MAX_ROW rule_value = 10 }, { rule_type = MIN_ROW rule_value = 5 } ], field_rules = [{ field_name = name field_type = string field_value = [ { rule_type = NOT_NULL }, { rule_type = MIN_LENGTH rule_value = 5 }, { rule_type = MAX_LENGTH rule_value = 10 } ] }, { field_name = age field_type = int field_value = [ { rule_type = NOT_NULL equals_to = 23 }, { rule_type = MIN rule_value = 32767 }, { rule_type = MAX rule_value = 2147483647 } ] } ] catalog_table_rule { primary_key_rule = { primary_key_name = "primary key" primary_key_columns = ["id"] } constraint_key_rule = [ { constraint_key_name = "unique_name" constraint_key_type = UNIQUE_KEY constraint_key_columns = [ { constraint_key_column_name = "id" constraint_key_sort_type = ASC } ] } ] column_rule = [ { name = "id" type = bigint }, { name = "name" type = string }, { name = "age" type = int } ] } } } ``` ### Complex Here is a more complex example about `equals_to`. The example involves FakeSource; to learn more about it, please read this [document](../source/FakeSource.md). 
```hocon source { FakeSource { row.num = 1 schema = { fields { c_null = "null" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_date = date c_timestamp = timestamp c_time = time c_bytes = bytes c_array = "array" c_map = "map" c_map_nest = "map" c_row = { c_null = "null" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_date = date c_timestamp = timestamp c_time = time c_bytes = bytes c_array = "array" c_map = "map" } } } rows = [ { kind = INSERT fields = [ null, "AAA", false, 1, 1, 333, 323232, 3.1, 9.33333, 99999.99999999, "2012-12-21", "2012-12-21T12:34:56", "12:34:56", "bWlJWmo=", [0, 1, 2], "{ 12:01:26 = v0 }", { k1 = [123, "BBB-BB"]}, [ null, "AAA", false, 1, 1, 333, 323232, 3.1, 9.33333, 99999.99999999, "2012-12-21", "2012-12-21T12:34:56", "12:34:56", "bWlJWmo=", [0, 1, 2], { k0 = v0 } ] ] } ] plugin_output = "fake" } } sink{ Assert { plugin_input = "fake" rules = { row_rules = [ { rule_type = MAX_ROW rule_value = 1 }, { rule_type = MIN_ROW rule_value = 1 } ], field_rules = [ { field_name = c_null field_type = "null" field_value = [ { rule_type = NULL } ] }, { field_name = c_string field_type = string field_value = [ { rule_type = NOT_NULL equals_to = "AAA" } ] }, { field_name = c_boolean field_type = boolean field_value = [ { rule_type = NOT_NULL equals_to = false } ] }, { field_name = c_tinyint field_type = tinyint field_value = [ { rule_type = NOT_NULL equals_to = 1 } ] }, { field_name = c_smallint field_type = smallint field_value = [ { rule_type = NOT_NULL equals_to = 1 } ] }, { field_name = c_int field_type = int field_value = [ { rule_type = NOT_NULL equals_to = 333 } ] }, { field_name = c_bigint field_type = bigint field_value = [ { rule_type = NOT_NULL equals_to = 323232 } ] }, { field_name = c_float 
field_type = float field_value = [ { rule_type = NOT_NULL equals_to = 3.1 } ] }, { field_name = c_double field_type = double field_value = [ { rule_type = NOT_NULL equals_to = 9.33333 } ] }, { field_name = c_decimal field_type = "decimal(30, 8)" field_value = [ { rule_type = NOT_NULL equals_to = 99999.99999999 } ] }, { field_name = c_date field_type = date field_value = [ { rule_type = NOT_NULL equals_to = "2012-12-21" } ] }, { field_name = c_timestamp field_type = timestamp field_value = [ { rule_type = NOT_NULL equals_to = "2012-12-21T12:34:56" } ] }, { field_name = c_time field_type = time field_value = [ { rule_type = NOT_NULL equals_to = "12:34:56" } ] }, { field_name = c_bytes field_type = bytes field_value = [ { rule_type = NOT_NULL equals_to = "bWlJWmo=" } ] }, { field_name = c_array field_type = "array" field_value = [ { rule_type = NOT_NULL equals_to = [0, 1, 2] } ] }, { field_name = c_map field_type = "map" field_value = [ { rule_type = NOT_NULL equals_to = "{ 12:01:26 = v0 }" } ] }, { field_name = c_map_nest field_type = "map" field_value = [ { rule_type = NOT_NULL equals_to = { k1 = [123, "BBB-BB"] } } ] }, { field_name = c_row field_type = { c_null = "null" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_date = date c_timestamp = timestamp c_time = time c_bytes = bytes c_array = "array" c_map = "map" } field_value = [ { rule_type = NOT_NULL equals_to = [ null, "AAA", false, 1, 1, 333, 323232, 3.1, 9.33333, 99999.99999999, "2012-12-21", "2012-12-21T12:34:56", "12:34:56", "bWlJWmo=", [0, 1, 2], { k0 = v0 } ] } ] } ] } } } ``` ### Assert Multiple Tables check multiple tables ```hocon env { parallelism = 1 job.mode = BATCH } source { FakeSource { tables_configs = [ { row.num = 16 schema { table = "test.table1" fields { c_int = int c_bigint = bigint } } }, { row.num = 17 schema { table = "test.table2" fields { c_string = string 
c_tinyint = tinyint } } } ] } } transform { } sink { Assert { rules = { tables_configs = [ { table_path = "test.table1" row_rules = [ { rule_type = MAX_ROW rule_value = 16 }, { rule_type = MIN_ROW rule_value = 16 } ], field_rules = [{ field_name = c_int field_type = int field_value = [ { rule_type = NOT_NULL } ] }, { field_name = c_bigint field_type = bigint field_value = [ { rule_type = NOT_NULL } ] }] }, { table_path = "test.table2" row_rules = [ { rule_type = MAX_ROW rule_value = 17 }, { rule_type = MIN_ROW rule_value = 17 } ], field_rules = [{ field_name = c_string field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = c_tinyint field_type = tinyint field_value = [ { rule_type = NOT_NULL } ] }] } ] } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Cassandra.md ================================================ import ChangeLog from '../changelog/connector-cassandra.md'; # Cassandra > Cassandra sink connector ## Description Write data to Apache Cassandra. ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-------------------|---------|----------|---------------| | host | String | Yes | - | | keyspace | String | Yes | - | | table | String | Yes | - | | username | String | No | - | | password | String | No | - | | datacenter | String | No | datacenter1 | | consistency_level | String | No | LOCAL_ONE | | fields | Array | No | - | | batch_size | int | No | 5000 | | batch_type | String | No | UNLOGGED | | async_write | boolean | No | true | ### host [string] `Cassandra` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"cassandra1:9042,cassandra2:9042"`. ### keyspace [string] The `Cassandra` keyspace. ### table [String] The `Cassandra` table name. ### username [string] `Cassandra` user username. ### password [string] `Cassandra` user password. 
### datacenter [String] The `Cassandra` datacenter, default is `datacenter1`. ### consistency_level [String] The `Cassandra` write consistency level, default is `LOCAL_ONE`. ### fields [array] The data field that needs to be output to `Cassandra` , if not configured, it will be automatically adapted according to the sink table `schema`. ### batch_size [number] The number of rows written through [Cassandra-Java-Driver](https://github.com/datastax/java-driver) each time, default is `5000`. ### batch_type [String] The `Cassandra` batch processing mode, default is `UNLOGGED`. ### async_write [boolean] Whether `cassandra` writes in asynchronous mode, default is `true`. ## Examples ```hocon sink { Cassandra { host = "localhost:9042" username = "cassandra" password = "cassandra" datacenter = "datacenter1" keyspace = "test" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Clickhouse.md ================================================ import ChangeLog from '../changelog/connector-clickhouse.md'; # Clickhouse > Clickhouse sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) > The Clickhouse sink plug-in can achieve accuracy once by implementing idempotent writing, and needs to cooperate with aggregatingmergetree and other engines that support deduplication. - [x] [support multiple table sink](../../introduction/concepts/connector-v2-features.md) ## Description Used to write data to Clickhouse. ## Supported DataSource Info In order to use the Clickhouse connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Dependency | |------------|--------------------|------------------------------------------------------------------------------------------| | Clickhouse | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-clickhouse) | ## Data Type Mapping | SeaTunnel Data Type | Clickhouse Data Type | |---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------| | STRING | String / Int128 / UInt128 / Int256 / UInt256 / Point / Ring / Polygon MultiPolygon | | INT | Int8 / UInt8 / Int16 / UInt16 / Int32 | | BIGINT | UInt64 / Int64 / IntervalYear / IntervalQuarter / IntervalMonth / IntervalWeek / IntervalDay / IntervalHour / IntervalMinute / IntervalSecond | | DOUBLE | Float64 | | DECIMAL | Decimal | | FLOAT | Float32 | | DATE | Date | | TIME | DateTime | | ARRAY | Array | | MAP | Map | ## Sink Options | Name | Type | Required | Default | Description | 
|---------------------------------------|---------|----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | host | String | Yes | - | `ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"`. | | database | String | Yes | - | The `ClickHouse` database. | | table | String | Yes | - | The table name. | | username | String | Yes | - | `ClickHouse` user username. | | password | String | Yes | - | `ClickHouse` user password. | | clickhouse.config | Map | No | | In addition to the above mandatory parameters that must be specified by `clickhouse-jdbc` , users can also specify multiple optional parameters, which cover all the [parameters](https://github.com/ClickHouse/clickhouse-jdbc/tree/master/clickhouse-client#configuration) provided by `clickhouse-jdbc`. | | bulk_size | String | No | 20000 | The number of rows written through [Clickhouse-jdbc](https://github.com/ClickHouse/clickhouse-jdbc) each time, the `default is 20000`. | | split_mode | String | No | false | This mode only support clickhouse table which engine is 'Distributed'.And `internal_replication` option-should be `true`.They will split distributed table data in seatunnel and perform write directly on each shard. The shard weight define is clickhouse will counted. | | sharding_key | String | No | - | When use split_mode, which node to send data to is a problem, the default is random selection, but the 'sharding_key' parameter can be used to specify the field for the sharding algorithm. This option only worked when 'split_mode' is true. 
| | primary_key | String | No | - | Mark the primary key column from clickhouse table, and based on primary key execute INSERT/UPDATE/DELETE to clickhouse table. | | support_upsert | Boolean | No | false | Support upsert row by query primary key. | | allow_experimental_lightweight_delete | Boolean | No | false | Allow experimental lightweight delete based on `*MergeTree` table engine. | | schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Schema save mode. Please refer to the `schema_save_mode` section below. | | data_save_mode | Enum | no | APPEND_DATA | Data save mode. Please refer to the `data_save_mode` section below. | | custom_sql | String | no | - | When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. | | save_mode_create_template | string | no | see below | See below. | | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. | ### schema_save_mode [Enum] Before starting the synchronization task, choose different processing options for the existing table schema. Option descriptions: `RECREATE_SCHEMA`: Create the table if it does not exist; drop and recreate the table when saving. `CREATE_SCHEMA_WHEN_NOT_EXIST`: Create the table if it does not exist; skip if the table already exists. `ERROR_WHEN_SCHEMA_NOT_EXIST`: Throw an error if the table does not exist. `IGNORE`: Ignore the processing of the table. ### data_save_mode [Enum] Before starting the synchronization task, choose different processing options for the existing data on the target side. Option descriptions: `DROP_DATA`: Retain the database schema but delete the data. `APPEND_DATA`: Retain the database schema and the data. `CUSTOM_PROCESSING`: Custom user-defined processing. `ERROR_WHEN_DATA_EXISTS`: Throw an error if data exists. 
### save_mode_create_template Automatically create Clickhouse tables using templates. The table creation statements will be generated based on the upstream data types and schema. The default template can be modified as needed. Default template: ```sql CREATE TABLE IF NOT EXISTS `${database}`.`${table}` ( ${rowtype_primary_key}, ${rowtype_fields} ) ENGINE = MergeTree() ORDER BY (${rowtype_primary_key}) PRIMARY KEY (${rowtype_primary_key}) SETTINGS index_granularity = 8192 COMMENT '${comment}'; ``` If custom fields are added to the template, for example, adding an `id` field: ```sql CREATE TABLE IF NOT EXISTS `${database}`.`${table}` ( id, ${rowtype_fields} ) ENGINE = MergeTree() ORDER BY (${rowtype_primary_key}) PRIMARY KEY (${rowtype_primary_key}) SETTINGS index_granularity = 8192 COMMENT '${comment}'; ``` The connector will automatically retrieve the corresponding types from the upstream source and fill in the template, removing the `id` field from the `rowtype_fields`. This method can be used to modify custom field types and attributes. The following placeholders can be used: - `database`: Retrieves the database from the upstream schema. - `table_name`: Retrieves the table name from the upstream schema. - `rowtype_fields`: Retrieves all fields from the upstream schema and automatically maps them to Clickhouse field descriptions. - `rowtype_primary_key`: Retrieves the primary key from the upstream schema (this may be a list). - `rowtype_unique_key`: Retrieves the unique key from the upstream schema (this may be a list). - `comment`: Retrieves the table comment from the upstream schema. 
## Example Configurations and Cases ### How to Create a Clickhouse Data Synchronization Jobs The following example demonstrates how to create a data synchronization job that writes randomly generated data to a Clickhouse database: ```bash # Set the basic configuration of the task to be performed env { parallelism = 1 job.mode = "BATCH" checkpoint.interval = 1000 } source { FakeSource { row.num = 2 bigint.min = 0 bigint.max = 10000000 split.num = 1 split.read-interval = 300 schema { fields { c_bigint = bigint } } } } sink { Clickhouse { host = "127.0.0.1:9092" database = "default" table = "test" username = "xxxxx" password = "xxxxx" } } ``` > Tips: > > 1.[SeaTunnel Deployment Document](../../getting-started/locally/deployment.md).
> 2. The table to be written to must be created before synchronization starts.
> 3. When the sink writes to a ClickHouse table, you don't need to set its schema, because the connector queries ClickHouse for the current table's schema information before writing.
### Clickhouse Sink Config ```hocon sink { Clickhouse { host = "localhost:8123" database = "default" table = "fake_all" username = "xxxxx" password = "xxxxx" clickhouse.config = { max_rows_to_read = "100" read_overflow_mode = "throw" } } } ``` ### Split Mode ```hocon sink { Clickhouse { host = "localhost:8123" database = "default" table = "fake_all" username = "xxxxx" password = "xxxxx" # split mode options split_mode = true sharding_key = "age" } } ``` ### CDC(Change data capture) Sink ```hocon sink { Clickhouse { host = "localhost:8123" database = "default" table = "fake_all" username = "xxxxx" password = "xxxxx" # cdc options primary_key = "id" support_upsert = true } } ``` ### CDC(Change data capture) for *MergeTree engine ```hocon sink { Clickhouse { host = "localhost:8123" database = "default" table = "fake_all" username = "xxxxx" password = "xxxxx" # cdc options primary_key = "id" support_upsert = true allow_experimental_lightweight_delete = true } } ``` ### Multiple table Sink Cases In ClickHouse, create the following two data tables in advance: ``` create table if not exists `default`.multi_sink_table1( `c_string` String, `c_boolean` Boolean, `c_tinyint` Int8, `c_smallint` Int16, `c_int` Int32, `c_bigint` Int64, `c_float` Float32, `c_double` Float64, `c_decimal` Decimal(30, 8), `c_date` Date, `c_time` DateTime64, `c_map` Map(String, Int32), `c_array` Array(Int32) )engine=Memory comment '''N''-N'; create table if not exists `default`.multi_sink_table2 as `default`.multi_sink_table1; ``` Then, the configuration to be used is referred to as follows: ``` env { parallelism = 1 job.mode = "BATCH" job.name = "fake_to_clickhouse_with_multi_table" } source { FakeSource { tables_configs = [ { schema = { table = "multi_sink_table1" fields { c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_date = date c_time = timestamp c_map = "map" c_array = 
"array" } } row.num = 100 }, { schema = { table = "multi_sink_table2" fields { c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_date = date c_time = timestamp c_map = "map" c_array = "array" } } row.num = 100 } ] plugin_output = "multi_sink_table" } } sink { Clickhouse { plugin_input = "multi_sink_table" host = "clickhouse:8123" database = "default" table = "${table_name}" username = "default" password = "" } } ``` After submitting the job and successfully executing it, we can see that the data volume of the ClickHouse data tables `multi_sink_table1` and `multi_sink_table2` is 100 for each. ## Changelog ================================================ FILE: docs/en/connectors/sink/ClickhouseFile.md ================================================ import ChangeLog from '../changelog/connector-clickhouse.md'; # ClickhouseFile > Clickhouse file sink connector ## Description Generate the clickhouse data file with the clickhouse-local program, and then send it to the clickhouse server, also call bulk load. This connector only support clickhouse table which engine is 'Distributed'.And `internal_replication` option should be `true`. Supports Batch and Streaming mode. 
## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) :::tip Write data to Clickhouse can also be done using JDBC ::: ## Options | Name | Type | Required | Default | |------------------------|---------|----------|----------------------------------------| | host | string | yes | - | | database | string | yes | - | | table | string | yes | - | | username | string | yes | - | | password | string | yes | - | | clickhouse_local_path | string | yes | - | | sharding_key | string | no | - | | copy_method | string | no | scp | | node_free_password | boolean | no | false | | node_pass | list | no | - | | node_pass.node_address | string | no | - | | node_pass.username | string | no | "root" | | node_pass.password | string | no | - | | compatible_mode | boolean | no | false | | file_fields_delimiter | string | no | "\t" | | file_temp_path | string | no | "/tmp/seatunnel/clickhouse-local/file" | | key_path | string | no | "/tmp/id_rsa" | | common-options | | no | - | ### host [string] `ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"` . ### database [string] The `ClickHouse` database ### table [string] The table name ### username [string] `ClickHouse` user username ### password [string] `ClickHouse` user password ### sharding_key [string] When ClickhouseFile split data, which node to send data to is a problem, the default is random selection, but the 'sharding_key' parameter can be used to specify the field for the sharding algorithm. ### clickhouse_local_path [string] The address of the clickhouse-local program on the spark node. Since each task needs to be called, clickhouse-local should be located in the same path of each spark node. 
### copy_method [string] Specifies the method used to transfer files. The default is scp; the supported values are scp and rsync. ### node_free_password [boolean] Because SeaTunnel needs to use scp or rsync for file transfer, it needs access to the ClickHouse servers. If each spark node and clickhouse server are configured with password-free login, you can set this option to true; otherwise you need to configure the corresponding node password in the node_pass configuration. ### node_pass [list] Used to save the addresses and corresponding passwords of all clickhouse servers ### node_pass.node_address [string] The address corresponding to the clickhouse server ### node_pass.username [string] The username corresponding to the clickhouse server, default root user. ### node_pass.password [string] The password corresponding to the clickhouse server. ### compatible_mode [boolean] In lower versions of Clickhouse, the ClickhouseLocal program does not support the `--path` parameter, so you need to enable this mode to achieve the function of the `--path` parameter in another way. ### file_fields_delimiter [string] ClickhouseFile uses csv format to temporarily save data. If the data in a row contains the csv delimiter value, it may cause program exceptions. Use this option to avoid that. The value must be exactly one character long. ### file_temp_path [string] The directory where ClickhouseFile stores temporary files locally. ### key_path [string] The path of the private key file used for scp or rsync to connect to the ClickHouse server.
### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## Examples ```hocon ClickhouseFile { host = "192.168.0.1:8123" database = "default" table = "fake_all" username = "default" password = "" clickhouse_local_path = "/Users/seatunnel/Tool/clickhouse local" sharding_key = "age" node_free_password = false node_pass = [{ node_address = "192.168.0.1" password = "seatunnel" }] } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Cloudberry.md ================================================ import ChangeLog from '../changelog/connector-cloudberry.md'; # Cloudberry > JDBC Cloudberry Sink Connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Write data through JDBC. Cloudberry currently does not have its own native driver. It uses PostgreSQL's driver for connectivity and follows PostgreSQL's implementation. Supports Batch mode and Streaming mode, concurrent writing, and exactly-once semantics (guaranteed by XA transactions). ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) > `Xa transactions` are used to ensure `exactly-once`, so `exactly-once` is only supported for databases that > support `Xa transactions`. You can set `is_exactly_once=true` to enable it. ## Supported DataSource Info | Datasource | Supported Versions | Driver | Url | Maven | |------------|------------------------------------------|------------------------|---------------------------------------|--------------------------------------------------------------------------| | Cloudberry | Uses PostgreSQL driver implementation | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/org.postgresql/postgresql) | ## Database Dependency > Please download the PostgreSQL driver jar and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
> For example: cp postgresql-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping Cloudberry uses PostgreSQL's data type implementation. Please refer to PostgreSQL documentation for data type compatibility and mappings. ## Options Cloudberry connector uses the same options as PostgreSQL. For detailed configuration options, please refer to the PostgreSQL documentation. Key options include: - url (required): The JDBC connection URL - driver (required): The driver class name (org.postgresql.Driver) - user/password: Authentication credentials - query or database/table combination: What data to write and how - is_exactly_once: Enable exactly-once semantics with XA transactions - batch_size: Control batch writing behavior ## Task Example ### Simple ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } sink { jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" query = "insert into test_table(name,age) values(?,?)" } } ``` ### Generate Sink SQL ```hocon sink { Jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" generate_sink_sql = true database = "mydb" table = "public.test_table" } } ``` ### Exactly-once ```hocon sink { jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" query = "insert into test_table(name,age) values(?,?)" is_exactly_once = "true" xa_data_source_class_name = "org.postgresql.xa.PGXADataSource" } } ``` ### CDC(Change Data Capture) Event ```hocon sink { jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" generate_sink_sql = true database = "mydb" table = "sink_table" primary_keys = ["id","name"] 
field_ide = UPPERCASE } } ``` ### Save mode function ```hocon sink { Jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" generate_sink_sql = true database = "mydb" table = "public.test_table" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` For more detailed examples and options, please refer to the PostgreSQL connector documentation. ## Changelog ================================================ FILE: docs/en/connectors/sink/Console.md ================================================ import ChangeLog from '../changelog/connector-console.md'; # Console > Console sink connector ## Support Connector Version - All versions ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Used to send data to Console. Supports both streaming and batch mode. > For example, if the data from upstream is [`age: 12, name: jared`], the content sent to the console is the following: `{"name":"jared","age":12}` ## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | Name | Type | Required | Default | Description | |--------------------|---------|----------|---------|-------------------------------------------------------------------------------------------------------------| | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | | log.print.data | boolean | No | - | Flag to determine whether data should be printed in the logs. The default value is `true` | | log.print.delay.ms | int | No | - | Delay in milliseconds between printing each data item to the logs. The default value is `0`. | ## Task Example ### Simple > This example writes randomly generated data to the console with a parallelism of 1 ``` env { parallelism = 1 job.mode = "STREAMING" } source { FakeSource { plugin_output = "fake" schema = { fields { name = "string" age = "int" } } } } sink { Console { plugin_input = "fake" } } ``` ### Multiple Sources Simple > This example uses multiple sources; each Console sink uses `plugin_input` to specify which source's data it prints ``` env { parallelism = 1 job.mode = "STREAMING" } source { FakeSource { plugin_output = "fake1" schema = { fields { id = "int" name = "string" age = "int" sex = "string" } } } FakeSource { plugin_output = "fake2" schema = { fields { name = "string" age = "int" } } } } sink { Console { plugin_input = "fake1" } Console { plugin_input = "fake2" } } ``` ## Console Sample Data This is a printout from our console ``` 2022-12-19 11:01:45,417 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - output rowType: name, age 2022-12-19 11:01:46,489 INFO
org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=1: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: CpiOd, 8520946 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=2: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: eQqTs, 1256802974 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=3: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: UsRgO, 2053193072 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=4: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jDQJj, 1993016602 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=5: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: rqdKp, 1392682764 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=6: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: wCoWN, 986999925 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=7: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: qomTU, 72775247 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=8: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jcqXR, 1074529204 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=9: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: AkWIO, 1961723427 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=10: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: hBoib, 929089763 ``` ## Changelog 
================================================ FILE: docs/en/connectors/sink/CosFile.md ================================================ import ChangeLog from '../changelog/connector-file-cos.md'; # CosFile > Cos file sink connector ## Description Output data to cos file system. :::tip If you use spark/flink, in order to use this connector, you must ensure your spark/flink cluster has already integrated hadoop. The tested hadoop version is 2.x. If you use SeaTunnel Engine, it automatically integrates the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. To use this connector you need to put hadoop-cos-{hadoop.version}-{version}.jar and cos_api-bundle-{version}.jar in the ${SEATUNNEL_HOME}/lib dir, download: [Hadoop-Cos-release](https://github.com/tencentyun/hadoop-cos/releases). It only supports hadoop version 2.6.5+ and Hadoop-Cos version 8.0.2+. ::: ## Key Features - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place.
- [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) By default, we use 2PC commit to ensure `exactly-once` - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] canal_json - [x] debezium_json - [x] maxwell_json ## Options | Name | Type | Required | Default | Description | |---------------------------------------|---------|----------|--------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | path | string | yes | - | | | tmp_path | string | no | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. Need a COS dir. | | bucket | string | yes | - | | | secret_id | string | yes | - | | | secret_key | string | yes | - | | | region | string | yes | - | | | custom_filename | boolean | no | false | Whether you need custom the filename | | file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | | filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | | file_format_type | string | no | "csv" | | | filename_extension | string | no | - | Override the default file name extensions with custom file name extensions. E.g. `.xml`, `.json`, `dat`, `.customtype` | | field_delimiter | string | no | '\001' for text and ',' for csv | Only used when file_format is text and csv | | row_delimiter | string | no | "\n" | Only used when file_format is `text`, `csv` and `json` | | have_partition | boolean | no | false | Whether you need processing partitions. 
| | partition_by | array | no | - | Only used when have_partition is true | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used when have_partition is true | | is_partition_field_write_in_file | boolean | no | false | Only used when have_partition is true | | sink_columns | array | no | | When this parameter is empty, all fields are sink columns | | is_enable_transaction | boolean | no | true | | | batch_size | int | no | 1000000 | | | compress_codec | string | no | none | | | common-options | object | no | - | | | max_rows_in_memory | int | no | - | Only used when file_format is excel. | | sheet_max_rows | int | no | 1048576 | Only used when file_format is excel. | | sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. | | csv_string_quote_mode | enum | no | MINIMAL | Only used when file_format is csv. | | xml_root_tag | string | no | RECORDS | Only used when file_format is xml. | | xml_row_tag | string | no | RECORD | Only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | | create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | | encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | | merge_update_event | boolean | no | false | Only used when file_format_type is canal_json,debezium_json or maxwell_json.
When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data | ### path [string] The target dir path is required. ### bucket [string] The bucket address of cos file system, for example: `cosn://seatunnel-test-1259587829` ### secret_id [string] The secret id of cos file system. ### secret_key [string] The secret key of cos file system. ### region [string] The region of cos file system. ### custom_filename [boolean] Whether custom the filename ### file_name_expression [string] Only used when `custom_filename` is `true` `file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, `${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. ### filename_time_format [string] Only used when `custom_filename` is `true` When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows: | Symbol | Description | |--------|--------------------| | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | ### file_format_type [string] We supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `canal_json` `debezium_json` `maxwell_json` Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. ### field_delimiter [string] The separator between columns in a row of data. Only needed by `text` and `csv` file format. ### row_delimiter [string] The separator between rows in a file. 
Only needed by `text`, `csv` and `json` file format. ### have_partition [boolean] Whether you need processing partitions. ### partition_by [array] Only used when `have_partition` is `true`. Partition data based on selected fields. ### partition_dir_expression [string] Only used when `have_partition` is `true`. If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. ### is_partition_field_write_in_file [boolean] Only used when `have_partition` is `true`. If `is_partition_field_write_in_file` is `true`, the partition field and its value will be written into the data file. For example, if you want to write a Hive data file, its value should be `false`. ### sink_columns [array] Which columns need to be written to the file; the default value is all the columns obtained from `Transform` or `Source`. The order of the fields determines the order in which the file is actually written. ### is_enable_transaction [boolean] If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. Please note that, if `is_enable_transaction` is `true`, we will automatically add `${transactionId}_` to the head of the file. Only `true` is supported now. ### batch_size [int] The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined jointly by `batch_size` and `checkpoint.interval`. If the value of `checkpoint.interval` is large enough, the sink writer will write rows into a file until the number of rows in the file is larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint is triggered.
### compress_codec [string] The compress codec of files. The supported codecs for each file format are shown below: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc: `lzo` `snappy` `lz4` `zlib` `none` - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` Tips: excel type does not support any compression format ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ### max_rows_in_memory [int] When the file format is Excel, the maximum number of data items that can be cached in the memory. ### sheet_max_rows [int] When file format is Excel, the maximum number of rows per sheet. ### sheet_name [string] The name of the sheet written to the workbook ### csv_string_quote_mode [string] When the file format is CSV, the string quote mode of CSV. - ALL: All String fields will be quoted. - MINIMAL: Quotes fields which contain special characters such as the field delimiter, quote character or any of the characters in the line separator string. - NONE: Never quotes fields. When the delimiter occurs in data, the printer prefixes it with the escape character. If the escape character is not set, format validation throws an exception. ### xml_root_tag [string] Specifies the tag name of the root element within the XML file. ### xml_row_tag [string] Specifies the tag name of the data rows within the XML file. ### xml_use_attr_format [boolean] Specifies whether to process data using the tag attribute format. ### parquet_avro_write_timestamp_as_int96 [boolean] Support writing Parquet INT96 from a timestamp, only valid for parquet files. ### parquet_avro_write_fixed_as_int96 [array] Support writing Parquet INT96 from a 12-byte field, only valid for parquet files. ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`.
### merge_update_event [boolean] Only used when file_format_type is canal_json,debezium_json or maxwell_json. When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data ## Example For text file format with `have_partition` and `custom_filename` and `sink_columns` ```hocon CosFile { path="/sink" bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true } ``` For parquet file format with `have_partition` and `sink_columns` ```hocon CosFile { path="/sink" bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true file_format_type = "parquet" sink_columns = ["name","age"] } ``` For orc file format simple config ```bash CosFile { path="/sink" bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" file_format_type = "orc" } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/DB2.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # DB2 > JDBC DB2 Sink Connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once semantics (using XA transaction guarantee). ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) > Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is > support `Xa transactions`. You can set `is_exactly_once=true` to enable it. ## Supported DataSource Info | Datasource | Supported Versions | Driver | Url | Maven | |------------|----------------------------------------------------------|--------------------------------|-----------------------------------|-----------------------------------------------------------------------| | DB2 | Different dependency version has different driver class. | com.ibm.db2.jdbc.app.DB2Driver | jdbc:db2://127.0.0.1:50000/dbname | [Download](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) | ## Data Type Mapping | DB2 Data Type | SeaTunnel Data Type | |------------------------------------------------------------------------------------------------------|---------------------| | BOOLEAN | BOOLEAN | | SMALLINT | SHORT | | INT
INTEGER
| INTEGER | | BIGINT | LONG | | DECIMAL
DEC
NUMERIC
NUM | DECIMAL(38,18) | | REAL | FLOAT | | FLOAT
DOUBLE
DOUBLE PRECISION
DECFLOAT | DOUBLE | | CHAR
VARCHAR
LONG VARCHAR
CLOB
GRAPHIC
VARGRAPHIC
LONG VARGRAPHIC
DBCLOB | STRING | | BLOB | BYTES | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | ROWID
XML | Not supported yet | ## Sink Options | Name | Type | Required | Default | Description | |-------------------------------------------|---------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:db2://127.0.0.1:50000/dbname | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use DB2 the value is `com.ibm.db2.jdbc.app.DB2Driver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority | | database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generating sql. | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | | max_retries | Int | No | 0 | The number of retries when submitting a batch (executeBatch) fails | | batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | | is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | | generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | | xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, DB2 is `com.ibm.db2.jcc.DB2XADataSource`, and
please refer to appendix for other data sources | | max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | | transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | | auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | | properties | Map | No | - | Additional connection configuration parameters,when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | ### Tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. ## Task Example ### Simple > This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your DB2. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../getting-started/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) to run this job. 
``` # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } # If you would like to get more information about how to configure seatunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/source } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2 } sink { jdbc { url = "jdbc:db2://127.0.0.1:50000/dbname" driver = "com.ibm.db2.jdbc.app.DB2Driver" username = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink } ``` ### Generate Sink SQL > This example does not require writing complex sql statements; you can configure the database name and table name to automatically generate insert statements for you ``` sink { jdbc { url = "jdbc:db2://127.0.0.1:50000/dbname" driver = "com.ibm.db2.jdbc.app.DB2Driver" username = "root" password = "123456" # Automatically generate sql statements based on database table names generate_sink_sql = true database = test table = test_table } } ``` ### Exactly-once > For scenarios that require precise writes, we guarantee exactly-once semantics ``` sink { jdbc { url = "jdbc:db2://127.0.0.1:50000/dbname" driver = "com.ibm.db2.jdbc.app.DB2Driver" max_retries = 0 username = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" is_exactly_once = "true" xa_data_source_class_name = "com.ibm.db2.jcc.DB2XADataSource" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Databend.md 
================================================ import ChangeLog from '../changelog/connector-databend.md'; # Databend > Databend sink connector ## Supported Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [ ] [Support Multi-table Writing](../../introduction/concepts/connector-v2-features.md) - [x] [Exactly-Once](../../introduction/concepts/connector-v2-features.md) - [x] [CDC](../../introduction/concepts/connector-v2-features.md) - [x] [Parallelism](../../introduction/concepts/connector-v2-features.md) ## Description A sink connector for writing data to Databend. Supports both batch and streaming processing modes. The Databend sink internally implements bulk data import through stage attachment. ## Dependencies ### For Spark/Flink > 1. You need to download the [Databend JDBC driver jar package](https://github.com/databendlabs/databend-jdbc/) and add it to the directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta > 1. You need to download the [Databend JDBC driver jar package](https://github.com/databendlabs/databend-jdbc/) and add it to the directory `${SEATUNNEL_HOME}/lib/`. ## Sink Options | Name | Type | Required | Default Value | Description | |---------------------|------|----------|---------------|---------------------------------------------| | url | String | Yes | - | Databend JDBC connection URL | | username | String | Yes | - | Databend database username | | password | String | Yes | - | Databend database password | | database | String | No | - | Databend database name, defaults to the database name specified in the connection URL | | table | String | No | - | Databend table name | | batch_size | Integer | No | 1000 | Number of records for batch writing | | auto_commit | Boolean | No | true | Whether to auto-commit transactions | | max_retries | Integer | No | 3 | Maximum retry attempts on write failure | | schema_save_mode | Enum | No | CREATE_SCHEMA_WHEN_NOT_EXIST | Schema save mode | | data_save_mode | Enum | No | APPEND_DATA | Data save mode | | custom_sql | String | No | - | Custom write SQL, typically used for complex write scenarios | | execute_timeout_sec | Integer | No | 300 | SQL execution timeout (seconds) | | 
jdbc_config | Map | No | - | Additional JDBC connection configuration, such as connection timeout parameters | | conflict_key | String | No | - | Conflict key for CDC mode, used to determine the primary key for conflict resolution | | enable_delete | Boolean | No | false | Whether to allow delete operations in CDC mode | ### schema_save_mode [Enum] Before starting the synchronization task, choose different processing schemes for existing table structures. Option descriptions: `RECREATE_SCHEMA`: Create when table doesn't exist, drop and recreate when table exists. `CREATE_SCHEMA_WHEN_NOT_EXIST`: Create when table doesn't exist, skip when table exists. `ERROR_WHEN_SCHEMA_NOT_EXIST`: Report error when table doesn't exist. `IGNORE`: Ignore table processing. ### data_save_mode [Enum] Before starting the synchronization task, choose different processing schemes for existing data on the target side. Option descriptions: `DROP_DATA`: Retain database structure and delete data. `APPEND_DATA`: Retain database structure and data. `CUSTOM_PROCESSING`: User-defined processing. `ERROR_WHEN_DATA_EXISTS`: Report error when data exists. 
## Data Type Mapping | SeaTunnel Data Type | Databend Data Type | |-----------------|---------------| | BOOLEAN | BOOLEAN | | TINYINT | TINYINT | | SMALLINT | SMALLINT | | INT | INT | | BIGINT | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | STRING | STRING | | BYTES | VARBINARY | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | ## Task Examples ### Simple Example ```hocon env { execution.parallelism = 1 job.mode = "BATCH" } source { FakeSource { row.num = 10 schema = { fields { name = string age = int score = double } } } } sink { Databend { url = "jdbc:databend://localhost:8000" username = "root" password = "" database = "default" table = "target_table" batch_size = 1000 } } ``` ### Writing with Custom SQL ```hocon sink { Databend { url = "jdbc:databend://localhost:8000" username = "root" password = "" database = "default" table = "target_table" custom_sql = "INSERT INTO default.target_table(name, age, score) VALUES(?, ?, ?)" } } ``` ### Using Schema Save Mode ```hocon sink { Databend { url = "jdbc:databend://localhost:8000" username = "root" password = "" database = "default" table = "target_table" schema_save_mode = "RECREATE_SCHEMA" data_save_mode = "APPEND_DATA" } } ``` ### CDC mode ```hocon sink { Databend { url = "jdbc:databend://databend:8000/default?ssl=false" username = "root" password = "" database = "default" table = "sink_table" # Enable CDC mode batch_size = 1 conflict_key = "id" enable_delete = true } } ``` ## Related Links - [Databend Official Website](https://databend.rs/) - [Databend JDBC Driver](https://github.com/databendlabs/databend-jdbc/) ## Changelog ================================================ FILE: docs/en/connectors/sink/Datahub.md ================================================ import ChangeLog from '../changelog/connector-datahub.md'; # DataHub > DataHub sink connector ## Description A sink plugin which use send message to DataHub ## Key features - [ ] 
[exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |----------------|--------|----------|---------------| | endpoint | string | yes | - | | accessId | string | yes | - | | accessKey | string | yes | - | | project | string | yes | - | | topic | string | yes | - | | timeout | int | no | 3000 | | retryTimes | int | no | 3 | | common-options | | no | - | ### endpoint [string] your DataHub endpoint, starting with http (string) ### accessId [string] your DataHub accessId, which can be obtained from Alibaba Cloud (string) ### accessKey [string] your DataHub accessKey, which can be obtained from Alibaba Cloud (string) ### project [string] your DataHub project which is created in Alibaba Cloud (string) ### topic [string] your DataHub topic (string) ### timeout [int] the max connection timeout (int) ### retryTimes [int] the max retry times when your client fails to put a record (int) ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## Example ```hocon sink { DataHub { endpoint="yourendpoint" accessId="xxx" accessKey="xxx" project="projectname" topic="topicname" timeout=3000 retryTimes=3 } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/DingTalk.md ================================================ import ChangeLog from '../changelog/connector-dingtalk.md'; # DingTalk > DingTalk sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Description A sink plugin which uses a DingTalk robot to send messages ## Options | name | type | required | default value | |----------------|--------|----------|---------------| | url | String | yes | - | | secret | String | yes | - | | common-options | | no | - | ### url [String] DingTalk robot address format is https://oapi.dingtalk.com/robot/send?access_token=XXXXXX (String) ### secret [String] DingTalk robot secret (String) ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## Example ```hocon sink { DingTalk { url="https://oapi.dingtalk.com/robot/send?access_token=ec646cccd028d978a7156ceeac5b625ebd94f586ea0743fa501c100007890" secret="SEC093249eef7aa57d4388aa635f678930c63db3d28b2829d5b2903fc1e5c10000" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Doris.md ================================================ import ChangeLog from '../changelog/connector-doris.md'; # Doris > Doris sink connector ## Support Doris Version - exactly-once & cdc supported `Doris version is >= 1.1.x` - Array data type supported `Doris version is >= 1.2.x` - Map data type will be supported in `Doris version is 2.x` ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## Description Used to send data to Doris. Both support streaming and batch mode. The internal implementation of Doris sink connector is cached and imported by stream load in batches. ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Sink Options | Name | Type | Required | Default | Description | |--------------------------------|---------|----------|------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | fenodes | String | Yes | - | `Doris` cluster fenodes address, the format is `"fe_ip:fe_http_port, ..."` | | query-port | int | No | 9030 | `Doris` Fenodes query_port | | username | String | Yes | - | `Doris` user username | | password | String | Yes | - | `Doris` user password | | database | String | Yes | - | The database name of `Doris` table, use `${database_name}` to represent the upstream table name | | table | String | Yes | - | The table name of `Doris` table, use `${table_name}` to represent the upstream table name | | table.identifier | String | Yes | - | The name of `Doris` table, it will deprecate after version 2.3.5, please use `database` and `table` instead. 
| | sink.label-prefix | String | Yes | - | The label prefix used by stream load imports. In the 2pc scenario, global uniqueness is required to ensure the EOS semantics of SeaTunnel. | | sink.enable-2pc | bool | No | false | Whether to enable two-phase commit (2pc), the default is false. For two-phase commit, please refer to [here](https://doris.apache.org/docs/data-operate/transaction?_highlight=two&_highlight=phase#stream-load-2pc). | | sink.enable-delete | bool | No | - | Whether to enable deletion. This option requires Doris table to enable batch delete function (0.15+ version is enabled by default), and only supports Unique model. you can get more detail at this [link](https://doris.apache.org/docs/dev/data-operate/delete/batch-delete-manual/) | | sink.check-interval | int | No | 10000 | check exception with the interval while loading | | sink.max-retries | int | No | 3 | the max retry times if writing records to database failed | | sink.buffer-size | int | No | 256 * 1024 | the buffer size to cache data for stream load. | | sink.buffer-count | int | No | 3 | the buffer count to cache data for stream load. | | doris.batch.size | int | No | 1024 | the batch size of the write to doris each http request, when the row reaches the size or checkpoint is executed, the data of cached will write to server. | | needs_unsupported_type_casting | boolean | No | false | Whether to enable the unsupported type casting, such as Decimal64 to Double | | case_sensitive | boolean | No | true | Whether to preserve the original case of table and column names. When set to false, table and column names will be converted to lowercase. 
| | schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | the schema save mode, please refer to `schema_save_mode` below | | data_save_mode | Enum | no | APPEND_DATA | the data save mode, please refer to `data_save_mode` below | | save_mode_create_template | string | no | see below | see below | | custom_sql | String | no | - | When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. | | doris.config | map | yes | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql,and supported formats. | ### schema_save_mode [Enum] Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. Option introduction: `RECREATE_SCHEMA` :Will create when the table does not exist, delete and rebuild when the table is saved `CREATE_SCHEMA_WHEN_NOT_EXIST` :Will Created when the table does not exist, skipped when the table is saved `ERROR_WHEN_SCHEMA_NOT_EXIST` :Error will be reported when the table does not exist `IGNORE` :Ignore the treatment of the table ### data_save_mode [Enum] Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. Option introduction: `DROP_DATA`: Preserve database structure and delete data `APPEND_DATA`:Preserve database structure, preserve data `CUSTOM_PROCESSING`:User defined processing `ERROR_WHEN_DATA_EXISTS`:When there is data, an error is reported ### save_mode_create_template We use templates to automatically create Doris tables, which will create corresponding table creation statements based on the type of upstream data and schema type, and the default template can be modified according to the situation. 
Default template: ```sql CREATE TABLE IF NOT EXISTS `${database}`.`${table}` ( ${rowtype_primary_key}, ${rowtype_fields} ) ENGINE=OLAP UNIQUE KEY (${rowtype_primary_key}) COMMENT '${comment}' DISTRIBUTED BY HASH (${rowtype_primary_key}) PROPERTIES ( "replication_allocation" = "tag.location.default: 1", "in_memory" = "false", "storage_format" = "V2", "disable_auto_compaction" = "false" ) ``` If a custom field is filled in the template, such as adding an `id` field ```sql CREATE TABLE IF NOT EXISTS `${database}`.`${table}` ( id, ${rowtype_fields} ) ENGINE = OLAP UNIQUE KEY (${rowtype_primary_key}) COMMENT '${comment}' DISTRIBUTED BY HASH (${rowtype_primary_key}) PROPERTIES ( "replication_num" = "1" ); ``` The connector will automatically obtain the corresponding type from the upstream to complete the filling, and remove the id field from `rowtype_fields`. This method can be used to customize the modification of field types and attributes. You can use the following placeholders - database: Used to get the database in the upstream schema - table_name: Used to get the table name in the upstream schema - rowtype_fields: Used to get all the fields in the upstream schema, we will automatically map to the field description of Doris - rowtype_primary_key: Used to get the primary key in the upstream schema (maybe a list) - rowtype_unique_key: Used to get the unique key in the upstream schema (maybe a list) - rowtype_duplicate_key: Used to get the duplicate key in the upstream schema (only for doris source, maybe a list) - comment: Used to get the table comment in the upstream schema ## Data Type Mapping | Doris Data Type | SeaTunnel Data Type | |-----------------|-----------------------------------------| | BOOLEAN | BOOLEAN | | TINYINT | TINYINT | | SMALLINT | SMALLINT
TINYINT | | INT | INT
SMALLINT
TINYINT | | BIGINT | BIGINT
INT
SMALLINT
TINYINT | | LARGEINT | BIGINT
INT
SMALLINT
TINYINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE
FLOAT | | DECIMAL | DECIMAL
DOUBLE
FLOAT | | DATE | DATE | | DATETIME | TIMESTAMP | | CHAR | STRING | | VARCHAR | STRING | | STRING | STRING | | ARRAY | ARRAY | | MAP | MAP | | JSON | STRING | | HLL | Not supported yet | | BITMAP | Not supported yet | | QUANTILE_STATE | Not supported yet | | STRUCT | Not supported yet | #### Supported import data formats The supported formats include CSV and JSON ## Tuning Guide Appropriately increasing the value of `sink.buffer-size` and `doris.batch.size` can increase the write performance. In stream mode, if the `doris.batch.size` and `checkpoint.interval` are both configured with a large value, The last data to arrive may have a large delay(The delay time is the checkpoint interval). This is because the total amount of data arriving at the end may not exceed the threshold specified by `doris.batch.size`. Therefore, commit can only be triggered by checkpoint before the volume of received data does not exceed this threshold. Therefore, you should select an appropriate `checkpoint.interval`. Otherwise, if you enable the 2pc by the property `sink.enable-2pc=true`.The `sink.buffer-size` will have no effect. So only the checkpoint can trigger the commit. 
## Task Example ### Simple > The following example describes writing multiple data types to Doris, and users need to create corresponding tables downstream ```hocon env { parallelism = 1 job.mode = "BATCH" checkpoint.interval = 10000 } source { FakeSource { row.num = 10 map.size = 10 array.size = 10 bytes.length = 10 string.length = 10 schema = { fields { c_map = "map<string, array<int>>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(16, 1)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } sink { Doris { fenodes = "doris_cdc_e2e:8030" username = root password = "" database = "test" table = "e2e_table_sink" sink.label-prefix = "test-cdc" sink.enable-2pc = "true" sink.enable-delete = "true" doris.config { format = "json" read_json_by_line = "true" } } } ``` ### CDC(Change Data Capture) Event > This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to Doris Sink,FakeSource simulates CDC data with schema, score (int type),Doris needs to create a table sink named test.e2e_table_sink and a corresponding table for it. 
```hocon env { parallelism = 1 job.mode = "BATCH" checkpoint.interval = 10000 } source { FakeSource { schema = { fields { pk_id = bigint name = string score = int sex = boolean number = tinyint height = float sight = double create_time = date update_time = timestamp } } rows = [ { kind = INSERT fields = [1, "A", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] }, { kind = INSERT fields = [2, "B", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] }, { kind = INSERT fields = [3, "C", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] }, { kind = UPDATE_BEFORE fields = [1, "A", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] }, { kind = UPDATE_AFTER fields = [1, "A_1", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] }, { kind = DELETE fields = [2, "B", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] } ] } } sink { Doris { fenodes = "doris_cdc_e2e:8030" username = root password = "" database = "test" table = "e2e_table_sink" sink.label-prefix = "test-cdc" sink.enable-2pc = "true" sink.enable-delete = "true" doris.config { format = "json" read_json_by_line = "true" } } } ``` ### Use JSON format to import data ``` sink { Doris { fenodes = "e2e_dorisdb:8030" username = root password = "" database = "test" table = "e2e_table_sink" sink.enable-2pc = "true" sink.label-prefix = "test_json" doris.config = { format="json" read_json_by_line="true" } } } ``` ### Use CSV format to import data ``` sink { Doris { fenodes = "e2e_dorisdb:8030" username = root password = "" database = "test" table = "e2e_table_sink" sink.enable-2pc = "true" sink.label-prefix = "test_csv" doris.config = { format = "csv" column_separator = "," } } } ``` ### Case-Sensitive Configuration ```hocon sink { Doris { fenodes = "e2e_dorisdb:8030" username = root password = "" database = "Test_DB" # Original case will be preserved table = "Test_Table" # Original case will be preserved case_sensitive = true # Default value, preserves 
original case sink.enable-2pc = "true" sink.label-prefix = "test_case_sensitive" doris.config = { format = "json" read_json_by_line = "true" } } } ``` ### Multiple table #### example1 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { Doris { fenodes = "doris_cdc_e2e:8030" username = root password = "" database = "${database_name}_test" table = "${table_name}_test" sink.label-prefix = "test-cdc" sink.enable-2pc = "true" sink.enable-delete = "true" doris.config { format = "json" read_json_by_line = "true" } } } ``` #### example2 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = oracle.jdbc.driver.OracleDriver url = "jdbc:oracle:thin:@localhost:1521/XE" user = testUser password = testPassword table_list = [ { table_path = "TESTSCHEMA.TABLE_1" }, { table_path = "TESTSCHEMA.TABLE_2" } ] } } transform { } sink { Doris { fenodes = "doris_cdc_e2e:8030" username = root password = "" database = "${schema_name}_test" table = "${table_name}_test" sink.label-prefix = "test-cdc" sink.enable-2pc = "true" sink.enable-delete = "true" doris.config { format = "json" read_json_by_line = "true" } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Druid.md ================================================ import ChangeLog from '../changelog/connector-druid.md'; # Druid > Druid sink connector ## Description Write data to Druid ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## Data Type Mapping | SeaTunnel Data Type | Druid Data Type | |---------------------|-----------------| | TINYINT | LONG | | SMALLINT | LONG | | INT | LONG | | BIGINT | LONG | | FLOAT | 
FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DOUBLE | | STRING | STRING | | BOOLEAN | STRING | | TIMESTAMP | STRING | ## Options | name | type | required | default value | |----------------|--------|----------|---------------| | coordinatorUrl | string | yes | - | | datasource | string | yes | - | | batchSize | int | no | 10000 | | common-options | | no | - | ### coordinatorUrl [string] The coordinatorUrl host and port of Druid, example: "myHost:8888" ### datasource [string] The datasource name you want to write, example: "seatunnel" ### batchSize [int] The number of rows flushed to Druid per batch. Default value is `1024`. ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## Example Simple example: ```hocon sink { Druid { coordinatorUrl = "testHost:8888" datasource = "seatunnel" } } ``` Use placeholders get upstream table metadata example: ```hocon sink { Druid { coordinatorUrl = "testHost:8888" datasource = "${table_name}_test" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/DuckDB.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # DuckDB > JDBC DuckDB Sink Connector ## Support DuckDB Version - 0.8.x/0.9.x/0.10.x/1.x ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once semantics (using XA transaction guarantee). ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.duckdb/duckdb_jdbc) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.duckdb/duckdb_jdbc) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) > Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is > support `Xa transactions`. You can set `is_exactly_once=true` to enable it. ## Supported DataSource Info | Datasource | Supported Versions | Driver | Url | Maven | |------------|----------------------------------------------------------|-------------------------|----------------------------------|-----------------------------------------------------------------------| | DuckDB | Different dependency version has different driver class. | org.duckdb.DuckDBDriver | jdbc:duckdb:/path/to/database.db | [Download](https://mvnrepository.com/artifact/org.duckdb/duckdb_jdbc) | ## Data Type Mapping | SeaTunnel Data Type | DuckDB Data Type | |---------------------------------------------------------------------|------------------| | BOOLEAN | BOOLEAN | | TINYINT
SMALLINT
INT | INTEGER | | BIGINT | BIGINT | | DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | | DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | STRING | VARCHAR | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | BYTES
ARRAY
ROW
MAP | BLOB | ## Sink Options | Name | Type | Required | Default | Description | |------|------|----------|---------|-------------| | url | String | Yes | - | The URL of the JDBC connection, for example: jdbc:duckdb:/path/to/database.db | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use DuckDB the value is `org.duckdb.DuckDBDriver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority | | database | String | No | main | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | | max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | | batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | | is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | | generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | | xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, DuckDB is `org.duckdb.DuckDBXADataSource`, and
please refer to appendix for other data sources | | max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | | transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | | auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | | field_ide | String | No | - | Identify whether the field needs to be converted when synchronizing from the source to the sink. `ORIGINAL` indicates no conversion is needed; `UPPERCASE` indicates conversion to uppercase; `LOWERCASE` indicates conversion to lowercase. | | properties | Map | No | - | Additional connection configuration parameters, when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in DuckDB, properties take precedence over the URL. | | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | | schema_save_mode | Enum | No | CREATE_SCHEMA_WHEN_NOT_EXIST | Before the synchronization task starts, choose how to handle the existing table structure on the target side. | | data_save_mode | Enum | No | APPEND_DATA | Before the synchronization task starts, choose how to handle the existing data on the target side. | | custom_sql | String | No | - | When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. | | enable_upsert | Boolean | No | true | Enable upsert when `primary_keys` exist. If the task only has `insert`, setting this parameter to `false` can speed up data import | ### Tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. 
## Task Example ### Simple ``` env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 row_num = 1000 schema = { fields { id = "int" name = "string" age = "int" email = "string" } } } } sink { Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" table = "sink_table" username = "" password = "" } } ``` ### CDC(Change data capture) event ``` env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MySQL-CDC { base-url = "jdbc:mysql://localhost:3306/test" username = "root" password = "123456" table-names = ["test.user"] } } sink { Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" table = "sink_table" username = "" password = "" generate_sink_sql = true # You need to configure both database and table database = main table = "sink_table" primary_keys = ["id"] } } ``` ### Exactly-once ``` env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 row_num = 1000 schema = { fields { id = "int" name = "string" age = "int" email = "string" } } } } sink { Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" table = "sink_table" username = "" password = "" is_exactly_once = "true" xa_data_source_class_name = "org.duckdb.DuckDBXADataSource" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Easysearch.md ================================================ import ChangeLog from '../changelog/connector-easysearch.md'; # INFINI Easysearch ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description A sink plugin that sends data to `INFINI Easysearch`. ## Using Dependency > Dependency [easysearch-client](https://central.sonatype.com/artifact/com.infinilabs/easysearch-client) > ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) :::tip Engine Supported * Supports all versions released by [INFINI Easysearch](https://www.infini.com/download/?product=easysearch). ::: ## Data Type Mapping | Easysearch Data Type | SeaTunnel Data Type | |-----------------------------|----------------------| | STRING
KEYWORD
TEXT | STRING | | BOOLEAN | BOOLEAN | | BYTE | BYTE | | SHORT | SHORT | | INTEGER | INT | | LONG | LONG | | FLOAT
HALF_FLOAT | FLOAT | | DOUBLE | DOUBLE | | Date | LOCAL_DATE_TIME_TYPE | ## Sink Options | name | type | required | default value | |------------------------|---------|----------|---------------| | hosts | array | yes | - | | index | string | yes | - | | primary_keys | list | no | | | key_delimiter | string | no | `_` | | username | string | no | | | password | string | no | | | max_retry_count | int | no | 3 | | max_batch_size | int | no | 10 | | tls_verify_certificate | boolean | no | true | | tls_verify_hostname | boolean | no | true | | tls_keystore_path | string | no | - | | tls_keystore_password | string | no | - | | tls_truststore_path | string | no | - | | tls_truststore_password | string | no | - | | schema_save_mode | enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | | data_save_mode | enum | no | APPEND_DATA | | common-options | | no | - | ### hosts [array] `INFINI Easysearch` cluster http address, the format is `host:port` , allowing multiple hosts to be specified. Such as `["host1:9200", "host2:9200"]`. ### index [string] `INFINI Easysearch` `index` name.Index support contains variables of field name,such as `seatunnel_${age}`,and the field must appear at seatunnel row. If not, we will treat it as a normal index. ### primary_keys [list] Primary key fields used to generate the document `_id`, this is cdc required options. ### key_delimiter [string] Delimiter for composite keys ("_" by default), e.g., "$" would result in document `_id` "KEY1$KEY2$KEY3". ### username [string] security username ### password [string] security password ### max_retry_count [int] one bulk request max try size ### max_batch_size [int] batch bulk doc max size ### tls_verify_certificate [boolean] Enable certificates validation for HTTPS endpoints ### tls_verify_hostname [boolean] Enable hostname validation for HTTPS endpoints ### tls_keystore_path [string] The path to the PEM or JKS key store. This file must be readable by the operating system user running SeaTunnel. 
### tls_keystore_password [string] The key password for the key store specified ### tls_truststore_path [string] The path to PEM or JKS trust store. This file must be readable by the operating system user running SeaTunnel. ### tls_truststore_password [string] The key password for the trust store specified ### schema_save_mode [enum] Choose how to handle the target-side schema before starting the synchronization task: - `RECREATE_SCHEMA`: Creates the table if it doesn't exist, and deletes and recreates it if it does. - `CREATE_SCHEMA_WHEN_NOT_EXIST`: Creates the table if it doesn't exist, skips creation if it does. - `ERROR_WHEN_SCHEMA_NOT_EXIST`: Throws an error if the table doesn't exist. - `IGNORE`: Ignores schema handling. ### data_save_mode [enum] Choose how to handle the target-side data before starting the synchronization task: - `DROP_DATA`: Preserves the database structure and deletes the data. - `APPEND_DATA`: Preserves the database structure and the data. - `ERROR_WHEN_DATA_EXISTS`: Reports an error when data exists. ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## Examples Simple ```bash sink { Easysearch { hosts = ["localhost:9200"] index = "seatunnel-${age}" } } ``` CDC(Change data capture) event ```bash sink { Easysearch { hosts = ["localhost:9200"] index = "seatunnel-${age}" # cdc required options primary_keys = ["key1", "key2", ...] 
} } ``` SSL (Disable certificates validation) ```hocon sink { Easysearch { hosts = ["https://localhost:9200"] username = "admin" password = "admin" tls_verify_certificate = false } } ``` SSL (Disable hostname validation) ```hocon sink { Easysearch { hosts = ["https://localhost:9200"] username = "admin" password = "admin" tls_verify_hostname = false } } ``` SSL (Enable certificates validation) ```hocon sink { Easysearch { hosts = ["https://localhost:9200"] username = "admin" password = "admin" tls_keystore_path = "${your Easysearch home}/config/certs/http.p12" tls_keystore_password = "${your password}" } } ``` SAVE_MODE ```hocon sink { Easysearch { hosts = ["https://localhost:9200"] username = "admin" password = "admin" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Elasticsearch.md ================================================ import ChangeLog from '../changelog/connector-elasticsearch.md'; # Elasticsearch ## Description Output data to `Elasticsearch`. 
## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) :::tip Engine Supported * supported `ElasticSearch version is >= 2.x and <= 8.x` ::: ## Options | name | type | required | default value | |-------------------------|---------|----------|------------------------------| | hosts | array | yes | - | | index | string | yes | - | | schema_save_mode | string | yes | CREATE_SCHEMA_WHEN_NOT_EXIST | | data_save_mode | string | yes | APPEND_DATA | | index_type | string | no | | | primary_keys | list | no | | | key_delimiter | string | no | `_` | | auth_type | string | no | basic | | username | string | no | | | password | string | no | | | auth.api_key_id | string | no | - | | auth.api_key | string | no | - | | auth.api_key_encoded | string | no | - | | max_retry_count | int | no | 3 | | max_batch_size | int | no | 10 | | tls_verify_certificate | boolean | no | true | | tls_verify_hostname | boolean | no | true | | tls_keystore_path | string | no | - | | tls_keystore_password | string | no | - | | tls_truststore_path | string | no | - | | tls_truststore_password | string | no | - | | common-options | | no | - | | vectorization_fields | array | no | - | | vector_dimensions | int | no | - | ### hosts [array] `Elasticsearch` cluster http address, the format is `host:port` , allowing multiple hosts to be specified. Such as `["host1:9200", "host2:9200"]`. ### index [string] `Elasticsearch` `index` name.Index support contains variables of field name,such as `seatunnel_${age}`(Need to configure schema_save_mode="IGNORE"),and the field must appear at seatunnel row. If not, we will treat it as a normal index. ### index_type [string] `Elasticsearch` index type, it is recommended not to specify in elasticsearch 6 and above ### primary_keys [list] Primary key fields used to generate the document `_id`, this is cdc required options. 
### key_delimiter [string] Delimiter for composite keys ("_" by default), e.g., "$" would result in document `_id` "KEY1$KEY2$KEY3". ## Authentication The Elasticsearch connector supports multiple authentication methods to connect to secured Elasticsearch clusters. You can choose the appropriate authentication method based on your Elasticsearch security configuration. ### auth_type [enum] Specifies the authentication method to use. Supported values: - `basic` (default): HTTP Basic Authentication using username and password - `api_key`: Elasticsearch API Key authentication using separate ID and key - `api_key_encoded`: Elasticsearch API Key authentication using encoded key If not specified, defaults to `basic` for backward compatibility. ### Basic Authentication Basic authentication uses HTTP Basic Authentication with username and password credentials. #### username [string] Username for basic authentication (x-pack username). #### password [string] Password for basic authentication (x-pack password). ### vectorization_fields [array] Field names that require vector conversion, supported by Elasticsearch 7.3 and later versions ### vector_dimensions [int] Vector dimension, supported by Elasticsearch 7.3 and later versions **Example:** ```hocon sink { Elasticsearch { hosts = ["https://localhost:9200"] auth_type = "basic" username = "elastic" password = "your_password" index = "my_index" } } ``` ### API Key Authentication API Key authentication provides a more secure way to authenticate with Elasticsearch using API keys. #### auth.api_key_id [string] The API key ID generated by Elasticsearch. #### auth.api_key [string] The API key secret generated by Elasticsearch. #### auth.api_key_encoded [string] Base64 encoded API key in the format `base64(id:api_key)`. This is an alternative to specifying `auth.api_key_id` and `auth.api_key` separately. **Note:** You can use either `auth.api_key_id` + `auth.api_key` OR `auth.api_key_encoded`, but not both. 
**Example with separate ID and key:** ```hocon sink { Elasticsearch { hosts = ["https://localhost:9200"] auth_type = "api_key" auth.api_key_id = "your_api_key_id" auth.api_key = "your_api_key_secret" index = "my_index" } } ``` **Example with encoded key:** ```hocon sink { Elasticsearch { hosts = ["https://localhost:9200"] auth_type = "api_key_encoded" auth.api_key_encoded = "eW91cl9hcGlfa2V5X2lkOnlvdXJfYXBpX2tleV9zZWNyZXQ=" index = "my_index" } } ``` ### max_retry_count [int] one bulk request max try size ### vectorization_fields [array] fields to embeddings ### vector_dimensions [int] embeddings dimensions ### max_batch_size [int] batch bulk doc max size ### tls_verify_certificate [boolean] Enable certificates validation for HTTPS endpoints ### tls_verify_hostname [boolean] Enable hostname validation for HTTPS endpoints ### tls_keystore_path [string] The path to the PEM or JKS key store. This file must be readable by the operating system user running SeaTunnel. ### tls_keystore_password [string] The key password for the key store specified ### tls_truststore_path [string] The path to PEM or JKS trust store. This file must be readable by the operating system user running SeaTunnel. ### tls_truststore_password [string] The key password for the trust store specified ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ### schema_save_mode Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. 
Option introduction: `RECREATE_SCHEMA` :Will create when the table does not exist, delete and rebuild when the table is saved `CREATE_SCHEMA_WHEN_NOT_EXIST` :Will Created when the table does not exist, skipped when the table is saved `ERROR_WHEN_SCHEMA_NOT_EXIST` :Error will be reported when the table does not exist `IGNORE` :Ignore the treatment of the table ### data_save_mode Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. Option introduction: `DROP_DATA`: Preserve database structure and delete data `APPEND_DATA`:Preserve database structure, preserve data `ERROR_WHEN_DATA_EXISTS`:When there is data, an error is reported ## Examples Simple ```conf sink { Elasticsearch { hosts = ["localhost:9200"] index = "seatunnel-${age}" schema_save_mode="IGNORE" } } ``` Multi-table writing ```conf sink { Elasticsearch { hosts = ["localhost:9200"] index = "${table_name}" schema_save_mode="IGNORE" } } ``` vector-field writing ```conf sink { Elasticsearch { hosts = ["localhost:9200"] index = "${table_name}" schema_save_mode="IGNORE" vectorization_fields = ["review_embedding"] vector_dimensions = 1024 } } ``` CDC(Change data capture) event ```conf sink { Elasticsearch { hosts = ["localhost:9200"] index = "seatunnel-${age}" schema_save_mode="IGNORE" # cdc required options primary_keys = ["key1", "key2", ...] 
} } ``` CDC(Change data capture) event Multi-table writing ```conf sink { Elasticsearch { hosts = ["localhost:9200"] index = "${table_name}" schema_save_mode="IGNORE" primary_keys = ["${primary_key}"] } } ``` SSL (Disable certificates validation) ```hocon sink { Elasticsearch { hosts = ["https://localhost:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false } } ``` SSL (Disable hostname validation) ```hocon sink { Elasticsearch { hosts = ["https://localhost:9200"] username = "elastic" password = "elasticsearch" tls_verify_hostname = false } } ``` SSL (Enable certificates validation) ```hocon sink { Elasticsearch { hosts = ["https://localhost:9200"] username = "elastic" password = "elasticsearch" tls_keystore_path = "${your elasticsearch home}/config/certs/http.p12" tls_keystore_password = "${your password}" } } ``` SAVE_MODE ```hocon sink { Elasticsearch { hosts = ["https://localhost:9200"] username = "elastic" password = "elasticsearch" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ### Schema Evolution CDC collection supports a limited number of schema changes. The currently supported schema changes include: * Adding columns. 
### Schema Evolution ```hocon env { # You can set engine configuration here parallelism = 5 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second = 7000000 read_limit.rows_per_second = 400 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { Elasticsearch { hosts = ["https://elasticsearch:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false tls_verify_hostname = false index = "schema_change_index" index_type = "_doc" "schema_save_mode" = "CREATE_SCHEMA_WHEN_NOT_EXIST" "data_save_mode" = "APPEND_DATA" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Email.md ================================================ import ChangeLog from '../changelog/connector-email.md'; # Email > Email sink connector ## Description Send the data as a file to email. The tested email version is 1.5.6. ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |--------------------------|---------|----------|---------------| | email_from_address | string | yes | - | | email_to_address | string | yes | - | | email_host | string | yes | - | | email_transport_protocol | string | yes | - | | email_smtp_auth | boolean | yes | - | | email_smtp_port | int | no | 465 | | email_authorization_code | string | no | - | | email_message_headline | string | yes | - | | email_message_content | string | yes | - | | email_attachment_name | string | no | emailsink.csv | | email_field_delimiter | string | no | , | | common-options | | no | - | ### email_from_address [string] Sender Email Address. ### email_to_address [string] Address to receive mail, Support multiple email addresses, separated by commas (,). ### email_host [string] SMTP server to connect to. 
### email_transport_protocol [string] The protocol to load the session . ### email_smtp_auth [boolean] Whether to authenticate the customer. ### email_smtp_port [int] Select port for authentication. ### email_authorization_code [string] authorization code,You can obtain the authorization code from the mailbox Settings. ### email_message_headline [string] The subject line of the entire message. ### email_message_content [string] The body of the entire message. ### email_attachment_name [string] The name of the email attachment file. Default is `emailsink.csv`. ### email_field_delimiter [string] The delimiter used to separate fields in the attachment file. Default is comma `,`. ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ## Example ```bash EmailSink { email_from_address = "xxxxxx@qq.com" email_to_address = "xxxxxx@163.com" email_host="smtp.qq.com" email_transport_protocol="smtp" email_smtp_auth="true" email_authorization_code="" email_message_headline="" email_message_content="" email_attachment_name="report.csv" # Optional, default is emailsink.csv email_field_delimiter="|" # Optional, default is , } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Enterprise-WeChat.md ================================================ import ChangeLog from '../changelog/connector-http-wechat.md'; # Enterprise WeChat > Enterprise WeChat sink connector ## Description A sink plugin which use Enterprise WeChat robot send message > For example, if the data from upstream is [`"alarmStatus": "firing", "alarmTime": "2022-08-03 01:38:49","alarmContent": "The disk usage exceeds the threshold"`], the output content to WeChat Robot is the following: > > ``` > alarmStatus: firing > alarmTime: 2022-08-03 01:38:49 > alarmContent: The disk usage exceeds the threshold > ``` > > **Tips: WeChat sink only support `string` webhook and the data from source will 
be treated as body content in web hook.** ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------|--------|----------|---------------| | url | String | Yes | - | | mentioned_list | array | No | - | | mentioned_mobile_list | array | No | - | | common-options | | no | - | ### url [string] Enterprise WeChat webhook url format is https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=XXXXXX(string) ### mentioned_list [array] A list of userids used to remind (@) the specified members in the group; `@all` means to remind everyone. If the developer cannot get the userids, `mentioned_mobile_list` can be used instead. ### mentioned_mobile_list [array] A list of mobile phone numbers used to remind (@) the group members corresponding to those numbers; `@all` means to remind everyone. ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## Example simple: ```hocon WeChat { url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=693axxx6-7aoc-4bc4-97a0-0ec2sifa5aaa" } ``` ```hocon WeChat { url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=693axxx6-7aoc-4bc4-97a0-0ec2sifa5aaa" mentioned_list=["wangqing","@all"] mentioned_mobile_list=["13800001111","@all"] } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Feishu.md ================================================ import ChangeLog from '../changelog/connector-http-feishu.md'; # Feishu > Feishu sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) ## Description Used to launch Feishu web hooks using data. > For example, if the data from upstream is [`age: 12, name: tyrantlucifer`], the body content is the following: `{"age": 12, "name": "tyrantlucifer"}` **Tips: Feishu sink only support `post json` webhook and the data from source will be treated as body content in web hook.** ## Data Type Mapping | Seatunnel Data Type | Feishu Data Type | |-----------------------------|------------------| | ROW
MAP | Json | | NULL | null | | BOOLEAN | boolean | | TINYINT | byte | | SMALLINT | short | | INT | int | | BIGINT | long | | FLOAT | float | | DOUBLE | double | | DECIMAL | BigDecimal | | BYTES | byte[] | | STRING | String | | TIME
TIMESTAMP
TIME | String | | ARRAY | JsonArray | ## Sink Options | Name | Type | Required | Default | Description | |----------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | Feishu webhook url | | headers | Map | No | - | Http request headers | | common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | ## Task Example ### Simple ```hocon Feishu { url = "https://www.feishu.cn/flow/api/trigger-webhook/108bb8f208d9b2378c8c7aedad715c19" } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Fluss.md ================================================ import ChangeLog from '../changelog/connector-fluss.md'; # Fluss > Fluss sink connector ## Support These Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## Description Used to send data to Fluss. Supports both streaming and batch mode. ## Using Dependency com.alibaba.fluss fluss-client 0.7.0 ## Sink Options | Name | Type | Required | Default | Description | |-------------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------| | bootstrap.servers | string | yes | - | The bootstrap servers for the Fluss sink connection. | | database | string | no | - | The name of the Fluss database. If not set, the database name will be the name of the upstream database | | table | string | no | - | The name of the Fluss table. If not set, the table name will be the name of the upstream table | | client.config | Map | no | - | Set other client config. Please refer to https://fluss.apache.org/docs/engine-flink/options/#other-options | ### database [string] The name of the Fluss database. If not set, the database name will be the name of the upstream database. For example: 1. test_${schema_name}_test 2. sink_sinkdb 3. ss_${database_name} ### table [string] The name of the Fluss table. If not set, the table name will be the name of the upstream table. For example: 1. test_${table_name}_test 2. sink_sinktable 3. 
ss_${table_name} ## Data Type Mapping | SeaTunnel Data type | Fluss Data type | |---------------------|-----------------| | BOOLEAN | BOOLEAN | | TINYINT | TINYINT | | SMALLINT | SMALLINT | | INT | INT | | BIGINT | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | BYTES | BYTES | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | TIMESTAMP_TZ | TIMESTAMP_TZ | | STRING | STRING | ## Task Example ### Simple ```hocon env { parallelism = 1 job.mode = "BATCH" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { parallelism = 1 tables_configs = [ { row.num = 7 schema { table = "test.table1" fields { fbytes = bytes fboolean = boolean fint = int ftinyint = tinyint fsmallint = smallint fbigint = bigint ffloat = float fdouble = double fdecimal = "decimal(30, 8)" fstring = string fdate = date ftime = time ftimestamp = timestamp ftimestamp_ltz = timestamp_tz } } rows = [ { kind = INSERT fields = ["bWlJWmo=", true, 1940337748, 73, 17489, 7408919466156976747, 9.434991E37, 3.140411637757371E307, 4029933791018936061944.80602290, "aaaaa", "2025-01-03", "02:30:10", "2025-05-27T21:56:09", "2025-09-28T02:54:08+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 90650390, 37, 22504, 5851888708829345169, 2.6221706E36, 1.8915341983748786E307, 3093109630614622831876.71725344, "bbbbb", "2025-01-01", "21:22:44", "2025-05-08T05:26:18", "2025-08-04T16:49:45+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = DELETE fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 
82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_BEFORE fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_AFTER fields = ["bWlJWmo=", true, 388742243, 89, 15831, 159071788675312856, 7.310445E37, 1.2166972324288247E308, 7994947075691901110245.55960937, "ddddd", "2025-01-04", "15:28:07", "2025-07-18T08:59:49", "2025-09-12T23:46:25+08:00"] } ] } ] } } transform { } sink { Fluss { bootstrap.servers="fluss_coordinator_e2e:9123" database = "fluss_db_${database_name}" table = "fluss_tb_${table_name}" } } ``` ### Multiple table ```hocon env { parallelism = 1 job.mode = "BATCH" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { parallelism = 1 tables_configs = [ { row.num = 7 schema { table = "test2.table1" fields { fbytes = bytes fboolean = boolean fint = int ftinyint = tinyint fsmallint = smallint fbigint = bigint ffloat = float fdouble = double fdecimal = "decimal(30, 8)" fstring = string fdate = date ftime = time ftimestamp = timestamp ftimestamp_ltz = timestamp_tz } } rows = [ { kind = INSERT fields = ["bWlJWmo=", true, 1940337748, 73, 17489, 7408919466156976747, 9.434991E37, 3.140411637757371E307, 4029933791018936061944.80602290, "aaaaa", "2025-01-03", "02:30:10", "2025-05-27T21:56:09", "2025-09-28T02:54:08+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 90650390, 37, 22504, 5851888708829345169, 2.6221706E36, 1.8915341983748786E307, 3093109630614622831876.71725344, "bbbbb", "2025-01-01", "21:22:44", "2025-05-08T05:26:18", "2025-08-04T16:49:45+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 
2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = DELETE fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_BEFORE fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_AFTER fields = ["bWlJWmo=", true, 388742243, 89, 15831, 159071788675312856, 7.310445E37, 1.2166972324288247E308, 7994947075691901110245.55960937, "ddddd", "2025-01-04", "15:28:07", "2025-07-18T08:59:49", "2025-09-12T23:46:25+08:00"] } ] }, { row.num = 7 schema { table = "test2.table2" fields { fbytes = bytes fboolean = boolean fint = int ftinyint = tinyint fsmallint = smallint fbigint = bigint ffloat = float fdouble = double fdecimal = "decimal(30, 8)" fstring = string fdate = date ftime = time ftimestamp = timestamp ftimestamp_ltz = timestamp_tz } } rows = [ { kind = INSERT fields = ["bWlJWmo=", true, 1940337748, 73, 17489, 7408919466156976747, 9.434991E37, 3.140411637757371E307, 4029933791018936061944.80602290, "aaaaa", "2025-01-03", "02:30:10", "2025-05-27T21:56:09", "2025-09-28T02:54:08+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 90650390, 37, 22504, 5851888708829345169, 2.6221706E36, 1.8915341983748786E307, 3093109630614622831876.71725344, "bbbbb", "2025-01-01", "21:22:44", "2025-05-08T05:26:18", "2025-08-04T16:49:45+08:00"] } { kind = INSERT 
fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = DELETE fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_BEFORE fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_AFTER fields = ["bWlJWmo=", true, 388742243, 89, 15831, 159071788675312856, 7.310445E37, 1.2166972324288247E308, 7994947075691901110245.55960937, "ddddd", "2025-01-04", "15:28:07", "2025-07-18T08:59:49", "2025-09-12T23:46:25+08:00"] } ] }, { row.num = 7 schema { table = "test3.table3" fields { fbytes = bytes fboolean = boolean fint = int ftinyint = tinyint fsmallint = smallint fbigint = bigint ffloat = float fdouble = double fdecimal = "decimal(30, 8)" fstring = string fdate = date ftime = time ftimestamp = timestamp ftimestamp_ltz = timestamp_tz } } rows = [ { kind = INSERT fields = ["bWlJWmo=", true, 1940337748, 73, 17489, 7408919466156976747, 9.434991E37, 3.140411637757371E307, 4029933791018936061944.80602290, "aaaaa", "2025-01-03", "02:30:10", "2025-05-27T21:56:09", "2025-09-28T02:54:08+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 90650390, 37, 22504, 5851888708829345169, 2.6221706E36, 1.8915341983748786E307, 3093109630614622831876.71725344, "bbbbb", "2025-01-01", "21:22:44", 
"2025-05-08T05:26:18", "2025-08-04T16:49:45+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = DELETE fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_BEFORE fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_AFTER fields = ["bWlJWmo=", true, 388742243, 89, 15831, 159071788675312856, 7.310445E37, 1.2166972324288247E308, 7994947075691901110245.55960937, "ddddd", "2025-01-04", "15:28:07", "2025-07-18T08:59:49", "2025-09-12T23:46:25+08:00"] } ] } ] } } transform { } sink { Fluss { bootstrap.servers="fluss_coordinator_e2e:9123" database = "fluss_db_${database_name}" table = "fluss_tb_${table_name}" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/FtpFile.md ================================================ import ChangeLog from '../changelog/connector-file-ftp.md'; # FtpFile > Ftp file sink connector ## Description Output data to Ftp . :::tip If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. 
If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. ::: ## Key features - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) By default, we use 2PC commit to ensure `exactly-once` - [x] file format - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary ## Options | Name | Type | Required | Default | Description | |---------------------------------------|---------|----------|--------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | host | string | yes | - | | | port | int | yes | - | | | user | string | yes | - | | | password | string | yes | - | | | path | string | yes | - | | | tmp_path | string | yes | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. Need a FTP dir. | | connection_mode | string | no | active_local | The target ftp connection mode | | remote_verification_enabled | boolean | no | true | Whether to enable remote host verification for FTP data channels | | custom_filename | boolean | no | false | Whether you need custom the filename | | file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | | filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | | file_format_type | string | no | "csv" | | | filename_extension | string | no | - | Override the default file name extensions with custom file name extensions. E.g. 
`.xml`, `.json`, `dat`, `.customtype` | | field_delimiter | string | no | '\001' for text and ',' for csv | Only used when file_format_type is text and csv | | row_delimiter | string | no | "\n" | Only used when file_format_type is `text`, `csv` and `json` | | have_partition | boolean | no | false | Whether you need processing partitions. | | partition_by | array | no | - | Only used when have_partition is true | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used when have_partition is true | | is_partition_field_write_in_file | boolean | no | false | Only used when have_partition is true | | sink_columns | array | no | | When this parameter is empty, all fields are sink columns | | is_enable_transaction | boolean | no | true | | | batch_size | int | no | 1000000 | | | compress_codec | string | no | none | | | common-options | object | no | - | | | max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | | sheet_max_rows | int | no | 1048576 | Only used when file_format_type is excel. | | sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. | | csv_string_quote_mode | enum | no | MINIMAL | Only used when file_format is csv. | | xml_root_tag | string | no | RECORDS | Only used when file_format is xml. | | xml_row_tag | string | no | RECORD | Only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | | create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet.
| | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | | enable_header_write | boolean | no | false | Only used when file_format_type is text,csv.
false:don't write header,true:write header. | | encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | | schema_save_mode | string | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Existing dir processing method | | data_save_mode | string | no | APPEND_DATA | Existing data processing method | ### host [string] The target ftp host is required ### port [int] The target ftp port is required ### user [string] The target ftp username is required ### password [string] The target ftp password is required ### path [string] The target dir path is required. ### connection_mode [string] The target ftp connection mode , default is active mode, supported as the following modes: `active_local` `passive_local` ### remote_verification_enabled [boolean] Whether to enable remote host verification for FTP data channels, default is `true`. ### custom_filename [boolean] Whether custom the filename ### file_name_expression [string] Only used when `custom_filename` is `true` `file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, `${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. ### filename_time_format [string] Only used when `custom_filename` is `true` When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . 
The commonly used time formats are listed as follows: | Symbol | Description | |--------|--------------------| | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | ### file_format_type [string] We supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. ### field_delimiter [string] The separator between columns in a row of data. Only needed by `text` and `csv` file format. ### row_delimiter [string] The separator between rows in a file. Only needed by `text`, `csv` and `json` file format. ### have_partition [boolean] Whether you need processing partitions. ### partition_by [array] Only used when `have_partition` is `true`. Partition data based on selected fields. ### partition_dir_expression [string] Only used when `have_partition` is `true`. If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. ### is_partition_field_write_in_file [boolean] Only used when `have_partition` is `true`. If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. For example, if you want to write a Hive Data File, Its value should be `false`. ### sink_columns [array] Which columns need be wrote to file, default value is all the columns get from `Transform` or `Source`. The order of the fields determines the order in which the file is actually written. 
### is_enable_transaction [boolean] If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. Only support `true` now. ### batch_size [int] The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is jointly determined by `batch_size` and `checkpoint.interval`. If the value of `checkpoint.interval` is large enough, the sink writer will write rows to a file until the number of rows in the file exceeds `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint is triggered. ### compress_codec [string] The compression codec of the files. The supported codecs for each file format are as follows: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc: `lzo` `snappy` `lz4` `zlib` `none` - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` Tips: excel type does not support any compression format ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ### max_rows_in_memory [int] When the file format is Excel, the maximum number of data items that can be cached in memory. ### sheet_max_rows [int] When file format is Excel, the maximum number of rows per sheet. ### sheet_name [string] The name of the sheet to write in the workbook. ### csv_string_quote_mode [string] When the file format is CSV, the string quote mode of the CSV. - ALL: All String fields will be quoted. - MINIMAL: Quotes fields which contain special characters such as the field delimiter, quote character or any of the characters in the line separator string. - NONE: Never quotes fields. When the delimiter occurs in data, the printer prefixes it with the escape character. If the escape character is not set, format validation throws an exception.
### xml_root_tag [string] Specifies the tag name of the root element within the XML file. ### xml_row_tag [string] Specifies the tag name of the data rows within the XML file. ### xml_use_attr_format [boolean] Specifies Whether to process data using the tag attribute format. ### parquet_avro_write_timestamp_as_int96 [boolean] Support writing Parquet INT96 from a timestamp, only valid for parquet files. ### parquet_avro_write_fixed_as_int96 [array] Support writing Parquet INT96 from a 12-byte field, only valid for parquet files. ### enable_header_write [boolean] Only used when file_format_type is text,csv.false:don't write header,true:write header. ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`. ### schema_save_mode [string] Existing dir processing method. - RECREATE_SCHEMA: will create when the dir does not exist, delete and recreate when the dir is exist - CREATE_SCHEMA_WHEN_NOT_EXIST: will create when the dir does not exist, skipped when the dir is exist - ERROR_WHEN_SCHEMA_NOT_EXIST: error will be reported when the dir does not exist - IGNORE :Ignore the treatment of the table ### data_save_mode [string] Existing data processing method. 
- DROP_DATA: preserve dir and delete data files - APPEND_DATA: preserve dir, preserve data files - ERROR_WHEN_DATA_EXISTS: when there is data files, an error is reported ## Example For text file format simple config ```bash FtpFile { host = "xxx.xxx.xxx.xxx" port = 21 user = "username" password = "password" path = "/data/ftp" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" sink_columns = ["name","age"] } ``` For text file format with `have_partition` and `custom_filename` and `sink_columns` ```bash FtpFile { host = "xxx.xxx.xxx.xxx" port = 21 user = "username" password = "password" path = "/data/ftp/seatunnel/job1" tmp_path = "/data/ftp/seatunnel/tmp" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" sink_columns = ["name","age"] filename_time_format = "yyyy.MM.dd" } ``` When our source end is multiple tables, and wants different expressions to different directory, we can configure this way ```hocon FtpFile { host = "xxx.xxx.xxx.xxx" port = 21 user = "username" password = "password" path = "/data/ftp/seatunnel/job1/${table_name}" tmp_path = "/data/ftp/seatunnel/tmp" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" sink_columns = ["name","age"] filename_time_format = "yyyy.MM.dd" schema_save_mode=RECREATE_SCHEMA data_save_mode=DROP_DATA } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/GoogleFirestore.md ================================================ import ChangeLog from '../changelog/connector-google-firestore.md'; # GoogleFirestore > Google Firestore sink 
connector ## Description Write data to Google Firestore ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-------------|--------|----------|---------------| | project_id | string | yes | - | | collection | string | yes | - | | credentials | string | no | - | ### project_id [string] The unique identifier for a Google Firestore database project. ### collection [string] The collection of Google Firestore. ### credentials [string] The credentials of the Google Cloud service account, base64-encoded. If not set, make sure the `GOOGLE_APPLICATION_CREDENTIALS` environment variable is set. ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ## Example ```bash GoogleFirestore { project_id = "dummy-project-id", collection = "dummy-collection", credentials = "dummy-credentials" } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/GraphQL.md ================================================ import ChangeLog from '../changelog/connector-graphql.md'; # GraphQL > GraphQL sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## Description Used to launch web hooks using data. > For example, if the data from upstream is [`label: {"__name__": "test1"}, value: 1.2.3,time:2024-08-15T17:00:00`], the body content is the following: `{"label":{"__name__": "test1"}, "value":"1.23","time":"2024-08-15T17:00:00"}` **Tips: GraphQL sink only support `post json` webhook and the data from source will be treated as body content in web hook.And does not support passing past data** ## Supported DataSource Info In order to use the GraphQL connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Dependency | |------------|--------------------|------------------------------------------------------------------------------------------------------------------| | Http | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-http) | ## Sink Options | Name | Type | Required | Default | Description | |-----------------------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | Http request url | | query | String | Yes | - | GraphQL query | | variables | String | No | - | GraphQL variables | | valueCover | Boolean | No | - | Whether the data overwrites the variable value | | headers | Map | No | - | Http headers | | retry | Int | No | - | The max retry times if request http return to `IOException` | | retry_backoff_multiplier_ms | Int | No | 100 | The retry-backoff times(millis) multiplier if request http failed | | retry_backoff_max_ms | Int | No | 10000 | The maximum retry-backoff 
times(millis) if request http failed | | connect_timeout_ms | Int | No | 12000 | Connection timeout setting, default 12s. | | socket_timeout_ms | Int | No | 60000 | Socket timeout setting, default 60s. | | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | ## Example simple: ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { tables_configs = [ { schema = { table = "graphql_sink_1" fields { id = int val_bool = boolean val_int8 = tinyint val_int16 = smallint val_int32 = int val_int64 = bigint val_float = float val_double = double val_decimal = "decimal(16, 1)" val_string = string val_unixtime_micros = timestamp } } rows = [ { kind = INSERT fields = [1, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] } ] }, { schema = { table = "graphql_sink_2" fields { id = int val_bool = boolean val_int8 = tinyint val_int16 = smallint val_int32 = int val_int64 = bigint val_float = float val_double = double val_decimal = "decimal(16, 1)" val_string = string val_unixtime_micros = timestamp } } rows = [ { kind = INSERT fields = [2, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] } ] } ] } } sink { GraphQL { url = "http://192.168.1.103:9081/v1/graphql" query = """ mutation MyMutation( $id: Int! $val_bool: Boolean! $val_int8: smallint! $val_int16: smallint! $val_int32: Int! $val_int64: bigint! $val_float: Float! $val_double: Float! $val_decimal: numeric! $val_string: String! $val_unixtime_micros: timestamp!
) { insert_sink(objects: { id: $id, val_bool: $val_bool, val_int8: $val_int8, val_int16: $val_int16, val_int32: $val_int32, val_int64: $val_int64, val_float: $val_float, val_double: $val_double, val_decimal: $val_decimal, val_string: $val_string, val_unixtime_micros: $val_unixtime_micros }) { affected_rows returning { id val_bool val_decimal val_double val_float val_int16 val_int32 val_int64 val_int8 val_string val_unixtime_micros } } } """ variables = { "val_bool": True } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Greenplum.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Greenplum > Greenplum sink connector ## Description Write data to Greenplum using [Jdbc connector](Jdbc.md). ## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) :::tip Exactly-once semantics are not supported (XA transaction is not yet supported in Greenplum database). ::: ## Options ### driver [string] Optional jdbc drivers: - `org.postgresql.Driver` - `com.pivotal.jdbc.GreenplumDriver` Warning: for license compliance, if you use `GreenplumDriver` you have to provide the Greenplum JDBC driver yourself, e.g. copy greenplum-xxx.jar to $SEATUNNEL_HOME/lib for Standalone. ### url [string] The URL of the JDBC connection.
If you use the PostgreSQL driver, the value is `jdbc:postgresql://${your_host}:${your_port}/${your_database}`; if you use the Greenplum driver, the value is `jdbc:pivotal:greenplum://${your_host}:${your_port};DatabaseName=${your_database}` ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## Changelog ================================================ FILE: docs/en/connectors/sink/Hbase.md ================================================ import ChangeLog from '../changelog/connector-hbase.md'; # Hbase > Hbase sink connector ## Description Output data to Hbase ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |--------------------|---------|----------|-----------------| | zookeeper_quorum | string | yes | - | | table | string | yes | - | | rowkey_column | list | yes | - | | family_name | config | yes | - | | rowkey_delimiter | string | no | "" | | version_column | string | no | - | | null_mode | string | no | skip | | wal_write | boolean | yes | false | | write_buffer_size | string | no | 8 * 1024 * 1024 | | encoding | string | no | utf8 | | hbase_extra_config | config | no | - | | common-options | | no | - | | ttl | long | no | - | ### zookeeper_quorum [string] The zookeeper cluster host of hbase, example: "hadoop001:2181,hadoop002:2181,hadoop003:2181" ### table [string] The table name you want to write, example: "seatunnel" If your table is under a custom namespace, use `namespace:table` (for example, `ns1:seatunnel_test`); if omitted, SeaTunnel will write to HBase's default namespace (`default`). ### rowkey_column [list] The column name list of row keys, example: ["id", "uuid"] ### family_name [config] The family name mapping of fields.
For example, suppose the row from upstream looks like the following: | id | name | age | |----|---------------|-----| | 1 | tyrantlucifer | 27 | id as the row key and other fields written to the different families, you can assign family_name { name = "info1" age = "info2" } this means that `name` will be written to the family `info1` and the `age` will be written to the family `info2` if you want other fields written to the same family, you can assign family_name { all_columns = "info" } this means that all fields will be written to the family `info` ### rowkey_delimiter [string] The delimiter used to join multiple row keys, default `""` ### version_column [string] The version column name, you can use it to assign timestamp for hbase record ### null_mode [string] The mode of writing null value, support [`skip`, `empty`], default `skip` - skip: When the field is null, connector will not write this field to hbase - empty: When the field is null, connector will write an empty value for this field ### wal_write [boolean] The wal log write flag, default `false` ### write_buffer_size [int] The write buffer size of hbase client, default `8 * 1024 * 1024` ### encoding [string] The encoding used for STRING/DECIMAL/DATE/TIME/TIMESTAMP/ARRAY fields, support [`utf8`, `gbk`], default `utf8` ### Data types Hbase stores bytes.
The connector supports: - TINYINT/SMALLINT/INT/BIGINT/FLOAT/DOUBLE/BOOLEAN/BYTES - STRING/DECIMAL/DATE/TIME/TIMESTAMP/ARRAY (serialized as strings using `encoding`) ### hbase_extra_config [config] The extra configuration of hbase ### ttl [long] Hbase writes data TTL time, the default is based on the TTL set in the table, unit: milliseconds ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## Example ```hocon Hbase { zookeeper_quorum = "hadoop001:2181,hadoop002:2181,hadoop003:2181" table = "seatunnel_test" rowkey_column = ["name"] family_name { all_columns = seatunnel } } ``` ## Kerberos Example Note: - `connector-hbase` does not parse `krb5_path`, `kerberos_principal`, or `kerberos_keytab_path`. - Prepare Kerberos credentials and `krb5.conf` in the runtime environment (for example, `kinit -kt ...` or JVM `-Djava.security.krb5.conf=...`), and put HBase/Hadoop security settings into `hbase_extra_config`. 
```hocon sink { Hbase { zookeeper_quorum = "zk1:2181,zk2:2181,zk3:2181" table = "target_table" rowkey_column = ["rowkey"] family_name { all_columns = "info" } # HBase security config hbase_extra_config = { "hbase.security.authentication" = "kerberos" "hadoop.security.authentication" = "kerberos" "hbase.master.kerberos.principal" = "hbase/_HOST@REALM" "hbase.regionserver.kerberos.principal" = "hbase/_HOST@REALM" "hbase.rpc.protection" = "authentication" "hbase.zookeeper.useSasl" = "false" } } } ``` ### Multiple Table ```hocon env { # You can set engine configuration here execution.parallelism = 1 job.mode = "BATCH" } source { FakeSource { tables_configs = [ { schema = { table = "hbase_sink_1" fields { name = STRING c_string = STRING c_double = DOUBLE c_bigint = BIGINT c_float = FLOAT c_int = INT c_smallint = SMALLINT c_boolean = BOOLEAN time = BIGINT } } rows = [ { kind = INSERT fields = ["label_1", "sink_1", 4.3, 200, 2.5, 2, 5, true, 1627529632356] } ] }, { schema = { table = "hbase_sink_2" fields { name = STRING c_string = STRING c_double = DOUBLE c_bigint = BIGINT c_float = FLOAT c_int = INT c_smallint = SMALLINT c_boolean = BOOLEAN time = BIGINT } } rows = [ { kind = INSERT fields = ["label_2", "sink_2", 4.3, 200, 2.5, 2, 5, true, 1627529632357] } ] } ] } } sink { Hbase { zookeeper_quorum = "hadoop001:2181,hadoop002:2181,hadoop003:2181" table = "${table_name}" rowkey_column = ["name"] family_name { all_columns = info } } } ``` ## Writes To The Specified Column Family ```hocon Hbase { zookeeper_quorum = "hbase_e2e:2181" table = "assign_cf_table" rowkey_column = ["id"] family_name { c_double = "cf1" c_bigint = "cf2" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/HdfsFile.md ================================================ import ChangeLog from '../changelog/connector-file-hadoop.md'; # HdfsFile > HDFS File Sink Connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) By default, we use 2PC commit to ensure `exactly-once` - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] compress codec - [x] lzo - [x] canal_json - [x] debezium_json - [x] maxwell_json ## Description Output data to hdfs file ## Supported DataSource Info | Datasource | Supported Versions | |------------|--------------------| | HdfsFile | hadoop 2.x and 3.x | ## Sink Options | Name | Type | Required | Default | Description | |---------------------------------------|---------|----------|--------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | fs.defaultFS | string | yes | - | Hadoop cluster address. Supports the following formats:
- Standard HDFS: `hdfs://hadoopcluster` or `hdfs://namenode:9000`
- ViewFS (Federated HDFS): `viewfs://mycluster`
See ViewFS configuration example below. | | path | string | yes | - | The target dir path is required. | | tmp_path | string | yes | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. Need a hdfs path. | | hdfs_site_path | string | no | - | The path of `hdfs-site.xml`, used to load ha configuration of namenodes | | custom_filename | boolean | no | false | Whether you need custom the filename | | file_name_expression | string | no | "${transactionId}" | Only used when `custom_filename` is `true`.`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`,`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`.Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. | | filename_time_format | string | no | "yyyy.MM.dd" | Only used when `custom_filename` is `true`.When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows:[y:Year,M:Month,d:Day of month,H:Hour in day (0-23),m:Minute in hour,s:Second in minute] | | file_format_type | string | no | "csv" | We supported as the following file types:`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary`.Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. | | filename_extension | string | no | - | Override the default file name extensions with custom file name extensions. E.g. `.xml`, `.json`, `dat`, `.customtype` | | field_delimiter | string | no | '\001' for text and ',' for csv | Only used when file_format is text and csv,The separator between columns in a row of data. 
Only needed by `text` file format. | | row_delimiter | string | no | "\n" | Only used when file_format is text. The separator between rows in a file. Only needed by `text`, `csv` and `json` file format. | | have_partition | boolean | no | false | Whether you need to process partitions. | | partition_by | array | no | - | Only used when have_partition is true. Partition data based on selected fields. | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used when have_partition is true. If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. | | is_partition_field_write_in_file | boolean | no | false | Only used when `have_partition` is `true`. If `is_partition_field_write_in_file` is `true`, the partition field and its value will be written into the data file. For example, if you want to write a Hive data file, its value should be `false`. | | sink_columns | array | no | | When this parameter is empty, all fields are sink columns. Which columns need to be written to the file; the default value is all of the columns obtained from `Transform` or `Source`. The order of the fields determines the order in which the file is actually written. | | is_enable_transaction | boolean | no | true | If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. Please note that if `is_enable_transaction` is `true`, we will automatically add `${transactionId}_` at the head of the file name. Only `true` is supported now. | | batch_size | int | no | 1000000 | The maximum number of rows in a file. 
For SeaTunnel Engine, the number of lines in the file is jointly determined by `batch_size` and `checkpoint.interval`. If the value of `checkpoint.interval` is large enough, the sink writer will write rows to a file until the number of rows in the file exceeds `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint is triggered. | | compress_codec | string | no | none | The compress codec of files. The supported codecs for each format are as follows: [txt: `lzo` `none`, json: `lzo` `none`, csv: `lzo` `none`, orc: `lzo` `snappy` `lz4` `zlib` `none`, parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none`]. Tips: excel type does not support any compression format. | | krb5_path | string | no | /etc/krb5.conf | The krb5 path of kerberos | | kerberos_principal | string | no | - | The principal of kerberos | | kerberos_keytab_path | string | no | - | The keytab path of kerberos | | compress_codec | string | no | none | compress codec | | common-options | object | no | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | | max_rows_in_memory | int | no | - | Only used when file_format is excel. The maximum number of data items that can be cached in memory. | | sheet_max_rows | int | no | 1048576 | Only used when file_format is excel. | | sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. The name of the workbook sheet to write. | | csv_string_quote_mode | enum | no | MINIMAL | Only used when file_format is csv. | | xml_root_tag | string | no | RECORDS | Only used when file_format is xml, specifies the tag name of the root element within the XML file. 
| | xml_row_tag | string | no | RECORD | Only used when file_format is xml, specifies the tag name of the data rows within the XML file | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml, specifies Whether to process data using the tag attribute format. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | | create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | | enable_header_write | boolean | no | false | Only used when file_format_type is text,csv.
false:don't write header,true:write header. | | encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | | remote_user | string | no | - | The remote user name of hdfs. | | schema_save_mode | string | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Existing dir processing method | | data_save_mode | string | no | APPEND_DATA | Existing data processing method | | merge_update_event | boolean | no | false | Only used when file_format_type is canal_json,debezium_json or maxwell_json. When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data | ### Tips > If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. ### schema_save_mode [string] Existing dir processing method. - RECREATE_SCHEMA: will create when the dir does not exist, delete and recreate when the dir is exist - CREATE_SCHEMA_WHEN_NOT_EXIST: will create when the dir does not exist, skipped when the dir is exist - ERROR_WHEN_SCHEMA_NOT_EXIST: error will be reported when the dir does not exist - IGNORE :Ignore the treatment of the table ### data_save_mode [string] Existing data processing method. - DROP_DATA: preserve dir and delete data files - APPEND_DATA: preserve dir, preserve data files - ERROR_WHEN_DATA_EXISTS: when there is data files, an error is reported ### merge_update_event [boolean] Only used when file_format_type is canal_json,debezium_json or maxwell_json. When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data ## Task Example ### Simple > This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to Hdfs. 
``` # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } # If you would like to get more information about how to configure seatunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/source } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2 } sink { HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" file_format_type = "orc" } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink } ``` ### For orc file format simple config ``` HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" file_format_type = "orc" } ``` ### For text file format with `have_partition` and `custom_filename` and `sink_columns` ``` HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true } ``` ### For parquet file format with `have_partition` and `custom_filename` and 
`sink_columns` ``` HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" file_format_type = "parquet" sink_columns = ["name","age"] is_enable_transaction = true } ``` ### For kerberos simple config ``` HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" hdfs_site_path = "/path/to/your/hdfs_site_path" kerberos_principal = "your_principal@EXAMPLE.COM" kerberos_keytab_path = "/path/to/your/keytab/file.keytab" } ``` ### enable_header_write [boolean] Only used when file_format_type is text,csv.false:don't write header,true:write header. ### csv_string_quote_mode [string] When File Format is CSV,The string quote mode of CSV. - ALL: All String fields will be quoted. - MINIMAL: Quotes fields which contain special characters such as a the field delimiter, quote character or any of the characters in the line separator string. - NONE: Never quotes fields. When the delimiter occurs in data, the printer prefixes it with the escape character. If the escape character is not set, format validation throws an exception. ### For compress simple config ``` HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" compress_codec = "lzo" } ``` ### ViewFS (Federated HDFS) Configuration Example ViewFS allows you to unify multiple HDFS clusters or namespaces into a single logical namespace. This is very useful for HDFS Federation scenarios. 
```hocon HdfsFile { fs.defaultFS = "viewfs://mycluster" path = "/data/output" file_format_type = "parquet" hdfs_site_path = "/path/to/core-site.xml" data_save_mode = "DROP_DATA" } ``` Configure mount table in `core-site.xml`: ```xml fs.viewfs.mounttable.mycluster.link./data hdfs://namenode1:9000/data fs.viewfs.mounttable.mycluster.link./logs hdfs://namenode2:9000/logs fs.viewfs.mounttable.mycluster.link./tmp hdfs://namenode3:9000/tmp ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Hive.md ================================================ import ChangeLog from '../changelog/connector-hive.md'; # Hive > Hive sink connector ## Description Write data to Hive. :::tip In order to use this connector, You must ensure your spark/flink cluster already integrated hive. The tested hive version is 2.3.9 and 3.1.3 . If you use SeaTunnel Engine, You need put seatunnel-hadoop3-3.1.4-uber.jar and hive-exec-3.1.3.jar and libfb303-0.9.3.jar in $SEATUNNEL_HOME/lib/ dir. 
::: ## Key features - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) By default, we use 2PC commit to ensure `exactly-once` - [x] file format - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] compress codec - [x] lzo ## Options | name | type | required | default value | |---------------------------------------|---------|----------|----------------| | table_name | string | yes | - | | metastore_uri | string | yes | - | | compress_codec | string | no | none | | hdfs_site_path | string | no | - | | hive_site_path | string | no | - | | hive.hadoop.conf | Map | no | - | | hive.hadoop.conf-path | string | no | - | | krb5_path | string | no | /etc/krb5.conf | | kerberos_principal | string | no | - | | kerberos_keytab_path | string | no | - | | abort_drop_partition_metadata | boolean | no | true | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | | overwrite | boolean | no | false | | data_save_mode | enum | no | APPEND_DATA | | schema_save_mode | enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | | save_mode_create_template | string | no | - | | common-options | | no | - | ### table_name [string] Target Hive table name eg: db1.table1, and if the source is multiple mode, you can use `${database_name}.${table_name}` to generate the table name, it will replace the `${database_name}` and `${table_name}` with the value of the CatalogTable generate from the source. ### metastore_uri [string] Hive metastore uri. Supports comma-separated multiple URIs for HA/failover (whitespace is ignored). SeaTunnel passes this value to Hive `hive.metastore.uris` and uses Hive `RetryingMetaStoreClient` (if available) to retry/failover between URIs. This is client-side endpoint failover; make sure your metastores share/replicate the same backend to keep metadata consistent. 
### hdfs_site_path [string] The path of `hdfs-site.xml`, used to load ha configuration of namenodes ### hive_site_path [string] The path of `hive-site.xml` ### hive.hadoop.conf [map] Properties in hadoop conf('core-site.xml', 'hdfs-site.xml', 'hive-site.xml') ### hive.hadoop.conf-path [string] The specified loading path for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files ### krb5_path [string] The path of `krb5.conf`, used to authentication kerberos The path of `hive-site.xml`, used to authentication hive metastore ### kerberos_principal [string] The principal of kerberos ### kerberos_keytab_path [string] The keytab path of kerberos ### abort_drop_partition_metadata [boolean] Flag to decide whether to drop partition metadata from Hive Metastore during an abort operation. Note: this only affects the metadata in the metastore, the data in the partition will always be deleted(data generated during the synchronization process). ### parquet_avro_write_timestamp_as_int96 [boolean] Support writing Parquet INT96 from a timestamp, only valid for parquet files. ### overwrite [boolean] Flag to decide whether to use overwrite mode when inserting data into Hive. If set to true, for non-partitioned tables, the existing data in the table will be deleted before inserting new data. For partitioned tables, the data in the relevant partition will be deleted before inserting new data. - Batch mode (BATCH): Delete existing data in the target path before commit (for non-partitioned tables, delete the table directory; for partitioned tables, delete the related partition directories), then write new data. - Streaming mode (STREAMING): In streaming jobs with checkpointing enabled, `commit()` is invoked after each completed checkpoint. To avoid deleting on every checkpoint (which would wipe previously committed files), SeaTunnel deletes each target directory (table directory / partition directory) at most once (empty commits will skip deletion). 
On recovery, the delete step is best-effort and may be skipped to avoid deleting already committed data, so streaming overwrite is not a strict snapshot overwrite. ### data_save_mode [enum] Select how to handle existing data on the target before writing new data. - APPEND_DATA (default): Keep existing data and append new records. - DROP_DATA: Behaves the same as overwrite=true. Before commit, delete the existing data in the target path (for non-partitioned tables, delete the table directory; for partitioned tables, delete the related partition directories), then write new data. - CUSTOM_PROCESSING / ERROR_WHEN_DATA_EXISTS: Currently not recommended for Hive sink unless you have specific requirements. Note: overwrite=true and data_save_mode=DROP_DATA are equivalent. Use either one; do not set both. ### schema_save_mode [enum] Before starting the synchronization task, different processing schemes are selected for the existing table structure on the target side. **Default value**: `CREATE_SCHEMA_WHEN_NOT_EXIST` Option values: - `RECREATE_SCHEMA`: Will create when the table does not exist, delete and rebuild when the table exists - `CREATE_SCHEMA_WHEN_NOT_EXIST`: Will create when the table does not exist, skip when the table exists - `ERROR_WHEN_SCHEMA_NOT_EXIST`: Error will be reported when the table does not exist - `IGNORE`: Ignore the treatment of the table ### save_mode_create_template [string] We use templates to automatically create Hive tables, which will create corresponding table creation statements based on the type of upstream data and schema type, and the default template can be modified according to the situation. Available template variables: ${database}, ${table}, ${rowtype_fields}, ${rowtype_partition_fields}, ${table_location}. 
**Default value**: When not specified, uses a default PARQUET non-partitioned table template: ```sql CREATE TABLE IF NOT EXISTS `${database}`.`${table}` ( ${rowtype_fields} ) STORED AS PARQUET LOCATION '${table_location}' ``` ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## Example ```bash Hive { table_name = "default.seatunnel_orc" metastore_uri = "thrift://namenode001:9083" } ``` Metastore URI failover example (multiple URIs): ```bash Hive { table_name = "default.seatunnel_orc" metastore_uri = "thrift://metastore-1:9083,thrift://metastore-2:9083" } ``` ### example 1 We have a source table like this: ```bash create table test_hive_source( test_tinyint TINYINT, test_smallint SMALLINT, test_int INT, test_bigint BIGINT, test_boolean BOOLEAN, test_float FLOAT, test_double DOUBLE, test_string STRING, test_binary BINARY, test_timestamp TIMESTAMP, test_decimal DECIMAL(8,2), test_char CHAR(64), test_varchar VARCHAR(64), test_date DATE, test_array ARRAY, test_map MAP, test_struct STRUCT ) PARTITIONED BY (test_par1 STRING, test_par2 STRING); ``` We need read data from the source table and write to another table: ```bash create table test_hive_sink_text_simple( test_tinyint TINYINT, test_smallint SMALLINT, test_int INT, test_bigint BIGINT, test_boolean BOOLEAN, test_float FLOAT, test_double DOUBLE, test_string STRING, test_binary BINARY, test_timestamp TIMESTAMP, test_decimal DECIMAL(8,2), test_char CHAR(64), test_varchar VARCHAR(64), test_date DATE ) PARTITIONED BY (test_par1 STRING, test_par2 STRING); ``` The job config file can like this: ``` env { parallelism = 3 job.name="test_hive_source_to_hive" } source { Hive { table_name = "test_hive.test_hive_source" metastore_uri = "thrift://ctyun7:9083" } } sink { # choose stdout output plugin to output data to console Hive { table_name = "test_hive.test_hive_sink_text_simple" metastore_uri = "thrift://ctyun7:9083" hive.hadoop.conf = 
{ bucket = "s3a://mybucket" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" } } ``` ### example2: Kerberos ```bash sink { Hive { table_name = "default.test_hive_sink_on_hdfs_with_kerberos" metastore_uri = "thrift://metastore:9083" hive_site_path = "/tmp/hive-site.xml" kerberos_principal = "hive/metastore.seatunnel@EXAMPLE.COM" kerberos_keytab_path = "/tmp/hive.keytab" krb5_path = "/tmp/krb5.conf" } } ``` Description: - `hive_site_path`: The path to the `hive-site.xml` file. - `kerberos_principal`: The principal for Kerberos authentication. - `kerberos_keytab_path`: The keytab file path for Kerberos authentication. - `krb5_path`: The path to the `krb5.conf` file used for Kerberos authentication. Run the case: ```bash env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { schema = { fields { pk_id = bigint name = string score = int } primaryKey { name = "pk_id" columnNames = [pk_id] } } rows = [ { kind = INSERT fields = [1, "A", 100] }, { kind = INSERT fields = [2, "B", 100] }, { kind = INSERT fields = [3, "C", 100] } ] } } sink { Hive { table_name = "default.test_hive_sink_on_hdfs_with_kerberos" metastore_uri = "thrift://metastore:9083" hive_site_path = "/tmp/hive-site.xml" kerberos_principal = "hive/metastore.seatunnel@EXAMPLE.COM" kerberos_keytab_path = "/tmp/hive.keytab" krb5_path = "/tmp/krb5.conf" } } ``` ## Hive on s3 ### Step 1 Create the lib dir for hive of emr. ```shell mkdir -p ${SEATUNNEL_HOME}/plugins/Hive/lib ``` ### Step 2 Get the jars from maven center to the lib. ```shell cd ${SEATUNNEL_HOME}/plugins/Hive/lib wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.6.5/hadoop-aws-2.6.5.jar wget https://repo1.maven.org/maven2/org/apache/hive/hive-exec/2.3.9/hive-exec-2.3.9.jar ``` ### Step 3 Copy the jars from your environment on emr to the lib dir. 
```shell cp /usr/share/aws/emr/emrfs/lib/emrfs-hadoop-assembly-2.60.0.jar ${SEATUNNEL_HOME}/plugins/Hive/lib cp /usr/share/aws/emr/hadoop-state-pusher/lib/hadoop-common-3.3.6-amzn-1.jar ${SEATUNNEL_HOME}/plugins/Hive/lib cp /usr/share/aws/emr/hadoop-state-pusher/lib/javax.inject-1.jar ${SEATUNNEL_HOME}/plugins/Hive/lib cp /usr/share/aws/emr/hadoop-state-pusher/lib/aopalliance-1.0.jar ${SEATUNNEL_HOME}/plugins/Hive/lib ``` ### Step 4 Run the case. ```shell env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { schema = { fields { pk_id = bigint name = string score = int } primaryKey { name = "pk_id" columnNames = [pk_id] } } rows = [ { kind = INSERT fields = [1, "A", 100] }, { kind = INSERT fields = [2, "B", 100] }, { kind = INSERT fields = [3, "C", 100] } ] } } sink { Hive { table_name = "test_hive.test_hive_sink_on_s3" metastore_uri = "thrift://ip-192-168-0-202.cn-north-1.compute.internal:9083" hive.hadoop.conf-path = "/home/ec2-user/hadoop-conf" hive.hadoop.conf = { bucket="s3://ws-package" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" } } } ``` ## Hive on oss ### Step 1 Create the lib dir for hive of emr. ```shell mkdir -p ${SEATUNNEL_HOME}/plugins/Hive/lib ``` ### Step 2 Get the jars from maven center to the lib. ```shell cd ${SEATUNNEL_HOME}/plugins/Hive/lib wget https://repo1.maven.org/maven2/org/apache/hive/hive-exec/2.3.9/hive-exec-2.3.9.jar ``` ### Step 3 Copy the jars from your environment on emr to the lib dir and delete the conflicting jar. ```shell cp -r /opt/apps/JINDOSDK/jindosdk-current/lib/jindo-*.jar ${SEATUNNEL_HOME}/plugins/Hive/lib rm -f ${SEATUNNEL_HOME}/lib/hadoop-aliyun-*.jar ``` ### Step 4 Run the case. 
```shell env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { schema = { fields { pk_id = bigint name = string score = int } primaryKey { name = "pk_id" columnNames = [pk_id] } } rows = [ { kind = INSERT fields = [1, "A", 100] }, { kind = INSERT fields = [2, "B", 100] }, { kind = INSERT fields = [3, "C", 100] } ] } } sink { Hive { table_name = "test_hive.test_hive_sink_on_oss" metastore_uri = "thrift://master-1-1.c-1009b01725b501f2.cn-wulanchabu.emr.aliyuncs.com:9083" hive.hadoop.conf-path = "/tmp/hadoop" hive.hadoop.conf = { bucket="oss://emr-osshdfs.cn-wulanchabu.oss-dls.aliyuncs.com" } } } ``` ### example 2 We have multiple source table like this: ```bash create table test_1( ) PARTITIONED BY (xx); create table test_2( ) PARTITIONED BY (xx); ... ``` We need read data from these source tables and write to another tables: The job config file can like this: ``` env { # You can set flink configuration here parallelism = 3 job.name="test_hive_source_to_hive" } source { Hive { tables_configs = [ { table_name = "test_hive.test_1" metastore_uri = "thrift://ctyun6:9083" }, { table_name = "test_hive.test_2" metastore_uri = "thrift://ctyun7:9083" } ] } } sink { # choose stdout output plugin to output data to console Hive { table_name = "${database_name}.${table_name}" metastore_uri = "thrift://ctyun7:9083" } } ``` ## Auto Table Creation Examples ### Example 1: Basic Auto Table Creation ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { schema = { fields { id = bigint name = string department = string salary = decimal(10,2) hire_date = date } } rows = [ { kind = INSERT fields = [1, "John Doe", "Engineering", 75000.50, "2022-01-15"] } ] } } sink { Hive { table_name = "warehouse.employees" metastore_uri = "thrift://metastore:9083" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" save_mode_create_template = """ CREATE TABLE IF NOT EXISTS `${database}`.`${table}` ( ${rowtype_fields} ) PARTITIONED BY ( department string COMMENT 'Department 
partition' ) STORED AS PARQUET LOCATION '${table_location}' TBLPROPERTIES ( 'seatunnel.creation.mode' = 'template' ) """ } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Http.md ================================================ import ChangeLog from '../changelog/connector-http.md'; # Http > Http sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## Description Used to launch web hooks using data. > For example, if the data from upstream is [`age: 12, name: tyrantlucifer`], the body content is the following: `{"age": 12, "name": "tyrantlucifer"}` **Tips: Http sink only support `post json` webhook and the data from source will be treated as body content in web hook.** ## Supported DataSource Info In order to use the Http connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Dependency | |------------|--------------------|------------------------------------------------------------------------------------| | Http | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-http) | ## Sink Options | Name | Type | Required | Default | Description | |-----------------------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | Http request url | | headers | Map | No | - | Http headers | | retry | Int | No | - | The max retry times if request http return to `IOException` | | retry_backoff_multiplier_ms | Int | No | 100 | The retry-backoff times(millis) multiplier if request http failed | | retry_backoff_max_ms | Int | No | 10000 | The maximum retry-backoff times(millis) if request http failed | | connect_timeout_ms | Int | No | 12000 | Connection timeout setting, default 12s. | | socket_timeout_ms | Int | No | 60000 | Socket timeout setting, default 60s. 
| | array_mode | Boolean| No | false | Send data as a JSON array when true, or as a single JSON object when false (default) | | batch_size | Int | No | 1 | The batch size of records to send in one HTTP request. Only works when array_mode is true. | | request_interval_ms | Int | No | 0 | The interval milliseconds between two HTTP requests, to avoid sending requests too frequently. | | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | ## Example simple: ```hocon Http { url = "http://localhost/test/webhook" headers { token = "9e32e859ef044462a257e1fc76730066" } } ``` ### With Batch Processing ```hocon Http { url = "http://localhost/test/webhook" headers { token = "9e32e859ef044462a257e1fc76730066" Content-Type = "application/json" } array_mode = true batch_size = 50 request_interval_ms = 500 } ``` ### Multiple table #### example1 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { Http { ... url = "http://localhost/test/${database_name}_test/${table_name}_test" } } ``` #### example2 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = oracle.jdbc.driver.OracleDriver url = "jdbc:oracle:thin:@localhost:1521/XE" user = testUser password = testPassword table_list = [ { table_path = "TESTSCHEMA.TABLE_1" }, { table_path = "TESTSCHEMA.TABLE_2" } ] } } transform { } sink { Http { ... 
url = "http://localhost/test/${schema_name}_test/${table_name}_test" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Hudi.md ================================================ import ChangeLog from '../changelog/connector-hudi.md'; # Hudi > Hudi sink connector ## Description Used to write data to Hudi. ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## Options Base configuration: | name | type | required | default value | |----------------------------|---------|----------|-----------------------------| | table_dfs_path | string | yes | - | | conf_files_path | string | no | - | | table_list | Array | no | - | | schema_save_mode | enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST| | common-options | Config | no | - | Table list configuration: | name | type | required | default value | |----------------------------|--------|----------|---------------| | table_name | string | yes | - | | database | string | no | default | | table_type | enum | no | COPY_ON_WRITE | | op_type | enum | no | insert | | record_key_fields | string | no | - | | partition_fields | string | no | - | | precombine_field | string | no | - | | batch_interval_ms | Int | no | 1000 | | batch_size | Int | no | 1000 | | insert_shuffle_parallelism | Int | no | 2 | | upsert_shuffle_parallelism | Int | no | 2 | | min_commits_to_keep | Int | no | 20 | | max_commits_to_keep | Int | no | 30 | | index_type | enum | no | BLOOM | | index_class_name | string | no | - | | record_byte_size | Int | no | 1024 | | cdc_enabled | boolean| no | false | Note: When this configuration corresponds to a single table, you can flatten the configuration items in table_list to the outer layer. ### table_name [string] `table_name` The name of hudi table. 
### database [string] `database` The database of hudi table. ### table_dfs_path [string] `table_dfs_path` The dfs root path of hudi table, such as 'hdfs://nameservice/data/hudi/'. ### table_type [enum] `table_type` The type of hudi table. The value is `COPY_ON_WRITE` or `MERGE_ON_READ`. ### record_key_fields [string] `record_key_fields` The record key fields of hudi table, which are used to generate the record key. It must be configured when op_type is `UPSERT`. ### partition_fields [string] `partition_fields` The partition key fields of hudi table, which are used to generate the partition. ### precombine_field [string] `precombine_field` The precombine field of hudi table, which is used for pre-combining records before the actual write. ### index_type [string] `index_type` The index type of hudi table. Currently, `BLOOM`, `SIMPLE`, and `GLOBAL SIMPLE` are supported. ### index_class_name [string] `index_class_name` The customized index class of hudi table, for example `org.apache.seatunnel.connectors.seatunnel.hudi.index.CustomHudiIndex`. ### record_byte_size [Int] `record_byte_size` The byte size of each record. This value can be used to help calculate the approximate number of records in each hudi data file. Adjusting this value can effectively reduce the write amplification of hudi data files. ### conf_files_path [string] `conf_files_path` The list of environment conf file paths (local paths), which is used to initialize the hdfs client that reads hudi table files. For example: '/home/test/hdfs-site.xml;/home/test/core-site.xml;/home/test/yarn-site.xml'. ### op_type [enum] `op_type` The operation type of hudi table. The value is `insert` or `upsert` or `bulk_insert`. ### batch_interval_ms [Int] `batch_interval_ms` The interval time of batch write to hudi table. ### batch_size [Int] `batch_size` The size of batch write to hudi table. ### insert_shuffle_parallelism [Int] `insert_shuffle_parallelism` The parallelism of insert data to hudi table.
### upsert_shuffle_parallelism [Int] `upsert_shuffle_parallelism` The parallelism of upsert data to hudi table. ### min_commits_to_keep [Int] `min_commits_to_keep` The min commits to keep of hudi table. ### max_commits_to_keep [Int] `max_commits_to_keep` The max commits to keep of hudi table. ### cdc_enabled [boolean] `cdc_enabled` Whether to persist the CDC change log. When enabled, the change data is persisted if necessary, and the table can be queried in CDC query mode. ### schema_save_mode [Enum] Before the synchronization task starts, choose how the existing table structure on the target side should be handled. Option introduction: `RECREATE_SCHEMA`: The table will be created when it does not exist, and dropped and recreated when it already exists `CREATE_SCHEMA_WHEN_NOT_EXIST`: The table will be created when it does not exist, and skipped when it already exists `ERROR_WHEN_SCHEMA_NOT_EXIST`: An error will be reported when the table does not exist `IGNORE`: Ignore the treatment of the table ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details.
## Examples ### single table ```hocon sink { Hudi { table_dfs_path = "hdfs://nameserivce/data/" database = "st" table_name = "test_table" table_type = "COPY_ON_WRITE" conf_files_path = "/home/test/hdfs-site.xml;/home/test/core-site.xml;/home/test/yarn-site.xml" batch_size = 10000 use.kerberos = true kerberos.principal = "test_user@xxx" kerberos.principal.file = "/home/test/test_user.keytab" } } ``` ### Multiple table ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { Hudi { table_dfs_path = "hdfs://nameserivce/data/" conf_files_path = "/home/test/hdfs-site.xml;/home/test/core-site.xml;/home/test/yarn-site.xml" table_list = [ { database = "st1" table_name = "role" table_type = "COPY_ON_WRITE" op_type="INSERT" batch_size = 10000 }, { database = "st1" table_name = "user" table_type = "COPY_ON_WRITE" op_type="UPSERT" # op_type is 'UPSERT', must configured record_key_fields record_key_fields = "user_id" batch_size = 10000 }, { database = "st1" table_name = "Bucket" table_type = "MERGE_ON_READ" } ] ... } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/HugeGraph.md ================================================ import ChangeLog from '../changelog/connector-hugegraph.md'; # HugeGraph Sink Connector `Sink: HugeGraph` ## Description The HugeGraph sink connector allows you to write data from SeaTunnel to Apache HugeGraph, a fast and scalable graph database. This connector supports writing data as vertices or edges, providing flexible mapping from relational data models to graph structures. It is designed for high-performance data loading. ## Features - **Batch Writing**: Data is written in batches for high throughput. 
- **Flexible Mapping**: Supports flexible mapping of source fields to vertex/edge properties. - **Vertex and Edge Writing**: Can write data as either vertices or edges. - **Automatic Schema Creation**: Can automatically create graph schema elements (property keys, vertex labels, edge labels) if they do not exist. ## Configuration Options | Name | Type | Required | Default Value | Description | | ------------------- | ------- | -------- | ------------- |--------------------------------------------------------------------------------| | `host` | String | Yes | - | The host of the HugeGraph server. | | `port` | Integer | Yes | - | The port of the HugeGraph server. | | `graph_name` | String | Yes | - | The name of the graph to write to. | | `graph_space` | String | Yes | - | The graph space of the graph to be operated on. | | `username` | String | No | - | The username for HugeGraph authentication. | | `password` | String | No | - | The password for HugeGraph authentication. | | `batch_size` | Integer | No | 500 | The number of records to buffer before writing to HugeGraph in a single batch. | | `batch_interval_ms` | Integer | No | 5000 | The maximum time in milliseconds to wait before flushing a batch. | | `max_retries` | Integer | No | 3 | The maximum number of times to retry a failed write operation. | | `retry_backoff_ms` | Integer | No | 5000 | The backoff time between retries in milliseconds. | ## Sink Options | Name | Type | Required | Default Value | Description | | ------------------ | ------ | -------- | ------------- |-----------------------------------------------------------------------------------------------------| | `schema_config` | Object | Yes | - | The configuration for mapping the input data to HugeGraph's schema (vertices or edges). | | `selected_fields` | List | No | - | A list of fields to be selected from the input data. If not specified, all fields will be used. 
| | `ignored_fields` | List | No | - | A list of fields to be ignored from the input data. Mutually exclusive with `selected_fields`. | ### Schema Configuration (`schema_config`) Each object in the `schema_config` list defines a mapping from the source data to a specific vertex or edge label in HugeGraph. | Name | Type | Required | Default Value | Description | | ------------------ |--------------------| ---------- | ------------- |----------------------------------------------------------------------------------------------------------| | `type` | String | Yes | - | The type of graph element to map to. Must be `VERTEX` or `EDGE`. | | `label` | String | Yes | - | The label of the vertex or edge in HugeGraph. | | `properties` | `List` | No | - | A list of source field names for the vertex or edge. | | `ttl` | Long | No | - | The time-to-live for the vertex or edge in seconds. | | `ttlStartTime` | String | No | - | The start time for the TTL. | | `enableLabelIndex` | Boolean | No | `false` | Whether to enable label index for this label. | | `userdata` | `Map` | No | - | User-defined data associated with the label. | | `idStrategy` | String | For Vertex | - | The ID generation strategy for vertices. Supported values: `PRIMARY_KEY`, `CUSTOMIZE_UUID`, `AUTOMATIC`. | | `idFields` | `List` | For Vertex | - | A list of source field names used to generate the vertex ID. | | `sourceConfig` | Object | For Edge | - | An object defining the mapping for the edge's source vertex. See `Source/Target Config` below. | | `targetConfig` | Object | For Edge | - | An object defining the mapping for the edge's target vertex. See `Source/Target Config` below. | | `frequency` | String | For Edge | - | The frequency of the edge, e.g., `SINGLE`, `MULTIPLE`. | | `mapping` | Object | No | - | An object defining advanced field and value mappings. See `Mapping Config` below. 
| ### Source/Target Config (`sourceConfig` and `targetConfig`) This object is used within an `EDGE` schema to define how to identify the source and target vertices. | Name | Type | Required | Default Value | Description | | ---------- | ------------ | -------- | ------------- |--------------------------------------------------------------------------------------------------------------------------------------------------------------| | `label` | String | Yes | - | The label of the source or target vertex. | | `idFields` | `List` | Yes | - | A list of source field names from the input row used to construct the ID of the source/target vertex. The values will be concatenated to form the vertex ID. | ### Mapping Config (`mapping`) This object provides advanced control over how fields and values are mapped to properties. | Name | Type | Required | Default Value | Description | | ----------------- |---------------------|----------| ------------- |-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | `fieldMapping` | `Map` | No | - | A map where the key is the source field name and the value is the target property name in HugeGraph. If not specified, the source field name is used as the target property name. | | `valueMapping` | `Map` | No | - | A map to transform specific field values. The key is the original value from the source, and the value is the new value to be written. | | `nullableKeys` | `List` | No | - | A list of property keys that can have null values. | | `nullValues` | `List` | No | - | A list of string values that should be treated as `null`. Any field containing one of these values will not be written. | | `dateFormat` | String | No | `yyyy-MM-dd` | The date format for parsing date strings. | | `timeZone` | String | No | `GMT+8` | The time zone for date parsing. 
| | `sortKeys` | `List` | For Edge | - | A list of property keys to sort edges with the same source and target vertices. | ## Usage Examples ### 1. Writing Vertices This example shows how to read from a `FakeSource` and write `person` vertices to HugeGraph. The vertex ID is based on the `name` field. ```hocon env { job.mode = "BATCH" } source { FakeSource { plugin_input = "fake_source" schema = { fields = { name = "string" age = "int" } } } } sink { HugeGraph { host = "localhost" port = 8080 graph_name = "hugegraph" graph_space = "default" selected_fields = ["name", "age"] schema_config = { type = "VERTEX" label = "person" idStrategy = "PRIMARY_KEY" idFields = ["name"] properties = ["name", "age"] } } } ``` ### 2. Writing Edges This example syncs a relationship table to `knows` edges in HugeGraph. The source table contains the names of the two people who know each other and the year they met. ```hocon env { job.mode = "BATCH" } source { FakeSource { plugin_input = "fake_source" schema = { fields = { person1_name = "string" person2_name = "string" since = "int" } } } } sink { HugeGraph { host = "localhost" port = 8080 graph_name = "hugegraph" graph_space = "default" schema_config = { type = "EDGE" label = "knows" sourceConfig = { label = "person" idFields = ["person1_name"] } targetConfig = { label = "person" idFields = ["person2_name"] } properties = ["since"] mapping = { fieldMapping = { person1_name = "name" person2_name = "name" } } } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Iceberg.md ================================================ import ChangeLog from '../changelog/connector-iceberg.md'; # Apache Iceberg > Apache Iceberg sink connector ## Support Iceberg Version - 1.6.1 ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Sink connector for Apache Iceberg. It supports CDC mode, automatic table creation, and table schema evolution. ## Key features - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## Supported DataSource Info | Datasource | Dependent | Maven | |------------|-----------|---------------------------------------------------------------------------| | Iceberg | hive-exec | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) | | Iceberg | libfb303 | [Download](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) | ## Database Dependency > In order to be compatible with different versions of Hadoop and Hive, the scope of hive-exec in the project pom file is `provided`. If you use the Flink engine, you may first need to add the following Jar packages to the `<FLINK_HOME>/lib` directory; if you are using the Spark engine and it is integrated with Hadoop, you do not need to add the following Jar packages. ``` hive-exec-xxx.jar libfb303-xxx.jar ``` > Some versions of the hive-exec package do not include libfb303-xxx.jar, so you also need to import that Jar package manually. ## Data Type Mapping | SeaTunnel Data type | Iceberg Data type | |---------------------|-------------------| | BOOLEAN | BOOLEAN | | INT | INTEGER | | BIGINT | LONG | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | STRING | STRING | | BYTES | FIXED
BINARY | | DECIMAL | DECIMAL | | ROW | STRUCT | | ARRAY | LIST | | MAP | MAP | ## Sink Options | Name | Type | Required | Default | Description | |----------------------------------------|---------|----------|------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | catalog_name | string | yes | default | User-specified catalog name. default is `default` | | namespace | string | yes | default | The iceberg database name in the backend catalog. default is `default` | | table | string | yes | - | The iceberg table name in the backend catalog. | | iceberg.catalog.config | map | yes | - | Specify the properties for initializing the Iceberg catalog, which can be referenced in this file: [CatalogProperties.java](https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/CatalogProperties.java) | | hadoop.config | map | no | - | Properties passed through to the Hadoop configuration | | iceberg.hadoop-conf-path | string | no | - | The specified loading paths for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files. | | case_sensitive | boolean | no | false | If data columns where selected via schema [config], controls whether the match to the schema will be done with case sensitivity. | | iceberg.table.write-props | map | no | - | Properties passed through to Iceberg writer initialization, these take precedence, such as 'write.format.default', 'write.target-file-size-bytes', and other settings, can be found with specific parameters at [TableProperties.java](https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/TableProperties.java). 
| | iceberg.table.auto-create-props | map | no | - | Configuration specified by Iceberg during automatic table creation. | | iceberg.table.schema-evolution-enabled | boolean | no | false | Setting to true enables Iceberg tables to support schema evolution during the synchronization process | | iceberg.table.primary-keys | string | no | - | Default comma-separated list of columns that identify a row in tables (primary key) | | iceberg.table.partition-keys | string | no | - | Default comma-separated list of partition fields to use when creating tables. Supports placeholder `${partition_keys}` for multi-table jobs | | iceberg.table.upsert-mode-enabled | boolean | no | false | Set to `true` to enable upsert mode, default is `false` | | schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | the schema save mode, please refer to `schema_save_mode` below | | data_save_mode | Enum | no | APPEND_DATA | the data save mode, please refer to `data_save_mode` below | | custom_sql | string | no | - | Custom `delete` data sql for data save mode. e.g: `delete from ... 
where ...` | | iceberg.table.commit-branch | string | no | - | Default branch for commits | ## Task Example ### Simple ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MySQL-CDC { plugin_output = "customers_mysql_cdc_iceberg" server-id = 5652 username = "st_user" password = "seatunnel" table-names = ["mysql_cdc.mysql_cdc_e2e_source_table"] url = "jdbc:mysql://mysql_cdc_e2e:3306/mysql_cdc" } } transform { } sink { Iceberg { catalog_name="seatunnel_test" iceberg.catalog.config={ "type"="hadoop" "warehouse"="file:///tmp/seatunnel/iceberg/hadoop-sink/" } namespace="seatunnel_namespace" table="iceberg_sink_table" iceberg.table.write-props={ write.format.default="parquet" write.target-file-size-bytes=536870912 } iceberg.table.primary-keys="id" iceberg.table.partition-keys="f_datetime" iceberg.table.upsert-mode-enabled=true iceberg.table.schema-evolution-enabled=true case_sensitive=true } } ``` ### Hive Catalog ```hocon sink { Iceberg { catalog_name = "seatunnel_test" iceberg.catalog.config = { type = "hive" uri = "thrift://localhost:9083" warehouse = "hdfs://your_cluster/tmp/seatunnel/iceberg/" } namespace = "seatunnel_namespace" table = "iceberg_sink_table" iceberg.table.write-props = { write.format.default = "parquet" write.target-file-size-bytes = 536870912 } iceberg.table.primary-keys = "id" iceberg.table.partition-keys = "f_datetime" iceberg.table.upsert-mode-enabled = true iceberg.table.schema-evolution-enabled = true case_sensitive = true } } ``` ### Hadoop Catalog ```hocon sink { Iceberg { catalog_name = "seatunnel_test" iceberg.catalog.config = { type = "hadoop" warehouse = "hdfs://your_cluster/tmp/seatunnel/iceberg/" } namespace = "seatunnel_namespace" table = "iceberg_sink_table" iceberg.table.write-props = { write.format.default = "parquet" write.target-file-size-bytes = 536870912 } iceberg.table.primary-keys = "id" iceberg.table.partition-keys = "f_datetime" iceberg.table.upsert-mode-enabled = true 
iceberg.table.schema-evolution-enabled = true case_sensitive = true } } ``` ### Glue Catalog ```hocon sink { Iceberg { catalog_name = "seatunnel_test" iceberg.catalog.config = { warehouse = "s3://your-bucket/warehouse/" catalog-impl = "org.apache.iceberg.aws.glue.GlueCatalog" io-impl = "org.apache.iceberg.aws.s3.S3FileIO" client.region = "your-region" } namespace = "seatunnel_namespace" table = "iceberg_sink_table" iceberg.table.write-props = { write.format.default = "parquet" write.target-file-size-bytes = 536870912 } iceberg.table.primary-keys = "id" iceberg.table.partition-keys = "f_datetime" iceberg.table.upsert-mode-enabled = true iceberg.table.schema-evolution-enabled = true case_sensitive = true } } ``` ### AWS S3 Tables REST Catalog Amazon S3 Tables is a storage service for tabular data that's optimized for analytics workloads, with features designed to continuously improve query performance and reduce storage costs for tables. S3 Tables is purpose-built for storing tabular data, such as daily purchase transactions, streaming sensor data, or ad impressions. Tabular data represents data in columns and rows, like in a database table. You can connect an Iceberg REST client to the Amazon S3 Tables Iceberg REST endpoint and then make REST API calls to create, update, or query tables in S3 table buckets. The endpoint implements a standardized set of Iceberg REST APIs specified in the Apache Iceberg REST Catalog Open API specification. The endpoint works by translating Iceberg REST API operations to corresponding S3 Tables operations. Data in S3 Tables is stored in a new bucket type: table buckets, which store tables as subresources. Table buckets support storing tables in Apache Iceberg format. Using standard SQL statements, you can query tables through Iceberg-compatible query engines such as Amazon Athena, Amazon Redshift, and Apache Spark. 
```hocon sink { Iceberg { catalog_name = "s3_tables_catalog" namespace = "s3_tables_catalog" table = "user_data" iceberg.catalog.config = { type: "rest" warehouse: "arn:aws:s3tables:<region>:<account-id>:bucket/<bucket-name>" uri: "https://s3tables.<region>.amazonaws.com/iceberg" rest.sigv4-enabled: "true" rest.signing-name: "s3tables" rest.signing-region: "<region>" } } } ``` ### Multiple table #### example1 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { Iceberg { ... namespace = "${database_name}_test" table = "${table_name}_test" } } ``` #### example2 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = oracle.jdbc.driver.OracleDriver url = "jdbc:oracle:thin:@localhost:1521/XE" user = testUser password = testPassword table_list = [ { table_path = "TESTSCHEMA.TABLE_1" }, { table_path = "TESTSCHEMA.TABLE_2" } ] } } transform { } sink { Iceberg { ... namespace = "${schema_name}_test" table = "${table_name}_test" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/InfluxDB.md ================================================ import ChangeLog from '../changelog/connector-influxdb.md'; # InfluxDB > InfluxDB sink connector ## Description Write data to InfluxDB.
## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------------|--------|----------|------------------------------| | url | string | yes | - | | database | string | yes | | | measurement | string | yes | | | username | string | no | - | | password | string | no | - | | key_time | string | no | processing time | | key_tags | array | no | exclude `field` & `key_time` | | batch_size | int | no | 1024 | | max_retries | int | no | - | | retry_backoff_multiplier_ms | int | no | - | | connect_timeout_ms | long | no | 15000 | | common-options | config | no | - | ### url the url to connect to influxDB e.g. ``` http://influxdb-host:8086 ``` ### database [string] The name of `influxDB` database ### measurement [string] The name of `influxDB` measurement ### username [string] `influxDB` user username ### password [string] `influxDB` user password ### key_time [string] Specify field-name of the `influxDB` measurement timestamp in SeaTunnelRow. If not specified, use processing-time as timestamp ### key_tags [array] Specify field-name of the `influxDB` measurement tags in SeaTunnelRow. 
If not specified, all fields are included as `influxDB` measurement fields ### batch_size [int] For batch writing, when the number of buffered records reaches `batch_size` or the time reaches `checkpoint.interval`, the data will be flushed into InfluxDB ### max_retries [int] The number of retries when a flush fails ### retry_backoff_multiplier_ms [int] Used as a multiplier for generating the next backoff delay ### max_retry_backoff_ms [int] The amount of time to wait before attempting to retry a request to `influxDB` ### connect_timeout_ms [long] The timeout for connecting to InfluxDB, in milliseconds ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## Examples ```hocon sink { InfluxDB { url = "http://influxdb-host:8086" database = "test" measurement = "sink" key_time = "time" key_tags = ["label"] batch_size = 1 } } ``` ### Multiple table #### example1 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { InfluxDB { url = "http://influxdb-host:8086" database = "test" measurement = "${table_name}_test" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/IoTDB.md ================================================ import ChangeLog from '../changelog/connector-iotdb.md'; # IoTDB > IoTDB sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Used to write data to IoTDB. ## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) > IoTDB supports the `exactly-once` feature through idempotent writing. If multiple data have the same `key` and `timestamp`, the latest one will overwrite the previous one. ## Supported DataSource Info | Datasource | Supported Versions | Url | |------------|------------------------------|----------------| | IoTDB | `0.13.0 <= version <= 1.3.X` | localhost:6667 | ## Data Type Mapping | IotDB Data Type | SeaTunnel Data Type | |-----------------|---------------------| | BOOLEAN | BOOLEAN | | INT32 | TINYINT | | INT32 | SMALLINT | | INT32 | INT | | INT64 | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | TEXT | STRING | ## Sink Options | Name | Type | Required | Default | Description | |-----------------------------|---------|----------|--------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------| | node_urls | Array | Yes | - | IoTDB cluster address, the format is `["host1:port"]` or `["host1:port","host2:port"]` | | username | String | Yes | - | IoTDB user username | | password | String | Yes | - | IoTDB user password | | key_device | String | Yes | - | Specify field name of the IoTDB deviceId in SeaTunnelRow | | key_timestamp | String | No | processing time | Specify field-name of the IoTDB timestamp in SeaTunnelRow. If not specified, use processing-time as timestamp | | key_measurement_fields | Array | No | exclude `device` & `timestamp` | Specify field-name of the IoTDB measurement list in SeaTunnelRow. If not specified, include all fields but exclude `device` & `timestamp` | | storage_group | Array | No | - | Specify device storage group(path prefix)
example: deviceId = \${storage_group} + "." + \${key_device} | | batch_size | Integer | No | 1024 | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the IoTDB | | max_retries | Integer | No | - | The number of retries to flush failed | | retry_backoff_multiplier_ms | Integer | No | - | Using as a multiplier for generating the next delay for backoff | | max_retry_backoff_ms | Integer | No | - | The amount of time to wait before attempting to retry a request to `IoTDB` | | default_thrift_buffer_size | Integer | No | - | Thrift init buffer size in IoTDB client | | max_thrift_frame_size | Integer | No | - | Thrift max frame size in IoTDB client | | zone_id | string | No | - | java.time.ZoneId in IoTDB client | | enable_rpc_compression | Boolean | No | - | Enable rpc compression in IoTDB client | | connection_timeout_in_ms | Integer | No | - | The maximum time (in ms) to wait when connecting to IoTDB | | common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | ## Examples ```hocon env { parallelism = 2 job.mode = "BATCH" } source { FakeSource { row.num = 16 bigint.template = [1664035200001] schema = { fields { device_name = "string" temperature = "float" moisture = "int" event_ts = "bigint" c_string = "string" c_boolean = "boolean" c_tinyint = "tinyint" c_smallint = "smallint" c_int = "int" c_bigint = "bigint" c_float = "float" c_double = "double" } } } } ``` The data format from upstream SeaTunnelRow is as follows: | device_name | temperature | moisture | event_ts | c_string | c_boolean | c_tinyint | c_smallint | c_int | c_bigint | c_float | c_double | |--------------------------|-------------|----------|---------------|----------|-----------|-----------|------------|-------|------------|---------|----------| | root.test_group.device_a | 36.1 | 100 | 1664035200001 | abc1 | 
true | 1 | 1 | 1 | 2147483648 | 1.0 | 1.0 | | root.test_group.device_b | 36.2 | 101 | 1664035200001 | abc2 | false | 2 | 2 | 2 | 2147483649 | 2.0 | 2.0 | | root.test_group.device_c | 36.3 | 102 | 1664035200001 | abc3 | false | 3 | 3 | 3 | 2147483649 | 3.0 | 3.0 | ### Case1 Only required options used: - use current processing time as timestamp - measurement fields include all fields excluding `key_device` ```hocon sink { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" key_device = "device_name" # specify the `deviceId` use device_name field } } ``` The data format of IoTDB output is as follows: ```shell IoTDB> SELECT * FROM root.test_group.* align by device; +------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ | Time| Device| temperature| moisture| event_ts| c_string| c_boolean| c_tinyint| c_smallint| c_int| c_bigint| c_float| c_double| +------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ |2023-09-01T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1664035200001| abc1| true| 1| 1| 1| 2147483648| 1.0| 1.0| |2023-09-01T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 1664035200001| abc2| false| 2| 2| 2| 2147483649| 2.0| 2.0| |2023-09-01T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 1664035200001| abc3| false| 3| 3| 3| 2147483649| 3.0| 3.0| +------------------------+------------------------+--------------+-----------+--------------+---------+---------+-----------+-----------+------+-----------+--------+---------+ ``` ### Case2 Use source event's time: - use `key_timestamp` as timestamp - measurement fields include all fields excluding `key_device` & `key_timestamp` ```hocon sink { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" key_device = "device_name" # specify
the `deviceId` use device_name field key_timestamp = "event_ts" # specify the `timestamp` use event_ts field } } ``` The data format of IoTDB output is as follows: ```shell IoTDB> SELECT * FROM root.test_group.* align by device; +------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ | Time| Device| temperature| moisture| event_ts| c_string| c_boolean| c_tinyint| c_smallint| c_int| c_bigint| c_float| c_double| +------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ |2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1664035200001| abc1| true| 1| 1| 1| 2147483648| 1.0| 1.0| |2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 1664035200001| abc2| false| 2| 2| 2| 2147483649| 2.0| 2.0| |2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 1664035200001| abc3| false| 3| 3| 3| 2147483649| 3.0| 3.0| +------------------------+------------------------+--------------+-----------+--------------+---------+---------+-----------+-----------+------+-----------+--------+---------+ ``` ### Case3 Use source event's time and limit measurement fields: - use `key_timestamp` as timestamp - measurement fields include only fields specified in `key_measurement_fields` ```hocon sink { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" key_device = "device_name" key_timestamp = "event_ts" key_measurement_fields = ["temperature", "moisture"] } } ``` The data format of IoTDB output is as follows: ```shell IoTDB> SELECT * FROM root.test_group.* align by device; +------------------------+------------------------+--------------+-----------+ | Time| Device| temperature| moisture| +------------------------+------------------------+--------------+-----------+ |2022-09-25T00:00:00.001Z|root.test_group.device_a|
36.1| 100| |2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| |2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| +------------------------+------------------------+--------------+-----------+ ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/IoTDBv2.md ================================================ import ChangeLog from '../changelog/connector-iotdb.md'; # IoTDB > IoTDB sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Used to write data to IoTDB. ## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) > IoTDB supports the `exactly-once` feature through idempotent writing. If multiple data have the same `key` and `timestamp`, the latest one will overwrite the previous one. ## Supported DataSource Info | Datasource | Supported Versions | Url | |------------|--------------------|----------------| | IoTDB | `2.0 <= version` | localhost:6667 | ## Data Type Mapping | SeaTunnel Data Type | IoTDB Data Type | |---------------------|-----------------| | BOOLEAN | BOOLEAN | | TINYINT | INT32 | | SMALLINT | INT32 | | INT | INT32 | | BIGINT | INT64 | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | STRING | STRING | | TIMESTAMP | TIMESTAMP | | DATE | DATE | ## Sink Options | Name | Type | Required | Default | Description | |-----------------------------|---------|----------|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | node_urls | Array | Yes | - | IoTDB cluster address, the format is `["host1:port"]` or `["host1:port","host2:port"]` | | username | String | Yes | - | IoTDB username | | password | String | Yes | - | IoTDB user password | | sql_dialect | String | No | tree | the sql dialect of IoTDB, options available is `"tree"` or `"table"` | | storage_group | String | Yes | - | IoTDB-tree: Specify the device storage group(path prefix)
example: deviceId = \${storage_group} + "." + \${key_device}
IoTDB-table: Specify the database | | key_device | String | Yes | - | IoTDB-tree: Specify the field name in SeaTunnelRow to be used as device id
IoTDB-table: Specify the field name in SeaTunnelRow to be used as table name | | key_timestamp | String | No | processing time | IoTDB-tree: Specify the field name in SeaTunnelRow to be used as timestamp (processing time will be used by default)
IoTDB-table: Specify the field name in SeaTunnelRow to be used as time column (processing time will be used by default) | | key_measurement_fields | Array | No | refer to description | IoTDB-tree: Specify the field names in SeaTunnelRow to be used as measurement (all fields excluding `key_device`&`key_timestamp` will be used by default)
IoTDB-table: Specify the field names in SeaTunnelRow to be used as FIELD columns (all fields excluding `key_device`, `key_timestamp`, `key_tag_fields` and `key_attribute_fields` will be used by default) | | key_tag_fields | Array | No | - | IoTDB-tree: not applicable
IoTDB-table: Specify the field names in SeaTunnelRow to be used as TAG columns | | key_attribute_fields | Array | No | - | IoTDB-tree: not applicable
IoTDB-table: Specify the field names in SeaTunnelRow to be used as ATTRIBUTE columns | | batch_size | Integer | No | 1024 | In batch writing, the data will be flushed into the IoTDB either when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms` | | max_retries | Integer | No | - | The number of times retrying to flush | | retry_backoff_multiplier_ms | Integer | No | - | Used as a multiplier for generating the next delay for backoff | | max_retry_backoff_ms | Integer | No | - | The amount of time to wait before attempting to retry a request to IoTDB | | default_thrift_buffer_size | Integer | No | - | Thrift init buffer size in IoTDB client | | max_thrift_frame_size | Integer | No | - | Thrift max frame size in IoTDB client | | zone_id | String | No | - | java.time.ZoneId in IoTDB client | | enable_rpc_compression | Boolean | No | - | Enable rpc compression in IoTDB client, only valid in IoTDB-tree | | connection_timeout_in_ms | Integer | No | - | The maximum time (in ms) to wait when connecting to IoTDB | | common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | ## Examples ### Example 1: Write data to IoTDB-tree ```hocon env { parallelism = 2 job.mode = "BATCH" } source { FakeSource { row.num = 16 bigint.template = [1664035200001] schema = { fields { device_name = "string" temperature = "float" moisture = "int" event_ts = "bigint" c_string = "string" c_boolean = "boolean" c_tinyint = "tinyint" c_smallint = "smallint" c_int = "int" c_bigint = "bigint" c_float = "float" c_double = "double" } } } } ``` The data format from upstream SeaTunnelRow is as follows: | device_name | temperature | moisture | event_ts | c_string | c_boolean | c_tinyint | c_smallint | c_int | c_bigint | c_float | c_double | 
|--------------------------|-------------|----------|---------------|----------|-----------|-----------|------------|-------|------------|---------|----------| | root.test_group.device_a | 36.1 | 100 | 1664035200001 | abc1 | true | 1 | 1 | 1 | 2147483648 | 1.0 | 1.0 | | root.test_group.device_b | 36.2 | 101 | 1664035200001 | abc2 | false | 2 | 2 | 2 | 2147483649 | 2.0 | 2.0 | | root.test_group.device_c | 36.3 | 102 | 1664035200001 | abc3 | false | 3 | 3 | 3 | 2147483649 | 3.0 | 3.0 | #### Case 1 Only required options used: - use current processing time as timestamp - measurement fields include all fields excluding `key_device` ```hocon sink { IoTDB { node_urls = "localhost:6667" username = "root" password = "root" key_device = "device_name" # specify the `deviceId` use device_name field } } ``` The data format of IoTDB output is as follows: ```shell IoTDB> SELECT * FROM root.test_group.* align by device; +------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ | Time| Device| temperature| moisture| event_ts| c_string| c_boolean| c_tinyint| c_smallint| c_int| c_bigint| c_float| c_double| +------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ |2023-09-01T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1664035200001| abc1| true| 1| 1| 1| 2147483648| 1.0| 1.0| |2023-09-01T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 1664035200001| abc2| false| 2| 2| 2| 2147483649| 2.0| 2.0| |2023-09-01T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 1664035200001| abc2| false| 3| 3| 3| 2147483649| 3.0| 3.0| +------------------------+------------------------+--------------+-----------+--------------+---------+---------+-----------+-----------+------+-----------+--------+---------+ ``` #### Case 2 Use source event's time: - use 
`key_timestamp` as timestamp - measurement fields include all fields excluding `key_device` & `key_timestamp` ```hocon sink { IoTDB { node_urls = "localhost:6667" username = "root" password = "root" key_device = "device_name" # specify the `deviceId` use device_name field key_timestamp = "event_ts" # specify the `timestamp` use event_ts field } } ``` The data format of IoTDB output is as follows: ```shell IoTDB> SELECT * FROM root.test_group.* align by device; +------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ | Time| Device| temperature| moisture| event_ts| c_string| c_boolean| c_tinyint| c_smallint| c_int| c_bigint| c_float| c_double| +------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ |2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1664035200001| abc1| true| 1| 1| 1| 2147483648| 1.0| 1.0| |2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 1664035200001| abc2| false| 2| 2| 2| 2147483649| 2.0| 2.0| |2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 1664035200001| abc2| false| 3| 3| 3| 2147483649| 3.0| 3.0| +------------------------+------------------------+--------------+-----------+--------------+---------+---------+-----------+-----------+------+-----------+--------+---------+ ``` #### Case 3 Use source event's time and limit measurement fields: - use `key_timestamp` as timestamp - measurement fields include only fields specified in `key_measurement_fields` ```hocon sink { IoTDB { node_urls = "localhost:6667" username = "root" password = "root" key_device = "device_name" key_timestamp = "event_ts" key_measurement_fields = ["temperature", "moisture"] } } ``` The data format of IoTDB output is as follows: ```shell IoTDB> SELECT * FROM root.test_group.* align by device; 
+------------------------+------------------------+--------------+-----------+ | Time| Device| temperature| moisture| +------------------------+------------------------+--------------+-----------+ |2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| |2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| |2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| +------------------------+------------------------+--------------+-----------+ ``` ### Example 2: Write data into IoTDB-table ```hocon env { parallelism = 2 job.mode = "BATCH" } source { FakeSource { ... schema = { fields { ts = timestamp model_id = string region = string tag = string status = boolean arrival_date = date temperature = double } } } } ``` The data format from upstream SeaTunnelRow is as follows: | ts | model_id | region | tag | status | arrival_date | temperature | |-------------------------|----------|--------|------|--------|--------------|-------------| | 2025-07-30T17:52:34.851 | id1 | 0700HK | tag1 | true | 2024-11-12 | 4.34 | | 2025-07-29T17:51:34.851 | id2 | 0700HK | tag2 | false | 2024-12-01 | 5.54 | | 2025-07-28T17:50:34.851 | id3 | 0700HK | tag3 | false | 2024-12-22 | 7.34 | #### Case 1 Only required options used: - use current processing time as timestamp - FIELD columns include all fields excluding `key_device` ```hocon sink { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" sql_dialect = "table" storage_group = "test_database" key_device = "region" } } ``` The data format of IoTDB output is as follows: ```shell IoTDB> SELECT * FROM "test_database"."0700HK"; +-----------------------------+-----------------------+--------+----+------+------------+-----------+ | time| ts|model_id| tag|status|arrival_date|temperature| +-----------------------------+-----------------------+--------+----+------+------------+-----------+ |2025-08-14T17:52:34.851+08:00|2025-07-30T17:52:34.851| id1|tag1| true| 2024-11-12| 4.34| 
|2025-08-14T17:51:34.851+08:00|2025-07-29T17:51:34.851| id2|tag2| false| 2024-12-01| 5.54| |2025-08-14T17:50:34.851+08:00|2025-07-28T17:50:34.851| id3|tag3| false| 2024-12-22| 7.34| +-----------------------------+-----------------------+--------+----+------+------------+-----------+ ``` ```shell IoTDB> DESC "test_database"."0700HK"; +------------+---------+--------+ | ColumnName| DataType|Category| +------------+---------+--------+ | time|TIMESTAMP| TIME| | ts|TIMESTAMP| FIELD| | model_id| STRING| FIELD| | tag| STRING| FIELD| | status| BOOLEAN| FIELD| |arrival_date| DATE| FIELD| | temperature| DOUBLE| FIELD| +------------+---------+--------+ ``` #### Case 2 Use source event's time and limit TAG and ATTRIBUTE columns: - use `key_timestamp` as time column - use specified fields as TAG columns and ATTRIBUTE columns - FIELD columns include all fields excluding `key_device`,`key_timestamp`,`key_tag_fields`and`key_attribute_fields` ```hocon sink { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" sql_dialect = "table" storage_group = "test_database" key_device = "region" key_timestamp = "ts" key_tag_fields = ["tag"] key_attribute_fields = ["model_id"] } } ``` The data format of IoTDB output is as follows: ```shell IoTDB> SELECT * FROM "test_database"."0700HK"; +-----------------------------+----+--------+------+------------+-----------+ | time| tag|model_id|status|arrival_date|temperature| +-----------------------------+----+--------+------+------------+-----------+ |2025-07-30T17:52:34.851+08:00|tag1| id1| true| 2024-11-12| 4.34| |2025-07-29T17:51:34.851+08:00|tag2| id2| false| 2024-12-01| 5.54| |2025-07-28T17:50:34.851+08:00|tag3| id3| false| 2024-12-22| 7.34| +-----------------------------+----+--------+------+------------+-----------+ ``` ```shell IoTDB> DESC "test_database"."0700HK"; +------------+---------+---------+ | ColumnName| DataType| Category| +------------+---------+---------+ | time|TIMESTAMP| TIME| | tag| STRING| TAG| | model_id| 
STRING|ATTRIBUTE| | status| BOOLEAN| FIELD| |arrival_date| DATE| FIELD| | temperature| DOUBLE| FIELD| +------------+---------+---------+ ``` #### Case 3 Use source event's time and limit FIELD columns: - use `key_timestamp` as time column - use specified fields as FIELD columns ```hocon sink { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" sql_dialect = "table" storage_group = "test_database" key_device = "region" key_timestamp = "ts" key_measurement_fields = ["status", "temperature"] } } ``` The data format of IoTDB output is as follows: ```shell IoTDB> SELECT * FROM "test_database"."0700HK"; +-----------------------------+------+-----------+ | time|status|temperature| +-----------------------------+------+-----------+ |2025-07-30T17:52:34.851+08:00| true| 4.34| |2025-07-29T17:51:34.851+08:00| false| 5.54| |2025-07-28T17:50:34.851+08:00| false| 7.34| +-----------------------------+------+-----------+ ``` ```shell IoTDB> DESC "test_database"."0700HK"; +-----------+---------+--------+ | ColumnName| DataType|Category| +-----------+---------+--------+ | time|TIMESTAMP| TIME| | status| BOOLEAN| FIELD| |temperature| DOUBLE| FIELD| +-----------+---------+-------+ ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Jdbc.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # JDBC > JDBC sink connector ## Description Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once semantics (using XA transaction guarantee). ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the jdbc driver jar package has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the jdbc driver jar package has been placed in directory `${SEATUNNEL_HOME}/lib/`. 
## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) Use `Xa transactions` to ensure `exactly-once`. `exactly-once` is therefore only supported for databases that support `Xa transactions`. You can set `is_exactly_once=true` to enable it. - [x] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## Options | Name | Type | Required | Default | |-------------------------------------------|---------|----------|------------------------------| | url | String | Yes | - | | driver | String | Yes | - | | username | String | No | - | | password | String | No | - | | query | String | No | - | | compatible_mode | String | No | - | | dialect | String | No | - | | database | String | No | - | | table | String | No | - | | primary_keys | Array | No | - | | connection_check_timeout_sec | Int | No | 30 | | max_retries | Int | No | 0 | | batch_size | Int | No | 1000 | | is_exactly_once | Boolean | No | false | | generate_sink_sql | Boolean | No | false | | xa_data_source_class_name | String | No | - | | max_commit_attempts | Int | No | 3 | | transaction_timeout_sec | Int | No | -1 | | auto_commit | Boolean | No | true | | field_ide | String | No | - | | properties | Map | No | - | | common-options | | No | - | | schema_save_mode | Enum | No | CREATE_SCHEMA_WHEN_NOT_EXIST | | data_save_mode | Enum | No | APPEND_DATA | | custom_sql | String | No | - | | enable_upsert | Boolean | No | true | | use_copy_statement | Boolean | No | false | | create_index | Boolean | No | true | | access_key_id | String | No | | | secret_access_key | String | No | | | region | String | No | | ### driver [string] The jdbc class name used to connect to the remote data source, if you use MySQL the value is `com.mysql.cj.jdbc.Driver`. ### user [string] userName ### password [string] password ### url [string] The URL of the JDBC connection. 
Refer to a case: jdbc:postgresql://localhost/test ### query [string] Use this SQL to write upstream input data to the database, e.g. `INSERT ...` ### compatible_mode [string] The compatible mode of database, required when the database supports multiple compatible modes. For example, when using OceanBase database, you need to set it to 'mysql' or 'oracle'. When using StarRocks, you need to set it to `starrocks`. For Postgres 9.5 or below, please set it to `postgresLow` to support CDC ### dialect [string] The specified dialect. If it is not set, the dialect is derived from the url; when it is set, it takes priority over the url. For example, when using starrocks, you need to set it to `starrocks`. Similarly, when using mysql, you need to set its value to `mysql`. If a dialect is not supported by SeaTunnel, the default dialect `GenericDialect` will be used. Just make sure the driver you provide supports the database you want to connect to. #### dialect list | | Dialect Name | | |-----------|--------------|----------| | Greenplum | DB2 | Dameng | | Gbase8a | HIVE | KingBase | | MySQL | StarRocks | Oracle | | Phoenix | Postgres | Redshift | | SapHana | Snowflake | Sqlite | | SqlServer | Tablestore | Teradata | | Vertica | OceanBase | XUGU | | IRIS | Inceptor | Highgo | | DSQL | | | ### database [string] Use this `database` and `table-name` to auto-generate SQL and write the received upstream input data to the database. This option is mutually exclusive with `query` and has a higher priority. ### table [string] Use `database` and this `table-name` to auto-generate SQL and write the received upstream input data to the database. This option is mutually exclusive with `query` and has a higher priority. The table parameter can be set to the name of a table that does not exist yet, which will eventually be used as the name of the created table, and it supports variables (`${table_name}`, `${schema_name}`). 
Replacement rules: `${schema_name}` will be replaced with the SCHEMA name passed to the target side, and `${table_name}` will be replaced with the table name passed to the target side. mysql sink for example: 1. test_${schema_name}_${table_name}_test 2. sink_sinktable 3. ss_${table_name} pgsql (Oracle Sqlserver ...) Sink for example: 1. ${schema_name}.${table_name}_test 2. dbo.tt_${table_name}_sink 3. public.sink_table Tip: If the target database has the concept of SCHEMA, the table parameter must be written as `xxx.xxx` ### primary_keys [array] This option is used to support operations such as `insert`, `delete`, and `update` when automatically generating SQL. ### connection_check_timeout_sec [int] The time in seconds to wait for the database operation used to validate the connection to complete. ### max_retries [int] The number of retries when a submit (executeBatch) fails ### batch_size [int] For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`, the data will be flushed into the database ### is_exactly_once [boolean] Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to set `xa_data_source_class_name`. ### generate_sink_sql [boolean] Generate sql statements based on the database table you want to write to ### xa_data_source_class_name [string] The xa data source class name of the database Driver, for example, mysql is `com.mysql.cj.jdbc.MysqlXADataSource`, and please refer to appendix for other data sources ### max_commit_attempts [int] The number of retries for transaction commit failures ### transaction_timeout_sec [int] The timeout after the transaction is opened, the default is -1 (never timeout). 
Note that setting the timeout may affect exactly-once semantics ### auto_commit [boolean] Automatic transaction commit is enabled by default ### field_ide [String] The field "field_ide" is used to identify whether the field needs to be converted to uppercase or lowercase when synchronizing from the source to the sink. "ORIGINAL" indicates no conversion is needed, "UPPERCASE" indicates conversion to uppercase, and "LOWERCASE" indicates conversion to lowercase. ### properties Additional connection configuration parameters. When properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ### schema_save_mode [Enum] Before the synchronization task starts, different treatment schemes are selected for the existing table structure of the target side. Option introduction: `RECREATE_SCHEMA` :Will create when the table does not exist, delete and rebuild when the table exists `CREATE_SCHEMA_WHEN_NOT_EXIST` :Will create when the table does not exist, skipped when the table exists `ERROR_WHEN_SCHEMA_NOT_EXIST` :Error will be reported when the table does not exist `IGNORE` :Ignore the treatment of the table ### data_save_mode [Enum] Before the synchronization task starts, different processing schemes are selected for the existing data on the target side. Option introduction: `DROP_DATA`: Preserve database structure and delete data `APPEND_DATA`:Preserve database structure, preserve data `CUSTOM_PROCESSING`:User defined processing `ERROR_WHEN_DATA_EXISTS`:When there is data, an error is reported ### custom_sql [String] When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. ### enable_upsert [boolean] Enable upsert when `primary_keys` exist. If the task has no duplicate-key data, setting this parameter to `false` can speed up data import ### use_copy_statement [boolean] Use `COPY ${table} FROM STDIN` statement to import data. Only drivers with `getCopyAPI()` method connections are supported. e.g.: Postgresql driver `org.postgresql.Driver`. NOTICE: `MAP`, `ARRAY`, `ROW` types are not supported. ### create_index [boolean] Create the index(contains primary key and any other indexes) or not when auto-create table. 
You can use this option to improve the performance of jdbc writes when migrating large tables. Note that this will sacrifice read performance, so you'll need to manually create indexes after the table migration to improve read performance ### access_key_id [String] The access_key_id in AWS authentication. Only valid for dialect="dsql" ### secret_access_key [String] The secret_access_key in AWS authentication. Only valid for dialect="dsql" ### region [String] The area where Amazon Aurora DSQL is located. Only valid for dialect="dsql" ## tips In the case of is_exactly_once = "true", Xa transactions are used. This requires database support, and some databases require some setup: 1 PostgreSQL needs `max_prepared_transactions > 1` to be set, such as `ALTER SYSTEM set max_prepared_transactions to 10`. 2 MySQL version needs to be >= `8.0.29`, and non-root users need to be granted the `XA_RECOVER_ADMIN` permission, such as `grant XA_RECOVER_ADMIN on test_db.* to 'user1'@'%'`. 3 MySQL can try adding the `rewriteBatchedStatements=true` parameter to the url for better performance. ## appendix Here are some reference values for the parameters above. 
| datasource | driver | url | xa_data_source_class_name | maven | |-------------------|----------------------------------------------|---------------------------------------------------------------------|----------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------| | MySQL | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | com.mysql.cj.jdbc.MysqlXADataSource | https://mvnrepository.com/artifact/mysql/mysql-connector-java | | PostgreSQL | org.postgresql.Driver | jdbc:postgresql://localhost:5432/postgres | org.postgresql.xa.PGXADataSource | https://mvnrepository.com/artifact/org.postgresql/postgresql | | DM | dm.jdbc.driver.DmDriver | jdbc:dm://localhost:5236 | dm.jdbc.driver.DmdbXADataSource | https://mvnrepository.com/artifact/com.dameng/DmJdbcDriver18 | | Phoenix | org.apache.phoenix.queryserver.client.Driver | jdbc:phoenix:thin:url=http://localhost:8765;serialization=PROTOBUF | / | https://mvnrepository.com/artifact/com.aliyun.phoenix/ali-phoenix-shaded-thin-client | | SQL Server | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433 | com.microsoft.sqlserver.jdbc.SQLServerXADataSource | https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc | | Oracle | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@localhost:1521/xepdb1 | oracle.jdbc.xa.OracleXADataSource | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | | sqlite | org.sqlite.JDBC | jdbc:sqlite:test.db | / | https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc | | GBase8a | com.gbase.jdbc.Driver | jdbc:gbase://e2e_gbase8aDb:5258/test | / | https://cdn.gbase.cn/products/30/p5CiVwXBKQYIUGN8ecHvk/gbase-connector-java-9.5.0.7-build1-bin.jar | | StarRocks | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | / | https://mvnrepository.com/artifact/mysql/mysql-connector-java | | db2 | com.ibm.db2.jcc.DB2Driver | 
jdbc:db2://localhost:50000/testdb | com.ibm.db2.jcc.DB2XADataSource | https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc/db2jcc4 | | saphana | com.sap.db.jdbc.Driver | jdbc:sap://localhost:39015 | / | https://mvnrepository.com/artifact/com.sap.cloud.db.jdbc/ngdbc | | Doris | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | / | https://mvnrepository.com/artifact/mysql/mysql-connector-java | | teradata | com.teradata.jdbc.TeraDriver | jdbc:teradata://localhost/DBS_PORT=1025,DATABASE=test | / | https://mvnrepository.com/artifact/com.teradata.jdbc/terajdbc | | Redshift | com.amazon.redshift.jdbc42.Driver | jdbc:redshift://localhost:5439/testdb | com.amazon.redshift.xa.RedshiftXADataSource | https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42 | | Snowflake | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | / | https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc | | Vertica | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433 | / | https://repo1.maven.org/maven2/com/vertica/jdbc/vertica-jdbc/12.0.3-0/vertica-jdbc-12.0.3-0.jar | | Kingbase | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | / | https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar | | OceanBase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | / | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.12/oceanbase-client-2.4.12.jar | | xugu | com.xugu.cloudjdbc.Driver | jdbc:xugu://localhost:5138 | / | https://repo1.maven.org/maven2/com/xugudb/xugu-jdbc/12.2.0/xugu-jdbc-12.2.0.jar | | InterSystems IRIS | com.intersystems.jdbc.IRISDriver | jdbc:IRIS://localhost:1972/%SYS | / | https://raw.githubusercontent.com/intersystems-community/iris-driver-distribution/main/JDBC/JDK18/intersystems-jdbc-3.8.4.jar | | opengauss | org.opengauss.Driver | jdbc:opengauss://localhost:5432/postgres | / | 
https://repo1.maven.org/maven2/org/opengauss/opengauss-jdbc/5.1.0-og/opengauss-jdbc-5.1.0-og.jar | | Highgo | com.highgo.jdbc.Driver | jdbc:highgo://localhost:5866/highgo | / | https://repo1.maven.org/maven2/com/highgo/HgdbJdbc/6.2.3/HgdbJdbc-6.2.3.jar | | Dsql | org.postgresql.Driver | jdbc:postgresql://Amazon Aurora DSQL Cluster Endpoint:5432/postgres | org.postgresql.xa.PGXADataSource | https://mvnrepository.com/artifact/org.postgresql/postgresql | ## Example Simple ``` jdbc { url = "jdbc:mysql://localhost:3306/test" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" } ``` Exactly-once Turn on exactly-once semantics by setting `is_exactly_once` ``` jdbc { url = "jdbc:mysql://localhost:3306/test" driver = "com.mysql.cj.jdbc.Driver" max_retries = 0 user = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" is_exactly_once = "true" xa_data_source_class_name = "com.mysql.cj.jdbc.MysqlXADataSource" } ``` CDC(Change data capture) event jdbc receive CDC example ``` sink { jdbc { url = "jdbc:mysql://localhost:3306" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "123456" database = "sink_database" table = "sink_table" primary_keys = ["key1", "key2", ...] } } ``` Add saveMode function To facilitate the creation of tables when they do not already exist, set the `schema_save_mode` to `CREATE_SCHEMA_WHEN_NOT_EXIST`. ``` sink { jdbc { url = "jdbc:mysql://localhost:3306" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "123456" generate_sink_sql = "true" database = "sink_database" table = "sink_table" primary_keys = ["key1", "key2", ...] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` PostgreSQL 9.5 and below: support CDC (Change Data Capture) events For PostgreSQL versions 9.5 and below, set `compatible_mode` to `postgresLow` to enable support for PostgreSQL Change Data Capture (CDC) operations. 
``` sink { jdbc { url = "jdbc:postgresql://localhost:5432" driver = "org.postgresql.Driver" user = "root" password = "123456" compatible_mode="postgresLow" database = "sink_database" table = "sink_table" generate_sink_sql = true primary_keys = ["key1", "key2", ...] } } ``` ### Multiple table #### example1 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { jdbc { url = "jdbc:mysql://localhost:3306" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "123456" generate_sink_sql = true database = "${database_name}_test" table = "${table_name}_test" primary_keys = ["${primary_key}"] } } ``` #### example2 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = oracle.jdbc.driver.OracleDriver url = "jdbc:oracle:thin:@localhost:1521/XE" user = testUser password = testPassword table_list = [ { table_path = "TESTSCHEMA.TABLE_1" }, { table_path = "TESTSCHEMA.TABLE_2" } ] } } transform { } sink { jdbc { url = "jdbc:mysql://localhost:3306" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "123456" generate_sink_sql = true database = "${schema_name}_test" table = "${table_name}_test" primary_keys = ["${primary_key}"] } } ``` #### Dsql example ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = oracle.jdbc.driver.OracleDriver url = "jdbc:oracle:thin:@localhost:1521/XE" user = testUser password = testPassword table_list = [ { table_path = "TESTSCHEMA.TABLE_1" }, { table_path = "TESTSCHEMA.TABLE_2" } ] } } transform { } sink { Jdbc { dialect="Dsql" driver = "org.postgresql.Driver" url="jdbc:postgresql://ixxxxxxxxxxxxx.dsql.us-east-1.on.aws:5432/postgres" username = "admin" access_key_id = "ACCESSKEYIDEXAMPLE" secret_access_key = "SECRETACCESSKEYEXAMPLE" region = "us-east-1" database = 
"postgres" generate_sink_sql = true primary_keys = ["id"] max_retries = 3 batch_size = 1000 } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Kafka.md ================================================ import ChangeLog from '../changelog/connector-kafka.md'; # Kafka > Kafka sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) > By default, we will use 2pc to guarantee the message is sent to kafka exactly once. ## Description Write Rows to a Kafka topic. ## Supported DataSource Info In order to use the Kafka connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Maven | |------------|--------------------|-------------------------------------------------------------------------------------| | Kafka | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-kafka) | ## Sink Options | Name | Type | Required | Default | Description | |-----------------------|--------|----------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | topic | String | Yes | - | When the table is used as sink, the topic name is the topic to write data to. | | bootstrap.servers | String | Yes | - | Comma separated list of Kafka brokers. | | kafka.config | Map | No | - | In addition to the above parameters that must be specified by the `Kafka producer` client, the user can also specify multiple non-mandatory parameters for the `producer` client, covering [all the producer parameters specified in the official Kafka document](https://kafka.apache.org/documentation.html#producerconfigs). | | semantics | String | No | NON | Semantics that can be chosen EXACTLY_ONCE/AT_LEAST_ONCE/NON, default NON. 
| | partition_key_fields | Array | No | - | Configure which fields are used as the key of the kafka message. | | kafka_headers_fields | Array | No | - | Configure which fields are used as the headers of the kafka message. The field value will be converted to a string and used as the header value. | | partition | Int | No | - | We can specify the partition, all messages will be sent to this partition. | | assign_partitions | Array | No | - | We can decide which partition to send based on the content of the message. The function of this parameter is to distribute information. | | transaction_prefix | String | No | - | If `semantics` is specified as EXACTLY_ONCE, the producer will write all messages in a Kafka transaction. Kafka distinguishes different transactions by their transactionId. This parameter is the prefix of the Kafka transactionId; make sure different jobs use different prefixes. | | format | String | No | json | Data format. The default format is json. Other supported formats are text, canal_json, debezium_json, ogg_json, avro and native. If you use json or text format, the default field separator is ", ". If you customize the delimiter, add the "field_delimiter" option. If you use canal format, please refer to [canal-json](../formats/canal-json.md) for details. If you use debezium format, please refer to [debezium-json](../formats/debezium-json.md) for details. | | field_delimiter | String | No | , | Customize the field delimiter for data format. | | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | | protobuf_message_name | String | No | - | Effective when the format is set to protobuf, specifies the Message name | | protobuf_schema | String | No | - | Effective when the format is set to protobuf, specifies the Schema definition | ## Parameter Interpretation ### Topic Formats Currently two formats are supported: 1. Fill in the name of the topic. 2.
Use value of a field from upstream data as topic,the format is `${your field name}`, where topic is the value of one of the columns of the upstream data. For example, Upstream data is the following: | name | age | data | |------|-----|---------------| | Jack | 16 | data-example1 | | Mary | 23 | data-example2 | If `${name}` is set as the topic. So the first row is sent to Jack topic, and the second row is sent to Mary topic. ### Semantics In EXACTLY_ONCE, producer will write all messages in a Kafka transaction that will be committed to Kafka on a checkpoint. In AT_LEAST_ONCE, producer will wait for all outstanding messages in the Kafka buffers to be acknowledged by the Kafka producer on a checkpoint. NON does not provide any guarantees: messages may be lost in case of issues on the Kafka broker and messages may be duplicated. ### Partition Key Fields For example, if you want to use value of fields from upstream data as key, you can assign field names to this property. Upstream data is the following: | name | age | data | |------|-----|---------------| | Jack | 16 | data-example1 | | Mary | 23 | data-example2 | If name is set as the key, then the hash value of the name column will determine which partition the message is sent to. If not set partition key fields, the null message key will be sent to. The format of the message key is json, If name is set as the key, for example '{"name":"Jack"}'. The selected field must be an existing field in the upstream. ### Kafka Headers Fields For example, if you want to use value of fields from upstream data as kafka message headers, you can assign field names to this property. Upstream data is the following: | name | age | data | source | traceId | |------|-----|---------------|--------|-----------| | Jack | 16 | data-example1 | web | trace-123 | | Mary | 23 | data-example2 | mobile | trace-456 | If source and traceId are set as the kafka headers fields, then these field values will be added as headers to the kafka message. 
For example, the first row will have headers: `source=web` and `traceId=trace-123`. The field values will be converted to strings and used as header values. The selected fields must be existing fields in the upstream. Note: Fields configured as Kafka headers will be excluded from the message value (payload) and will only be present in the Kafka message headers. ### Assign Partitions For example, there are five partitions in total, and the assign_partitions field in config is as follows: assign_partitions = ["shoe", "clothing"] Then the message containing "shoe" will be sent to partition zero ,because "shoe" is subscribed as zero in assign_partitions, and the message containing "clothing" will be sent to partition one.For other messages, the hash algorithm will be used to divide them into the remaining partitions. This function by `MessageContentPartitioner` class implements `org.apache.kafka.clients.producer.Partitioner` interface.If we need custom partitions, we need to implement this interface as well. ## Task Example ### Simple > This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to Kafka Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target topic is test_topic will also be 16 rows of data in the topic. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../getting-started/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) to run this job. 
```hocon # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } sink { kafka { topic = "test_topic" bootstrap.servers = "localhost:9092" format = json kafka.request.timeout.ms = 60000 semantics = EXACTLY_ONCE kafka.config = { acks = "all" request.timeout.ms = 60000 buffer.memory = 33554432 } } } ``` ### Using Kafka Headers This example shows how to use kafka_headers_fields to set Kafka message headers: ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" source = "string" traceId = "string" } } } } sink { kafka { topic = "test_topic" bootstrap.servers = "localhost:9092" format = json partition_key_fields = ["name"] kafka_headers_fields = ["source", "traceId"] kafka.request.timeout.ms = 60000 semantics = EXACTLY_ONCE kafka.config = { acks = "all" request.timeout.ms = 60000 buffer.memory = 33554432 } } } ``` ### AWS MSK SASL/SCRAM Replace the following `${username}` and `${password}` with the configuration values in AWS MSK. ```hocon sink { kafka { topic = "seatunnel" bootstrap.servers = "localhost:9092" format = json kafka.request.timeout.ms = 60000 semantics = EXACTLY_ONCE kafka.config = { security.protocol=SASL_SSL sasl.mechanism=SCRAM-SHA-512 sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required \nusername=${username}\npassword=${password};" } } } ``` ### AWS MSK IAM Download `aws-msk-iam-auth-1.1.5.jar` from https://github.com/aws/aws-msk-iam-auth/releases and put it in `$SEATUNNEL_HOME/plugin/kafka/lib` dir. Please ensure the IAM policy have `"kafka-cluster:Connect",`. 
Like this: ```hocon "Effect": "Allow", "Action": [ "kafka-cluster:Connect", "kafka-cluster:AlterCluster", "kafka-cluster:DescribeCluster" ], ``` Sink Config ```hocon sink { kafka { topic = "seatunnel" bootstrap.servers = "localhost:9092" format = json kafka.request.timeout.ms = 60000 semantics = EXACTLY_ONCE kafka.config = { security.protocol=SASL_SSL sasl.mechanism=AWS_MSK_IAM sasl.jaas.config="software.amazon.msk.auth.iam.IAMLoginModule required;" sasl.client.callback.handler.class="software.amazon.msk.auth.iam.IAMClientCallbackHandler" } } } ``` ### Kerberos Authentication Example Please set JVM parameters `java.security.krb5.conf` before starting the SeaTunnel or update default `krb5.conf` in `/etc/krb5.conf`. Sink Config ``` sink { Kafka { topic = "seatunnel" bootstrap.servers = "127.0.0.1:9092" format = json semantics = EXACTLY_ONCE kafka.config = { security.protocol=SASL_PLAINTEXT sasl.kerberos.service.name=kafka sasl.mechanism=GSSAPI sasl.jaas.config="com.sun.security.auth.module.Krb5LoginModule required \n useKeyTab=true \n storeKey=true \n keyTab=\"/path/to/xxx.keytab\" \n principal=\"user@xxx.com\";" } } } ``` ### Protobuf Configuration Set the `format` to `protobuf` and configure the `protobuf` data structure using the `protobuf_message_name` and `protobuf_schema` parameters. 
Example Usage: ```hocon sink { kafka { topic = "test_protobuf_topic_fake_source" bootstrap.servers = "kafkaCluster:9092" format = protobuf kafka.request.timeout.ms = 60000 kafka.config = { acks = "all" request.timeout.ms = 60000 buffer.memory = 33554432 } protobuf_message_name = Person protobuf_schema = """ syntax = "proto3"; package org.apache.seatunnel.format.protobuf; option java_outer_classname = "ProtobufE2E"; message Person { int32 c_int32 = 1; int64 c_int64 = 2; float c_float = 3; double c_double = 4; bool c_bool = 5; string c_string = 6; bytes c_bytes = 7; message Address { string street = 1; string city = 2; string state = 3; string zip = 4; } Address address = 8; map attributes = 9; repeated string phone_numbers = 10; } """ } } ``` ### format If you need to write Kafka's native information, you can refer to the following configuration. Config Example: ```hocon sink { kafka { topic = "test_topic_native_sink" bootstrap.servers = "kafkaCluster:9092" format = "NATIVE" } } ``` The input parameter requirements are as follows: ```json { "headers": { "header1": "header1", "header2": "header2" }, "key": "dGVzdF9ieXRlc19kYXRh", "partition": 3, "timestamp": 1672531200000, "timestampType": "CREATE_TIME", "value": "dGVzdF9ieXRlc19kYXRh" } ``` Note:key/value is of type byte[]. ## Changelog ================================================ FILE: docs/en/connectors/sink/Kingbase.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Kingbase > JDBC Kingbase Sink Connector ## Support Connector Version - 8.6 ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) ## Description > Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is > support `Xa transactions`. You can set `is_exactly_once=true` to enable it.Kingbase currently does not support ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------|--------------------|----------------------|------------------------------------------|------------------------------------------------------------------------------------------------| | Kingbase | 8.6 | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | [Download](https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar) | ## Database Dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' > working directory
> For example: cp kingbase8-8.6.0.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping | Kingbase Data Type | SeaTunnel Data Type | |----------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------| | BOOL | BOOLEAN | | INT2 | SHORT | | SMALLSERIAL
SERIAL
INT4 | INT | | INT8
BIGSERIAL | BIGINT | | FLOAT4 | FLOAT | | FLOAT8 | DOUBLE | | NUMERIC | DECIMAL((Get the designated column's specified column size),
(Gets the designated column's number of digits to right of the decimal point.))) | | BPCHAR
CHARACTER
VARCHAR
TEXT | STRING | | TIMESTAMP | LOCALDATETIME | | TIME | LOCALTIME | | DATE | LOCALDATE | | Other data type | Not supported yet | ## Sink Options | Name | Type | Required | Default | Description | |-------------------------------------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:db2://127.0.0.1:50000/dbname | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use Kingbase the value is `com.kingbase8.Driver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | No | - | Use this SQL to write upstream input data to the database, e.g. `INSERT ...`. `query` has the higher priority | | database | String | No | - | Use this `database` and `table-name` to auto-generate SQL and write the received upstream input data to the database.
This option is mutually exclusive with `query` and has a higher priority. | | table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | | max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | | batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | | is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. Kingbase currently does not support this. | | generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | | xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver. Kingbase currently does not support this. | | max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | | transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | | auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | | common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | | enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import | ### Tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed > in parallel according to the concurrency of tasks. ## Task Example ### Simple > This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends > it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having 12 fields. The final target table is test_table will also be 16 rows of data in the table. > Before > run this job, you need create database test and table test_table in your Kingbase. And if you have not yet installed and > deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../getting-started/locally/deployment.md) > to > install and deploy SeaTunnel. And then follow the instructions > in [Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) to run this job. 
``` # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_date = date c_time = time c_timestamp = timestamp } } } # If you would like to get more information about how to configure seatunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/source } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2 } sink { jdbc { url = "jdbc:kingbase8://127.0.0.1:54321/dbname" driver = "com.kingbase8.Driver" username = "root" password = "123456" query = "insert into test_table(c_string,c_boolean,c_tinyint,c_smallint,c_int,c_bigint,c_float,c_double,c_decimal,c_date,c_time,c_timestamp) values(?,?,?,?,?,?,?,?,?,?,?,?)" } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink } ``` ### Generate Sink SQL > This example not need to write complex sql statements, you can configure the database name table name to automatically > generate add statements for you ``` sink { jdbc { url = "jdbc:kingbase8://127.0.0.1:54321/dbname" driver = "com.kingbase8.Driver" username = "root" password = "123456" # Automatically generate sql statements based on database table names generate_sink_sql = true database = test table = test_table } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Kudu.md ================================================ import ChangeLog from 
'../changelog/connector-kudu.md'; # Kudu > Kudu sink connector ## Support Kudu Version - 1.11.1/1.12.0/1.13.0/1.14.0/1.15.0 ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## Data Type Mapping | SeaTunnel Data Type | Kudu Data Type | |---------------------|--------------------------| | BOOLEAN | BOOL | | INT | INT8
INT16
INT32 | | BIGINT | INT64 | | DECIMAL | DECIMAL | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | STRING | STRING | | TIMESTAMP | UNIXTIME_MICROS | | BYTES | BINARY | ## Sink Options | Name | Type | Required | Default | Description | |-------------------------------------------|--------|----------|------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------| | kudu_masters | String | Yes | - | Kudu master address. Separated by ',',such as '192.168.88.110:7051'. | | table_name | String | Yes | - | The name of kudu table. | | client_worker_count | Int | No | 2 * Runtime.getRuntime().availableProcessors() | Kudu worker count. Default value is twice the current number of cpu cores. | | client_default_operation_timeout_ms | Long | No | 30000 | Kudu normal operation time out. | | client_default_admin_operation_timeout_ms | Long | No | 30000 | Kudu admin operation time out. | | enable_kerberos | Bool | No | false | Kerberos principal enable. | | kerberos_principal | String | No | - | Kerberos principal. Note that all zeta nodes require have this file. | | kerberos_keytab | String | No | - | Kerberos keytab. Note that all zeta nodes require have this file. | | kerberos_krb5conf | String | No | - | Kerberos krb5 conf. Note that all zeta nodes require have this file. | | save_mode | String | No | - | Storage mode, support `overwrite` and `append`. | | session_flush_mode | String | No | AUTO_FLUSH_SYNC | Kudu flush mode. Default AUTO_FLUSH_SYNC. | | batch_size | Int | No | 1024 | The flush max size (includes all append, upsert and delete records), over this number of records, will flush data. The default value is 1024 | | buffer_flush_interval | Int | No | 10000 | The flush interval in milliseconds, over this time, asynchronous threads will flush data. | | ignore_not_found | Bool | No | false | If true, ignore all not found rows.
| | ignore_not_duplicate | Bool | No | false | If true, ignore all duplicate rows. | | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. | ## Task Example ### Simple > The following example writes CDC events from a FakeSource whose output is named "kudu" into the Kudu table "kudu_sink_table" ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { plugin_output = "kudu" schema = { fields { id = int val_bool = boolean val_int8 = tinyint val_int16 = smallint val_int32 = int val_int64 = bigint val_float = float val_double = double val_decimal = "decimal(16, 1)" val_string = string val_unixtime_micros = timestamp } } rows = [ { kind = INSERT fields = [1, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] }, { kind = INSERT fields = [2, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] }, { kind = INSERT fields = [3, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] }, { kind = UPDATE_BEFORE fields = [1, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] }, { kind = UPDATE_AFTER fields = [1, true, 2, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] }, { kind = DELETE fields = [2, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] } ] } } sink { kudu{ plugin_input = "kudu" kudu_masters = "kudu-master-cdc:7051" table_name = "kudu_sink_table" enable_kerberos = true kerberos_principal = "xx@xx.COM" kerberos_keytab = "xx.keytab" } } ``` ### Multiple table #### example1 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { kudu{ kudu_masters = "kudu-master-cdc:7051" table_name = "${database_name}_${table_name}_test" } } ``` #### example2 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Jdbc {
driver = oracle.jdbc.driver.OracleDriver url = "jdbc:oracle:thin:@localhost:1521/XE" user = testUser password = testPassword table_list = [ { table_path = "TESTSCHEMA.TABLE_1" }, { table_path = "TESTSCHEMA.TABLE_2" } ] } } transform { } sink { kudu{ kudu_masters = "kudu-master-cdc:7051" table_name = "${schema_name}_${table_name}_test" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Lance.md ================================================ import ChangeLog from '../changelog/connector-lance.md'; # Lance > Lance sink connector ## Support Those Engines > Spark(not support version under spark 3.4, reference https://lance.org/integrations/spark/install/#scala)
> Flink(not support, reference https://github.com/lance-format/lance-flink)
> SeaTunnel Zeta
## Description Sink connector for Lance format. It supports creating and writing datasets, and uses a Lance namespace to manage schema and version. ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Using Dependency com.lancedb lance-core 0.33.0 com.lancedb lance-namespace-core 0.0.14 ## Sink Options | Name | Type | Required | Default | Description | |-----------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------------| | dataset_path | string | yes | /tmp | The dataset path for the Lance sink connection. | | namespace_type | string | yes | dir | The namespace type of the Lance dataset. Currently only DirectoryNamespace is supported; the default is "dir" | | table | string | yes | test | The name of the Lance dataset. If not set, the dataset name defaults to test | | namespace_id | string | no | - | The id of the lance namespace. Please refer to https://lance.org/format/namespace/ | ## Data Type Mapping The data type of lance depends on the Arrow data type system | SeaTunnel Data type | Lance Data type | |---------------------|-----------------| | BOOLEAN | bool/boolean | | TINYINT | int8 | | SMALLINT | int16 | | INT | int32 | | BIGINT | int64 | | FLOAT | float16 | | DOUBLE | float32 | | BYTES | binary | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | STRING | string/utf8 | ## Task Example ### Simple ```hocon env { parallelism = 1 job.mode = "BATCH" # You can set spark configuration here spark.app.name = "SeaTunnel" spark.executor.instances = 2 spark.executor.cores = 1 spark.executor.memory = "1g" spark.master = local } source { FakeSource { row.num = 100 schema = { fields { c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_bytes = bytes c_date = date c_timestamp = timestamp } } plugin_output =
"fake" } } transform { } sink { Lance { dataset_path = "/tmp/seatunnel_mnt/lanceTest/lance_sink_table" namespace_type = "dir" namespace_id = "root" table = "lance_sink_table" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/LocalFile.md ================================================ import ChangeLog from '../changelog/connector-file-local.md'; # LocalFile > Local file sink connector ## Description Output data to local file. :::tip If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. ::: ## Key Features - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. 
- [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) By default, we use 2PC commit to ensure `exactly-once` - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] canal_json - [x] debezium_json - [x] maxwell_json ## Options | Name | Type | Required | Default | Description | |---------------------------------------|---------|----------|--------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | path | string | yes | - | | | tmp_path | string | no | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. | | custom_filename | boolean | no | false | Whether you need custom the filename | | file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | | filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | | file_format_type | string | no | "csv" | | | filename_extension | string | no | - | Override the default file name extensions with custom file name extensions. E.g. `.xml`, `.json`, `dat`, `.customtype` | | field_delimiter | string | no | '\001' for text and ',' for csv | Only used when file_format_type is text and csv | | row_delimiter | string | no | "\n" | Only used when file_format_type is `text`, `csv` and `json` | | have_partition | boolean | no | false | Whether you need processing partitions. 
| | partition_by | array | no | - | Only used when have_partition is true | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used when have_partition is true | | is_partition_field_write_in_file | boolean | no | false | Only used when have_partition is true | | sink_columns | array | no | | When this parameter is empty, all fields are sink columns | | is_enable_transaction | boolean | no | true | | | batch_size | int | no | 1000000 | | | compress_codec | string | no | none | | | common-options | object | no | - | | | max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | | sheet_max_rows | int | no | 1048576 | Only used when file_format_type is excel. | | sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. | | csv_string_quote_mode | enum | no | MINIMAL | Only used when file_format_type is csv. | | xml_root_tag | string | no | RECORDS | Only used when file_format_type is xml. | | xml_row_tag | string | no | RECORD | Only used when file_format_type is xml. | | xml_use_attr_format | boolean | no | - | Only used when file_format_type is xml. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | | create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format_type is parquet. | | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format_type is parquet. | | enable_header_write | boolean | no | false | Only used when file_format_type is text,csv.
false:don't write header,true:write header. | | encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | | schema_save_mode | string | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Existing dir processing method | | data_save_mode | string | no | APPEND_DATA | Existing data processing method | | merge_update_event | boolean | no | false | Only used when file_format_type is canal_json,debezium_json or maxwell_json. When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data | ### path [string] The target dir path is required, you can inject the upstream CatalogTable into the path by using: `${database_name}`, `${table_name}` and `${schema_name}`. ### custom_filename [boolean] Whether custom the filename ### file_name_expression [string] Only used when `custom_filename` is `true` `file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, `${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. ### filename_time_format [string] Only used when `custom_filename` is `true` When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . 
The commonly used time formats are listed as follows: | Symbol | Description | |--------|--------------------| | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | ### file_format_type [string] The following file types are supported: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `canal_json` `debezium_json` `maxwell_json` Please note that the final file name will end with the file_format_type's suffix; the suffix of the text file is `txt`. ### field_delimiter [string] The separator between columns in a row of data. Only needed by `text` and `csv` file format. ### row_delimiter [string] The separator between rows in a file. Only needed by `text`, `csv` and `json` file format. ### have_partition [boolean] Whether you need to process partitions. ### partition_by [array] Only used when `have_partition` is `true`. Partition data based on selected fields. ### partition_dir_expression [string] Only used when `have_partition` is `true`. If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. ### is_partition_field_write_in_file [boolean] Only used when `have_partition` is `true`. If `is_partition_field_write_in_file` is `true`, the partition field and its value will be written into the data file. For example, if you want to write a Hive Data File, its value should be `false`. ### sink_columns [array] Which columns need to be written to the file; the default value is all of the columns obtained from `Transform` or `Source`. The order of the fields determines the order in which the file is actually written.
### is_enable_transaction [boolean] If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. Please note that if `is_enable_transaction` is `true`, we will automatically add `${transactionId}_` to the head of the file name. Only `true` is supported now. ### batch_size [int] The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is jointly determined by `batch_size` and `checkpoint.interval`. If the value of `checkpoint.interval` is large enough, the sink writer will write rows to a file until the number of rows in the file exceeds `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint is triggered. ### compress_codec [string] The compress codec of files. The supported codecs for each format are shown below: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc: `lzo` `snappy` `lz4` `zlib` `none` - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` Tips: excel type does not support any compression format ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ### max_rows_in_memory [int] When the file format is Excel, the maximum number of data items that can be cached in memory. ### sheet_max_rows [int] When file format is Excel, the maximum number of rows per sheet. ### sheet_name [string] The name of the sheet to write in the workbook. ### csv_string_quote_mode [string] When the file format is CSV, the string quote mode of CSV. - ALL: All String fields will be quoted. - MINIMAL: Quotes fields which contain special characters such as the field delimiter, quote character or any of the characters in the line separator string. - NONE: Never quotes fields. When the delimiter occurs in data, the printer prefixes it with the escape character. If the escape character is not set, format validation throws an exception.
### xml_root_tag [string] Specifies the tag name of the root element within the XML file. ### xml_row_tag [string] Specifies the tag name of the data rows within the XML file. ### xml_use_attr_format [boolean] Specifies whether to process data using the tag attribute format. ### parquet_avro_write_timestamp_as_int96 [boolean] Support writing Parquet INT96 from a timestamp, only valid for parquet files. ### parquet_avro_write_fixed_as_int96 [array] Support writing Parquet INT96 from a 12-byte field, only valid for parquet files. ### enable_header_write [boolean] Only used when file_format_type is text or csv. `false`: do not write the header; `true`: write the header. ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`. ### schema_save_mode [string] Existing dir processing method. - RECREATE_SCHEMA: will create when the dir does not exist, delete and recreate when the dir exists - CREATE_SCHEMA_WHEN_NOT_EXIST: will create when the dir does not exist, skipped when the dir exists - ERROR_WHEN_SCHEMA_NOT_EXIST: error will be reported when the dir does not exist - IGNORE: Ignore the treatment of the table ### data_save_mode [string] Existing data processing method. - DROP_DATA: preserve dir and delete data files - APPEND_DATA: preserve dir, preserve data files - ERROR_WHEN_DATA_EXISTS: when there are data files, an error is reported ### merge_update_event [boolean] Only used when file_format_type is canal_json,debezium_json or maxwell_json.
When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data ## Example For orc file format simple config ```bash LocalFile { path = "/tmp/hive/warehouse/test2" file_format_type = "orc" } ``` For json, text, csv or xml file format with `encoding` ```hocon LocalFile { path = "/tmp/hive/warehouse/test2" file_format_type = "text" encoding = "gbk" } ``` For parquet file format with `sink_columns` ```bash LocalFile { path = "/tmp/hive/warehouse/test2" file_format_type = "parquet" sink_columns = ["name","age"] } ``` For text file format with `have_partition` and `custom_filename` and `sink_columns` ```bash LocalFile { path = "/tmp/hive/warehouse/test2" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true } ``` For excel file format with `sheet_name` and `max_rows_in_memory` ```bash LocalFile { path="/tmp/seatunnel/excel" sheet_name = "Sheet1" max_rows_in_memory = 1024 partition_dir_expression="${k0}=${v0}" is_partition_field_write_in_file=true file_name_expression="${transactionId}_${now}" file_format_type="excel" filename_time_format="yyyy.MM.dd" is_enable_transaction=true schema_save_mode=RECREATE_SCHEMA data_save_mode=DROP_DATA } ``` For extract source metadata from upstream, you can use `${database_name}`, `${table_name}` and `${schema_name}` in the path. 
```bash LocalFile { path = "/tmp/hive/warehouse/${table_name}" file_format_type = "parquet" sink_columns = ["name","age"] } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Maxcompute.md ================================================ import ChangeLog from '../changelog/connector-maxcompute.md'; # Maxcompute > Maxcompute sink connector ## Description Used to write data to Maxcompute. ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |----------------|---------|----------|---------------| | accessId | string | yes | - | | accesskey | string | yes | - | | endpoint | string | yes | - | | project | string | yes | - | | table_name | string | yes | - | | partition_spec | string | no | - | | overwrite | boolean | no | false | | insert_strategy| string | no | upload | | common-options | string | no | | ### accessId [string] `accessId` Your Maxcompute accessId, which can be obtained from Alibaba Cloud. ### accesskey [string] `accesskey` Your Maxcompute accessKey, which can be obtained from Alibaba Cloud. ### endpoint [string] `endpoint` Your Maxcompute endpoint, starting with http. ### project [string] `project` Your Maxcompute project which is created in Alibaba Cloud. ### table_name [string] `table_name` Target Maxcompute table name, e.g. fake. ### partition_spec [string] `partition_spec` The partition spec of the Maxcompute partitioned table, e.g. ds='20220101'. ### overwrite [boolean] `overwrite` Whether to overwrite the table or partition, default: false. ### save_mode_create_template We use templates to automatically create MaxCompute tables, which will create corresponding table creation statements based on the type of upstream data and schema type, and the default template can be modified according to the situation. This only works in multi-table mode for now.
Default template: ```sql CREATE TABLE IF NOT EXISTS `${table}` ( ${rowtype_fields} ) COMMENT '${comment}'; ``` If a custom field is filled in the template, such as adding an `id` field ```sql CREATE TABLE IF NOT EXISTS `${table}` ( id, ${rowtype_fields} ) COMMENT '${comment}'; ``` The connector will automatically obtain the corresponding type from the upstream to complete the filling, and remove the id field from `rowtype_fields`. This method can be used to customize the modification of field types and attributes. You can use the following placeholders - database: Used to get the database in the upstream schema - table_name: Used to get the table name in the upstream schema - rowtype_fields: Used to get all the fields in the upstream schema, we will automatically map to the field description of MaxCompute - rowtype_primary_key: Used to get the primary key in the upstream schema (maybe a list) - rowtype_unique_key: Used to get the unique key in the upstream schema (maybe a list) - comment: Used to get the table comment in the upstream schema ### schema_save_mode [Enum] Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. Option introduction: `RECREATE_SCHEMA` :Will create when the table does not exist, delete and rebuild when the table is saved. If the `partition_spec` is set, the partition will be deleted and rebuilt. `CREATE_SCHEMA_WHEN_NOT_EXIST` :Will Created when the table does not exist, skipped when the table is saved. If the `partition_spec` is set, the partition will be created. `ERROR_WHEN_SCHEMA_NOT_EXIST` :Error will be reported when the table does not exist `IGNORE` :Ignore the treatment of the table ### data_save_mode [Enum] Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. 
Option introduction: `DROP_DATA`: Preserve database structure and delete data `APPEND_DATA`:Preserve database structure, preserve data `CUSTOM_PROCESSING`:User defined processing `ERROR_WHEN_DATA_EXISTS`:When there is data, an error is reported ### custom_sql [String] When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. ### datetime_format [String] User-defined format string used to convert LocalDateTime fields to strings. Use this option when you want to specify a custom datetime format that matches one of the predefined values in DateTimeUtils.Formatter (e.g. yyyy-MM-dd HH:mm:ss, yyyyMMddHHmmss, etc.). Example values: - `yyyy-MM-dd HH:mm:ss` - `yyyy-MM-dd HH:mm:ss.SSSSSS` - `yyyy.MM.dd HH:mm:ss` - `yyyy/MM/dd HH:mm:ss` - `yyyy/M/d HH:mm` - `yyyy-M-d HH:mm` - `yyyy/M/d HH:mm:ss` - `yyyy-M-d HH:mm:ss` - `yyyyMMddHHmmss` Default: `yyyy-MM-dd HH:mm:ss` ### tunnel_endpoint [String] Specifies the custom endpoint URL for the MaxCompute Tunnel service. By default, the endpoint is automatically inferred from the configured region. This option allows you to override the default behavior and use a custom Tunnel endpoint. If not specified, the connector will use the region-based default Tunnel endpoint. In general, you do **not** need to set tunnel_endpoint. It is only needed for custom networking, debugging, or local development. Example values: - `https://dt.cn-hangzhou.maxcompute.aliyun.com` - `https://dt.ap-southeast-1.maxcompute.aliyun.com` - `http://maxcompute:8080` Default: Not set (auto-inferred from region) ### insert_strategy [string] If `insert_strategy` is set to `upload`, insert operations use an upload session. If set to `upsert`, insert operations use an upsert session. Upsert sessions require a primary key. 
**Note**: Using upload sessions for insert operations alongside update or delete operations may cause insert records to appear in the table later than expected. When a primary key is present, it is recommended to set `insert_strategy` to `upsert` to ensure consistent upsert behavior. ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ## Examples ```hocon sink { Maxcompute { accessId="" accesskey="" endpoint="" project="" table_name="" #partition_spec="" #overwrite = false } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Milvus.md ================================================ import ChangeLog from '../changelog/connector-milvus.md'; # Milvus > Milvus sink connector ## Description This Milvus sink connector write data to Milvus or Zilliz Cloud, it has the following features: - support read and write data by partition - support write dynamic schema data from Metadata Column - json data will be converted to json string and sink as json as well - retry automatically to bypass ratelimit and grpc limit ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) ## Data Type Mapping | Milvus Data Type | SeaTunnel Data Type | |---------------------|---------------------| | INT8 | TINYINT | | INT16 | SMALLINT | | INT32 | INT | | INT64 | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | BOOL | BOOLEAN | | JSON | STRING | | ARRAY | ARRAY | | VARCHAR | STRING | | FLOAT_VECTOR | FLOAT_VECTOR | | BINARY_VECTOR | BINARY_VECTOR | | FLOAT16_VECTOR | FLOAT16_VECTOR | | BFLOAT16_VECTOR | BFLOAT16_VECTOR | | SPARSE_FLOAT_VECTOR | SPARSE_FLOAT_VECTOR | ## Sink Options | Name | Type | Required | Default | Description | 
|------------------------|---------------------|----------|------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL to connect to Milvus or Zilliz Cloud. | | token | String | Yes | - | User:password | | database | String | No | - | Write data to which database, default is source database. | | schema_save_mode | enum | No | CREATE_SCHEMA_WHEN_NOT_EXIST | Auto create table when table not exist. | | enable_auto_id | boolean | No | false | Primary key column enable autoId. | | enable_upsert | boolean | No | false | Upsert data not insert. | | enable_dynamic_field | boolean | No | true | Enable create table with dynamic field. | | batch_size | int | No | 1000 | Write batch size. When the number of buffered records reaches `batch_size` or the time reaches `checkpoint.interval`, it will trigger a write flush | | partition_key | String | No | | Milvus partition key field | | create_index | boolean | No | false | Automatically create vector indexes for collection to improve query performance. | | load_collection | boolean | No | false | Load collection into Milvus memory for immediate query availability. | | collection_description | Map | No | {} | Collection descriptions map where key is collection name and value is description. 
| ## Task Example ### Basic Configuration ```bash sink { Milvus { url = "http://127.0.0.1:19530" token = "username:password" batch_size = 1000 } } ``` ### Advanced Configuration with Index and Loading ```bash sink { Milvus { url = "http://127.0.0.1:19530" token = "username:password" batch_size = 1000 create_index = true load_collection = true collection_description = { "user_vectors" = "User embedding vectors for recommendation" "product_vectors" = "Product feature vectors for search" } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/MongoDB.md ================================================ import ChangeLog from '../changelog/connector-mongodb.md'; # MongoDB > MongoDB Sink Connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) **Tips** > 1.If you want to use CDC-written features, recommend enable the upsert-enable configuration. ## Description The MongoDB Connector provides the ability to read and write data from and to MongoDB. This document describes how to set up the MongoDB connector to run data writers against MongoDB. ## Supported DataSource Info In order to use the Mongodb connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Dependency | |------------|--------------------|---------------------------------------------------------------------------------------| | MongoDB | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-mongodb) | ## Data Type Mapping The following table lists the field data type mapping from MongoDB BSON type to Seatunnel data type. | Seatunnel Data Type | MongoDB BSON Type | |---------------------|-------------------| | STRING | ObjectId | | STRING | String | | BOOLEAN | Boolean | | BINARY | Binary | | INTEGER | Int32 | | TINYINT | Int32 | | SMALLINT | Int32 | | BIGINT | Int64 | | DOUBLE | Double | | FLOAT | Double | | DECIMAL | Decimal128 | | Date | Date | | Timestamp | Timestamp[Date] | | ROW | Object | | ARRAY | Array | **Tips** > 1.When using SeaTunnel to write Date and Timestamp types to MongoDB, both will produce a Date data type in MongoDB, but the precision will be different. The data generated by the SeaTunnel Date type has second-level precision, while the data generated by the SeaTunnel Timestamp type has millisecond-level precision.
> 2.When using the DECIMAL type in SeaTunnel, be aware that the maximum range cannot exceed 34 digits, which means you should use decimal(34, 18).
## Sink Options | Name | Type | Required | Default | Description | |-----------------------|----------|----------|--------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | uri | String | Yes | - | The MongoDB standard connection uri. eg. mongodb://user:password@hosts:27017/database?readPreference=secondary&slaveOk=true. | | database | String | Yes | - | The name of the MongoDB database to read or write to. When configuring multiple tables at the source, you can use `${database_name}` as a placeholder, for example: `database = "${database_name}_test_database"` . | | collection | String | Yes | - | The name of the MongoDB collection to read or write. When configuring multiple tables at the source end, you can use `${table_name}`,`${schema_name}`,`${table_name}` as placeholders, for example: `collection = "${database_name}_${schema_name}_${table_name}_check"` | | buffer-flush.max-rows | String | No | 1000 | Specifies the maximum number of buffered rows per batch request. | | buffer-flush.interval | String | No | 30000 | Specifies the maximum interval of buffered rows per batch request, the unit is millisecond. | | retry.max | String | No | 3 | Specifies the max number of retry if writing records to database failed. | | retry.interval | Duration | No | 1000 | Specifies the retry time interval if writing records to database failed, the unit is millisecond. | | upsert-enable | Boolean | No | false | Whether to write documents via upsert mode. | | primary-key | List | No | - | The primary keys for upsert/update. Keys are in `["id","name",...]` format for properties. | | transaction | Boolean | No | false | Whether to use transactions in MongoSink (requires MongoDB 4.2+). 
| | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | | data_save_mode | String | No | APPEND_DATA | The data saving mode of MongoDB. Options: `DROP_DATA`: the collection will be cleared before inserting data; `APPEND_DATA`: append data; `ERROR_WHEN_DATA_EXISTS`: an error will be reported if there is data in the collection. | ### Tips > 1.The data flushing logic of the MongoDB Sink Connector is jointly controlled by three parameters: `buffer-flush.max-rows`, `buffer-flush.interval`, and `checkpoint.interval`.
> Data flushing will be triggered if any of these conditions are met.
> 2.Compatible with the historical parameter `upsert-key`. If `upsert-key` is set, please do not set `primary-key`.
## How to Create a MongoDB Data Synchronization Jobs The following example demonstrates how to create a data synchronization job that writes randomly generated data to a MongoDB database: ```bash # Set the basic configuration of the task to be performed env { parallelism = 1 job.mode = "BATCH" checkpoint.interval = 1000 } source { FakeSource { row.num = 2 bigint.min = 0 bigint.max = 10000000 split.num = 1 split.read-interval = 300 schema { fields { c_bigint = bigint } } } } sink { MongoDB{ uri = mongodb://user:password@127.0.0.1:27017 database = "test" collection = "test" } } ``` ## Parameter Interpretation ### MongoDB Database Connection URI Examples Unauthenticated single node connection: ```bash mongodb://127.0.0.0:27017/mydb ``` Replica set connection: ```bash mongodb://127.0.0.0:27017/mydb?replicaSet=xxx ``` Authenticated replica set connection: ```bash mongodb://admin:password@127.0.0.0:27017/mydb?replicaSet=xxx&authSource=admin ``` Multi-node replica set connection: ```bash mongodb://127.0.0..1:27017,127.0.0..2:27017,127.0.0.3:27017/mydb?replicaSet=xxx ``` Sharded cluster connection: ```bash mongodb://127.0.0.0:27017/mydb ``` Multiple mongos connections: ```bash mongodb://192.168.0.1:27017,192.168.0.2:27017,192.168.0.3:27017/mydb ``` Note: The username and password in the URI must be URL-encoded before being concatenated into the connection string. ### Buffer Flush ```bash sink { MongoDB { uri = "mongodb://user:password@127.0.0.1:27017" database = "test_db" collection = "users" buffer-flush.max-rows = 2000 buffer-flush.interval = 1000 } } ``` ### Why is Not Recommended to Use Transactions for Operation? Although MongoDB has fully supported multi-document transactions since version 4.2, it doesn't mean that everyone should use them recklessly. Transactions are equivalent to locks, node coordination, additional overhead, and performance impact. Instead, the principle for using transactions should be: avoid using them if possible. 
The need for transactions can largely be avoided by designing the system sensibly. ### Idempotent Writes By specifying a clear primary key and using the upsert method, exactly-once write semantics can be achieved. If `primary-key` and `upsert-enable` are defined in the configuration, the MongoDB sink will use upsert semantics instead of regular INSERT statements. We combine the primary keys declared in `primary-key` as the MongoDB reserved primary key and use upsert mode for writing to ensure idempotent writes. In the event of a failure, SeaTunnel jobs will recover from the last successful checkpoint and reprocess, which may result in duplicate message processing during recovery. It is highly recommended to use upsert mode, as it helps to avoid violating database primary key constraints and generating duplicate data if records need to be reprocessed. ```bash sink { MongoDB { uri = "mongodb://user:password@127.0.0.1:27017" database = "test_db" collection = "users" upsert-enable = true primary-key = ["name","status"] } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Mysql.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # MySQL > JDBC Mysql Sink Connector ## Support Mysql Version - 5.5/5.6/5.7/8.0/8.1/8.2/8.3/8.4 ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once semantics (using XA transaction guarantee). ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) > Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is > support `Xa transactions`. You can set `is_exactly_once=true` to enable it. ## Supported DataSource Info | Datasource | Supported Versions | Driver | Url | Maven | |------------|----------------------------------------------------------|--------------------------|---------------------------------------|---------------------------------------------------------------------------| | Mysql | Different dependency version has different driver class. | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306:3306/test | [Download](https://mvnrepository.com/artifact/mysql/mysql-connector-java) | ## Data Type Mapping | Mysql Data Type | SeaTunnel Data Type | |-----------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| | BIT(1)
TINYINT(1) | BOOLEAN | | TINYINT
TINYINT UNSIGNED
SMALLINT
SMALLINT UNSIGNED
MEDIUMINT
MEDIUMINT UNSIGNED
INT
INTEGER
YEAR | INT | | INT UNSIGNED
INTEGER UNSIGNED
BIGINT | BIGINT | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | | DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | | DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to right of the decimal point.))) | | FLOAT
FLOAT UNSIGNED | FLOAT | | DOUBLE
DOUBLE UNSIGNED | DOUBLE | | CHAR
VARCHAR
TINYTEXT
MEDIUMTEXT
TEXT
LONGTEXT
JSON | STRING | | DATE | DATE | | TIME | TIME | | DATETIME
TIMESTAMP | TIMESTAMP | | TINYBLOB
MEDIUMBLOB
BLOB
LONGBLOB
BINARY
VARBINARY
BIT(n) | BYTES | | GEOMETRY
UNKNOWN | Not supported yet | ## Sink Options | Name | Type | Required | Default | Description | |-------------------------------------------|---------|----------|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:mysql://localhost:3306:3306/test | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use MySQL the value is `com.mysql.cj.jdbc.Driver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority | | database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | | max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | | batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | | is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | | generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | | xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, mysql is `com.mysql.cj.jdbc.MysqlXADataSource`, and
please refer to appendix for other data sources | | max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | | transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | | auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | | field_ide | String | No | - | Identify whether the field needs to be converted when synchronizing from the source to the sink. `ORIGINAL` indicates no conversion is needed;`UPPERCASE` indicates conversion to uppercase;`LOWERCASE` indicates conversion to lowercase. | | properties | Map | No | - | Additional connection configuration parameters,when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | | schema_save_mode | Enum | No | CREATE_SCHEMA_WHEN_NOT_EXIST | Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. | | data_save_mode | Enum | No | APPEND_DATA | Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. | | custom_sql | String | No | - | When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. | | enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task only has `insert`, setting this parameter to `false` can speed up data import | ### Tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. ## Task Example ### Simple > This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your mysql. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../getting-started/locally/deployment.md) to install and deploy SeaTunnel. 
And then follow the instructions in [Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) to run this job. ``` # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } # If you would like to get more information about how to configure seatunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/source } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2 } sink { jdbc { url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" username = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink } ``` ### Generate Sink SQL > This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you ``` sink { jdbc { url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" username = "root" password = "123456" # Automatically generate sql statements based on database table names generate_sink_sql = true database = test table = test_table } } ``` ### Exactly-once > For accurate write scene we guarantee accurate once ``` sink { jdbc { url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = 
"com.mysql.cj.jdbc.Driver" max_retries = 0 username = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" is_exactly_once = "true" xa_data_source_class_name = "com.mysql.cj.jdbc.MysqlXADataSource" } } ``` ### CDC(Change Data Capture) Event > CDC change data is also supported by us In this case, you need config database, table and primary_keys. ``` sink { jdbc { url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" username = "root" password = "123456" generate_sink_sql = true # You need to configure both database and table database = test table = sink_table primary_keys = ["id","name"] field_ide = UPPERCASE schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` ### Multiple Table Sync #### Example 1: MySQL CDC Multiple Table Sync > Sync multiple tables from MySQL CDC to target MySQL database, using placeholders for dynamic table name mapping ``` env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { Mysql { url = "jdbc:mysql://localhost:3306?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" username = "root" password = "123456" generate_sink_sql = true database = "${database_name}_test" table = "${table_name}_test" primary_keys = ["${primary_key}"] } } ``` #### Example 2: JDBC Source Multiple Table Sync to MySQL > Batch sync multiple tables from MySQL using JDBC Source to another MySQL database ``` env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = com.mysql.cj.jdbc.Driver url = "jdbc:mysql://localhost:3306/source_db" username = "root" password = "123456" table_list = [ { table_path = "source_db.table_1" }, { table_path = 
"source_db.table_2" } ] } } transform { } sink { Mysql { url = "jdbc:mysql://localhost:3306?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" username = "root" password = "123456" generate_sink_sql = true database = "${database_name}_target" table = "${table_name}_copy" primary_keys = ["${primary_key}"] } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Neo4j.md ================================================ import ChangeLog from '../changelog/connector-neo4j.md'; # Neo4j > Neo4j sink connector ## Description Write data to Neo4j. `neo4j-java-driver` version 4.4.9 ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |----------------------------|---------|----------|---------------| | uri | String | Yes | - | | username | String | No | - | | password | String | No | - | | max_batch_size | Integer | No | - | | write_mode | String | No | OneByOne | | bearer_token | String | No | - | | kerberos_ticket | String | No | - | | database | String | Yes | - | | query | String | Yes | - | | queryParamPosition | Object | Yes | - | | max_transaction_retry_time | Long | No | 30 | | max_connection_timeout | Long | No | 30 | | common-options | config | no | - | ### uri [string] The URI of the Neo4j database. Refer to a case: `neo4j://localhost:7687` ### username [string] username of the Neo4j ### password [string] password of the Neo4j. required if `username` is provided ### max_batch_size [Integer] max_batch_size refers to the maximum number of data entries that can be written in a single transaction when writing to a database. 
### write_mode The default value is `OneByOne`, or set it to "Batch" if you want to have the ability to write in batches ```cypher unwind $ttt as row create (n:Label) set n.name = row.name,n.age = row.age ``` "ttt" represents a batch of data. "ttt" can be any arbitrary string as long as it matches the configured "batch_data_variable". ### bearer_token [string] base64 encoded bearer token of the Neo4j. for Auth. ### kerberos_ticket [string] base64 encoded kerberos ticket of the Neo4j. for Auth. ### database [string] database name. ### query [string] Query statement. contain parameter placeholders that are substituted with the corresponding values at runtime ### queryParamPosition [object] position mapping information for query parameters. key name is parameter placeholder name. associated value is position of field in input data row. ### max_transaction_retry_time [long] maximum transaction retry time(seconds). transaction fail if exceeded ### max_connection_timeout [long] The maximum amount of time to wait for a TCP connection to be established (seconds) ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## WriteOneByOneExample ``` sink { Neo4j { uri = "neo4j://localhost:7687" username = "neo4j" password = "1234" database = "neo4j" max_transaction_retry_time = 10 max_connection_timeout = 10 query = "CREATE (a:Person {name: $name, age: $age})" queryParamPosition = { name = 0 age = 1 } } } ``` ## WriteBatchExample > The unwind keyword provided by cypher supports batch writing, and the default variable for a batch of data is batch. 
If you write a batch write statement, then you should declare the cypher clause `unwind $batch as row` to do something ``` sink { Neo4j { uri = "bolt://localhost:7687" username = "neo4j" password = "neo4j" database = "neo4j" max_batch_size = 1000 write_mode = "BATCH" max_transaction_retry_time = 3 max_connection_timeout = 10 query = "unwind $batch as row create(n:MyLabel) set n.name = row.name,n.age = row.age" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/ObsFile.md ================================================ import ChangeLog from '../changelog/connector-file-obs.md'; # ObsFile > Obs file sink connector ## Support those engines > Spark > > Flink > > Seatunnel Zeta ## Key features - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) By default, we use 2PC commit to ensure `exactly-once` - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] canal_json - [x] debezium_json - [x] maxwell_json ## Description Output data to huawei cloud obs file system. If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. We made some trade-offs in order to support more file types, so we used the HDFS protocol for internal access to OBS and this connector need some hadoop dependencies. It only supports hadoop version **2.9.X+**. 
## Required Jar List | jar | supported versions | maven | |--------------------|-----------------------------|-------------------------------------------------------------------------------------------------------| | hadoop-huaweicloud | support version >= 3.1.1.29 | [Download](https://repo.huaweicloud.com/artifactory/sdk_public/org/apache/hadoop/hadoop-huaweicloud/) | | esdk-obs-java | support version >= 3.19.7.3 | [Download](https://repo.huaweicloud.com/artifactory/sdk_public/com/huawei/storage/esdk-obs-java/) | | okhttp | support version >= 3.11.0 | [Download](https://repo1.maven.org/maven2/com/squareup/okhttp3/okhttp/) | | okio | support version >= 1.14.0 | [Download](https://repo1.maven.org/maven2/com/squareup/okio/okio/) | > Please download the support list corresponding to 'Maven' and copy them to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory. > > And copy all jars to $SEATUNNEL_HOME/lib/ ## Options | name | type | required | default | description | |----------------------------------|---------|----------|--------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | path | string | yes | - | The target dir path. | | bucket | string | yes | - | The bucket address of obs file system, for example: `obs://obs-bucket-name`. | | access_key | string | yes | - | The access key of obs file system. | | access_secret | string | yes | - | The access secret of obs file system. | | endpoint | string | yes | - | The endpoint of obs file system. | | custom_filename | boolean | no | false | Whether you need custom the filename. | | file_name_expression | string | no | "${transactionId}" | Describes the file expression which will be created into the `path`. Only used when custom_filename is true. 
[Tips](#file_name_expression) | | filename_time_format | string | no | "yyyy.MM.dd" | Specify the time format of the `path`. Only used when custom_filename is true. [Tips](#filename_time_format) | | file_format_type | string | no | "csv" | Supported file types. [Tips](#file_format_type) | | field_delimiter | string | no | '\001' | The separator between columns in a row of data.Only used when file_format is text. | | row_delimiter | string | no | "\n" | The separator between rows in a file. Only needed by `text`, `csv` and `json` file format. | | have_partition | boolean | no | false | Whether you need processing partitions. | | partition_by | array | no | - | Partition data based on selected fields. Only used then have_partition is true. | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true.[Tips](#partition_dir_expression) | | is_partition_field_write_in_file | boolean | no | false | Only used then have_partition is true.[Tips](#is_partition_field_write_in_file) | | sink_columns | array | no | | When this parameter is empty, all fields are sink columns.[Tips](#sink_columns) | | is_enable_transaction | boolean | no | true | [Tips](#is_enable_transaction) | | batch_size | int | no | 1000000 | [Tips](#batch_size) | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | | create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | compress_codec | string | no | none | [Tips](#compress_codec) | | common-options | object | no | - | [Tips](#common_options) | | max_rows_in_memory | int | no | - | When File Format is Excel,The maximum number of data items that can be cached in the memory.Only used when file_format is excel. 
| | sheet_name | string | no | Sheet${Random number} | Writer the sheet of the workbook. Only used when file_format is excel. | | sheet_max_rows | int | no | 1048576 | Only used when file format_type is excel. | | merge_update_event | boolean | no | false | Only used when file_format_type is canal_json,debezium_json or maxwell_json. When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data | ### Tips #### file_name_expression > Only used when `custom_filename` is `true` > > `file_name_expression` describes the file expression which will be created into the `path`. > > We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, > > `${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. #### filename_time_format > Only used when `custom_filename` is `true` > > When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows: | Symbol | Description | |--------|--------------------| | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | #### file_format_type > We supported as the following file types: > > `text` `json` `csv` `orc` `parquet` `excel` `canal_json` `debezium_json` `maxwell_json` Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. #### partition_dir_expression > Only used when `have_partition` is `true`. > > If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. 
> > Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. #### is_partition_field_write_in_file > Only used when `have_partition` is `true`. > > If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. > > For example, if you want to write a Hive Data File, Its value should be `false`. #### sink_columns > Which columns need be written to file, default value is all the columns get from `Transform` or `Source`. > The order of the fields determines the order in which the file is actually written. #### is_enable_transaction > If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. > > Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. Only support `true` now. #### batch_size > The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. #### compress_codec > The compress codec of files and the details that supported as the following shown: > > - txt: `lzo` `none` > - json: `lzo` `none` > - csv: `lzo` `none` > - orc: `lzo` `snappy` `lz4` `zlib` `none` > - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` Please note that excel type does not support any compression format #### merge_update_event > Only used when file_format_type is canal_json,debezium_json or maxwell_json. 
> When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data #### common options > Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ## Task Example ### text file > For text file format with `have_partition` and `custom_filename` and `sink_columns` ```hocon ObsFile { path="/seatunnel/text" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "obs.xxxxxx.myhuaweicloud.com" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true } ``` ### parquet file > For parquet file format with `have_partition` and `sink_columns` ```hocon ObsFile { path = "/seatunnel/parquet" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxx" endpoint = "obs.xxxxxx.myhuaweicloud.com" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true file_format_type = "parquet" sink_columns = ["name","age"] } ``` ### orc file > For orc file format simple config ```hocon ObsFile { path="/seatunnel/orc" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "obs.xxxxx.myhuaweicloud.com" file_format_type = "orc" } ``` ### json file > For json file format simple config ```hocon ObsFile { path = "/seatunnel/json" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "obs.xxxxx.myhuaweicloud.com" file_format_type = "json" } ``` ### excel file > For excel file format simple config ```hocon ObsFile { path = "/seatunnel/excel" bucket = 
"obs://obs-bucket-name" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "obs.xxxxx.myhuaweicloud.com" file_format_type = "excel" } ``` ### csv file > For csv file format simple config ```hocon ObsFile { path = "/seatunnel/csv" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "obs.xxxxx.myhuaweicloud.com" file_format_type = "csv" } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/OceanBase.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # OceanBase > JDBC OceanBase Sink Connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) ## Description Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once semantics. ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------|--------------------------------|---------------------------|--------------------------------------|-------------------------------------------------------------------------------| | OceanBase | All OceanBase server versions. | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2883/test | [Download](https://mvnrepository.com/artifact/com.oceanbase/oceanbase-client) | ## Database Dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
> For example: cp oceanbase-client-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping ### Mysql Mode | Mysql Data type | SeaTunnel Data type | |-----------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| | BIT(1)
TINYINT(1) | BOOLEAN | | TINYINT
TINYINT UNSIGNED
SMALLINT
SMALLINT UNSIGNED
MEDIUMINT
MEDIUMINT UNSIGNED
INT
INTEGER
YEAR | INT | | INT UNSIGNED
INTEGER UNSIGNED
BIGINT | BIGINT | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | | DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | | DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to right of the decimal point.)) | | FLOAT
FLOAT UNSIGNED | FLOAT | | DOUBLE
DOUBLE UNSIGNED | DOUBLE | | CHAR
VARCHAR
TINYTEXT
MEDIUMTEXT
TEXT
LONGTEXT
JSON | STRING | | DATE | DATE | | TIME | TIME | | DATETIME
TIMESTAMP | TIMESTAMP | | TINYBLOB
MEDIUMBLOB
BLOB
LONGBLOB
BINARY
VARBINARY
BIT(n) | BYTES | | GEOMETRY
UNKNOWN | Not supported yet | ### Oracle Mode | Oracle Data type | SeaTunnel Data type | |-----------------------------------------------------------|---------------------| | Number(p), p <= 9 | INT | | Number(p), p <= 18 | BIGINT | | Number(p), p > 18 | DECIMAL(38,18) | | REAL
BINARY_FLOAT | FLOAT | | BINARY_DOUBLE | DOUBLE | | CHAR
NCHAR
NVARCHAR2
NCLOB
CLOB
ROWID | STRING | | DATE | DATE | | TIMESTAMP
TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | | BLOB
RAW
LONG RAW
BFILE | BYTES | | UNKNOWN | Not supported yet | ## Sink Options | Name | Type | Required | Default | Description | |-------------------------------------------|---------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oceanbase://localhost:2883/test | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source, should be `com.oceanbase.jdbc.Driver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority | | compatible_mode | String | Yes | - | The compatible mode of OceanBase, can be 'mysql' or 'oracle'. | | database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | | max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | | batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | | generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | | max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | | transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | | auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | | properties | Map | No | - | Additional connection configuration parameters,when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | | enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import | ### Tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. ## Task Example ### Simple > This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your mysql. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../getting-started/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) to run this job. 
``` # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } # If you would like to get more information about how to configure seatunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/source } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2 } sink { jdbc { url = "jdbc:oceanbase://localhost:2883/test" driver = "com.oceanbase.jdbc.Driver" username = "root" password = "123456" compatible_mode = "mysql" query = "insert into test_table(name,age) values(?,?)" } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink } ``` ### Generate Sink SQL > This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you ``` sink { jdbc { url = "jdbc:oceanbase://localhost:2883/test" driver = "com.oceanbase.jdbc.Driver" username = "root" password = "123456" compatible_mode = "mysql" # Automatically generate sql statements based on database table names generate_sink_sql = true database = test table = test_table } } ``` ### CDC(Change Data Capture) Event > CDC change data is also supported by us In this case, you need config database, table and primary_keys. 
``` sink { jdbc { url = "jdbc:oceanbase://localhost:3306/test" driver = "com.oceanbase.jdbc.Driver" username = "root" password = "123456" compatible_mode = "mysql" generate_sink_sql = true # You need to configure both database and table database = test table = sink_table primary_keys = ["id","name"] } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Oracle.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Oracle > JDBC Oracle Sink Connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once semantics (using XA transaction guarantee). ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) > Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is > support `Xa transactions`. You can set `is_exactly_once=true` to enable it. ## Supported DataSource Info | Datasource | Supported Versions | Driver | Url | Maven | |------------|----------------------------------------------------------|--------------------------|----------------------------------------|--------------------------------------------------------------------| | Oracle | Different dependency version has different driver class. | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@datasource01:1523:xe | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | ## Database Dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
> For example Oracle datasource: cp ojdbc8-xxxxxx.jar $SEATUNNEL_HOME/lib/
> To support the i18n character set, copy the orai18n.jar to the $SEATUNNEL_HOME/lib/ directory. ## Data Type Mapping | Oracle Data Type | SeaTunnel Data Type | |--------------------------------------------------------------------------------------|---------------------| | INTEGER | INT | | FLOAT | DECIMAL(38, 18) | | NUMBER(precision <= 9, scale == 0) | INT | | NUMBER(9 < precision <= 18, scale == 0) | BIGINT | | NUMBER(18 < precision, scale == 0) | DECIMAL(38, 0) | | NUMBER(scale != 0) | DECIMAL(38, 18) | | BINARY_DOUBLE | DOUBLE | | BINARY_FLOAT
REAL | FLOAT | | CHAR
NCHAR
NVARCHAR2
VARCHAR2
LONG
ROWID
NCLOB
CLOB
| STRING | | DATE | DATE | | TIMESTAMP
TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | | BLOB
RAW
LONG RAW
BFILE | BYTES | ## Options | Name | Type | Required | Default | Description | |-------------------------------------------|---------|----------|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oracle:thin:@datasource01:1523:xe | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use Oracle the value is `oracle.jdbc.OracleDriver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | No | - | Use this SQL to write upstream input data to the database, e.g. `INSERT ...`. `query` has the higher priority | | database | String | No | - | Use this `database` and `table-name` to auto-generate SQL that writes the received upstream input data to the database.
This option is mutually exclusive with `query` and has a higher priority. | | table | String | No | - | Use `database` and this `table-name` to auto-generate SQL that writes the received upstream input data to the database.
This option is mutually exclusive with `query` and has a higher priority. | | primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generating SQL. | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | | max_retries | Int | No | 0 | The number of retries when a batch submission (executeBatch) fails | | batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms`
, the data will be flushed into the database | | batch_interval_ms | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database | | is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | | generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to. | | xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, Oracle is `oracle.jdbc.xa.client.OracleXADataSource`, and
please refer to appendix for other data sources | | max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | | transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | | auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | | properties | Map | No | - | Additional connection configuration parameters. When properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | | schema_save_mode | Enum | No | CREATE_SCHEMA_WHEN_NOT_EXIST | Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. | | data_save_mode | Enum | No | APPEND_DATA | Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. | | custom_sql | String | No | - | When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. | | enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import | ### Tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. ## Task Example ### Simple > This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your Oracle. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../getting-started/locally/deployment.md) to install and deploy SeaTunnel. 
And then follow the instructions in [Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) to run this job. ``` # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } # If you would like to get more information about how to configure seatunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/source } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2 } sink { jdbc { url = "jdbc:oracle:thin:@datasource01:1523:xe" driver = "oracle.jdbc.OracleDriver" username = root password = 123456 query = "INSERT INTO TEST.TEST_TABLE(NAME,AGE) VALUES(?,?)" } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink } ``` ### Generate Sink SQL > This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you ``` sink { Jdbc { url = "jdbc:oracle:thin:@datasource01:1523:xe" driver = "oracle.jdbc.OracleDriver" username = root password = 123456 generate_sink_sql = true database = XE table = "TEST.TEST_TABLE" } } ``` ### Exactly-once > For accurate write scene we guarantee accurate once ``` sink { jdbc { url = "jdbc:oracle:thin:@datasource01:1523:xe" driver = "oracle.jdbc.OracleDriver" max_retries = 0 username = root password = 123456 query = "INSERT INTO TEST.TEST_TABLE(NAME,AGE) VALUES(?,?)" is_exactly_once = "true" xa_data_source_class_name = "oracle.jdbc.xa.client.OracleXADataSource" } } ``` ### CDC(Change Data Capture) Event > CDC change data is also supported by us In this case, you need config 
database, table and primary_keys. ``` sink { jdbc { url = "jdbc:oracle:thin:@datasource01:1523:xe" driver = "oracle.jdbc.OracleDriver" username = root password = 123456 generate_sink_sql = true # You need to configure both database and table database = XE table = "TEST.TEST_TABLE" primary_keys = ["ID"] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/OssFile.md ================================================ import ChangeLog from '../changelog/connector-file-oss.md'; # OssFile > Oss file sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Usage Dependency ### For Spark/Flink Engine 1. You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. 2. You must ensure `hadoop-aliyun-xx.jar`, `aliyun-sdk-oss-xx.jar` and `jdom-xx.jar` in `${SEATUNNEL_HOME}/plugins/` dir and the version of `hadoop-aliyun` jar need equals your hadoop version which used in spark/flink and `aliyun-sdk-oss-xx.jar` and `jdom-xx.jar` version needs to be the version corresponding to the `hadoop-aliyun` version. Eg: `hadoop-aliyun-3.1.4.jar` dependency `aliyun-sdk-oss-3.4.1.jar` and `jdom-1.1.jar`. ### For SeaTunnel Zeta Engine 1. You must ensure `seatunnel-hadoop3-3.1.4-uber.jar`, `aliyun-sdk-oss-3.4.1.jar`, `hadoop-aliyun-3.1.4.jar` and `jdom-1.1.jar` in `${SEATUNNEL_HOME}/lib/` dir. ## Key features - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) By default, we use 2PC commit to ensure `exactly-once` - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] canal_json - [x] debezium_json - [x] maxwell_json ## Data Type Mapping If write to `csv`, `text`, `json` file type, All column will be string. ### Orc File Type | SeaTunnel Data Type | Orc Data Type | |----------------------|-----------------------| | STRING | STRING | | BOOLEAN | BOOLEAN | | TINYINT | BYTE | | SMALLINT | SHORT | | INT | INT | | BIGINT | LONG | | FLOAT | FLOAT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | BYTES | BINARY | | DATE | DATE | | TIME
TIMESTAMP | TIMESTAMP | | ROW | STRUCT | | NULL | UNSUPPORTED DATA TYPE | | ARRAY | LIST | | Map | Map | ### Parquet File Type | SeaTunnel Data Type | Parquet Data Type | |----------------------|-----------------------| | STRING | STRING | | BOOLEAN | BOOLEAN | | TINYINT | INT_8 | | SMALLINT | INT_16 | | INT | INT32 | | BIGINT | INT64 | | FLOAT | FLOAT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | BYTES | BINARY | | DATE | DATE | | TIME
TIMESTAMP | TIMESTAMP_MILLIS | | ROW | GroupType | | NULL | UNSUPPORTED DATA TYPE | | ARRAY | LIST | | Map | Map | ## Options | Name | Type | Required | Default | Description | |---------------------------------------|---------|----------|--------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | path | string | yes | The oss path to write file in. | | | tmp_path | string | no | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. Need a OSS dir. | | bucket | string | yes | - | | | access_key | string | yes | - | | | access_secret | string | yes | - | | | endpoint | string | yes | - | | | custom_filename | boolean | no | false | Whether you need custom the filename | | file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | | filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | | file_format_type | string | no | "csv" | | | filename_extension | string | no | - | Override the default file name extensions with custom file name extensions. E.g. `.xml`, `.json`, `dat`, `.customtype` | | field_delimiter | string | no | '\001' for text and ',' for csv | Only used when file_format_type is text and csv | | row_delimiter | string | no | "\n" | Only used when file_format_type is `text`, `csv` and `json` | | have_partition | boolean | no | false | Whether you need processing partitions. 
| | partition_by | array | no | - | Only used when have_partition is true | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used when have_partition is true | | is_partition_field_write_in_file | boolean | no | false | Only used when have_partition is true | | sink_columns | array | no | | When this parameter is empty, all fields are sink columns | | is_enable_transaction | boolean | no | true | | | batch_size | int | no | 1000000 | | | compress_codec | string | no | none | | | common-options | object | no | - | | | max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | | sheet_max_rows | int | no | 1048576 | Only used when file_format_type is excel. | | sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. | | csv_string_quote_mode | enum | no | MINIMAL | Only used when file_format is csv. | | xml_root_tag | string | no | RECORDS | Only used when file_format is xml. | | xml_row_tag | string | no | RECORD | Only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | | create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | | enable_header_write | boolean | no | false | Only used when file_format_type is text,csv.
false:don't write header,true:write header. | | encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | | schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Before turning on the synchronous task, do different treatment of the target path | | data_save_mode | Enum | no | APPEND_DATA | Before opening the synchronous task, the data file in the target path is differently processed | | merge_update_event | boolean | no | false | Only used when file_format_type is canal_json,debezium_json or maxwell_json. When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data | ### path [string] The target dir path is required. ### bucket [string] The bucket address of oss file system, for example: `oss://tyrantlucifer-image-bed` ### access_key [string] The access key of oss file system. ### access_secret [string] The access secret of oss file system. ### endpoint [string] The endpoint of oss file system. ### custom_filename [boolean] Whether custom the filename ### file_name_expression [string] Only used when `custom_filename` is `true` `file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, `${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. ### filename_time_format [String] Only used when `custom_filename` is `true` When the format in the `file_name_expression` parameter is `xxxx-${Now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . 
The commonly used time formats are listed as follows: | Symbol | Description | |--------|--------------------| | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | ### file_format_type [string] We supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `canal_json` `debezium_json` `maxwell_json` Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. ### field_delimiter [string] The separator between columns in a row of data. Only needed by `text` and `csv` file format. ### row_delimiter [string] The separator between rows in a file. Only needed by `text`, `csv` and `json` file format. ### have_partition [boolean] Whether you need processing partitions. ### partition_by [array] Only used when `have_partition` is `true`. Partition data based on selected fields. ### partition_dir_expression [string] Only used when `have_partition` is `true`. If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. ### is_partition_field_write_in_file [boolean] Only used when `have_partition` is `true`. If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. For example, if you want to write a Hive Data File, Its value should be `false`. ### sink_columns [array] Which columns need be written to file, default value is all the columns get from `Transform` or `Source`. The order of the fields determines the order in which the file is actually written. 
### is_enable_transaction [boolean] If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. Only support `true` now. ### batch_size [int] The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. ### compress_codec [string] The compress codec of files and the details that supported as the following shown: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc: `lzo` `snappy` `lz4` `zlib` `none` - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` Tips: excel type does not support any compression format ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ### max_rows_in_memory [int] When File Format is Excel,The maximum number of data items that can be cached in the memory. ### sheet_max_rows [int] When file format is Excel, the maximum number of rows per sheet. ### sheet_name [string] Writer the sheet of the workbook ### csv_string_quote_mode [string] When File Format is CSV,The string quote mode of CSV. - ALL: All String fields will be quoted. - MINIMAL: Quotes fields which contain special characters such as a the field delimiter, quote character or any of the characters in the line separator string. - NONE: Never quotes fields. When the delimiter occurs in data, the printer prefixes it with the escape character. If the escape character is not set, format validation throws an exception. 
### xml_root_tag [string] Specifies the tag name of the root element within the XML file. ### xml_row_tag [string] Specifies the tag name of the data rows within the XML file. ### xml_use_attr_format [boolean] Specifies Whether to process data using the tag attribute format. ### parquet_avro_write_timestamp_as_int96 [boolean] Support writing Parquet INT96 from a timestamp, only valid for parquet files. ### parquet_avro_write_fixed_as_int96 [array] Support writing Parquet INT96 from a 12-byte field, only valid for parquet files. ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`. ### schema_save_mode [Enum] Before turning on the synchronous task, do different treatment of the target path. Option introduction: `RECREATE_SCHEMA` :Will be created when the path does not exist. If the path already exists, delete the path and recreate it. `CREATE_SCHEMA_WHEN_NOT_EXIST` :Will Created when the path does not exist, use the path when the path is existed. `ERROR_WHEN_SCHEMA_NOT_EXIST` :Error will be reported when the path does not exist `IGNORE` :Ignore the treatment of the table ### data_save_mode [Enum] Before opening the synchronous task, the data file in the target path is differently processed. Option introduction: `DROP_DATA`: use the path but delete data files in the path. `APPEND_DATA`:use the path, and add new files in the path for write data. `ERROR_WHEN_DATA_EXISTS`:When there are some data files in the path, an error will is reported. ### merge_update_event [boolean] Only used when file_format_type is canal_json,debezium_json or maxwell_json. 
When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data ## How to Create an Oss Data Synchronization Jobs The following example demonstrates how to create a data synchronization job that reads data from Fake Source and writes it to the Oss: For text file format with `have_partition` and `custom_filename` and `sink_columns` ```bash # Set the basic configuration of the task to be performed env { parallelism = 1 job.mode = "BATCH" } # Create a source to product data source { FakeSource { schema = { fields { name = string age = int } } } } # write data to Oss sink { OssFile { path="/seatunnel/sink" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` For parquet file format with `have_partition` and `sink_columns` ```bash # Set the basic configuration of the task to be performed env { parallelism = 1 job.mode = "BATCH" } # Create a source to product data source { FakeSource { schema = { fields { name = string age = int } } } } # Write data to Oss sink { OssFile { path = "/seatunnel/sink" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true file_format_type = "parquet" sink_columns = ["name","age"] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` For 
orc file format simple config ```bash # Set the basic configuration of the task to be performed env { parallelism = 1 job.mode = "BATCH" } # Create a source to product data source { FakeSource { schema = { fields { name = string age = int } } } } # Write data to Oss sink { OssFile { path="/seatunnel/sink" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "orc" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` ### enable_header_write [boolean] Only used when file_format_type is text,csv.false:don't write header,true:write header. ### Multiple Table For extract source metadata from upstream, you can use `${database_name}`, `${table_name}` and `${schema_name}` in the path. ```bash env { parallelism = 1 spark.app.name = "SeaTunnel" spark.executor.instances = 2 spark.executor.cores = 1 spark.executor.memory = "1g" spark.master = local job.mode = "BATCH" } source { FakeSource { tables_configs = [ { schema = { table = "fake1" fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp } } } }, { schema = { table = "fake2" fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { c_map = "map" c_array = "array" c_string = string c_boolean = 
boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp } } } } ] } } sink { OssFile { bucket = "oss://whale-ops" access_key = "xxxxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxx" endpoint = "https://oss-accelerate.aliyuncs.com" path = "/tmp/fake_empty/text/${table_name}" row_delimiter = "\n" partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true file_name_expression = "${transactionId}_${now}" file_format_type = "text" filename_time_format = "yyyy.MM.dd" is_enable_transaction = true compress_codec = "lzo" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` ### Tips > 1.[SeaTunnel Deployment Document](../../getting-started/locally/deployment.md). ## Changelog ================================================ FILE: docs/en/connectors/sink/OssJindoFile.md ================================================ import ChangeLog from '../changelog/connector-file-oss-jindo.md'; # OssJindoFile > OssJindo file sink connector ## Description Output data to oss file system using jindo api. :::tip You need to download [jindosdk-4.6.1.tar.gz](https://jindodata-binary.oss-cn-shanghai.aliyuncs.com/release/4.6.1/jindosdk-4.6.1.tar.gz) and then unzip it, copy jindo-sdk-4.6.1.jar and jindo-core-4.6.1.jar from lib to ${SEATUNNEL_HOME}/lib. If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. We made some trade-offs in order to support more file types, so we used the HDFS protocol for internal access to OSS and this connector need some hadoop dependencies. 
It only supports hadoop version **2.9.X+**. ::: ## Key features - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) By default, we use 2PC commit to ensure `exactly-once` - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] canal_json - [x] debezium_json - [x] maxwell_json ## Options | Name | Type | Required | Default | Description | |---------------------------------------|---------|----------|--------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | path | string | yes | - | | | tmp_path | string | no | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. Need a OSS dir. | | bucket | string | yes | - | | | access_key | string | yes | - | | | access_secret | string | yes | - | | | endpoint | string | yes | - | | | custom_filename | boolean | no | false | Whether you need custom the filename | | file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | | filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | | file_format_type | string | no | "csv" | | | filename_extension | string | no | - | Override the default file name extensions with custom file name extensions. E.g. 
`.xml`, `.json`, `dat`, `.customtype` | | field_delimiter | string | no | '\001' for text and ',' for csv | Only used when file_format_type is text and csv | | row_delimiter | string | no | "\n" | Only used when file_format_type is `text`, `csv` and `json` | | have_partition | boolean | no | false | Whether you need processing partitions. | | partition_by | array | no | - | Only used when have_partition is true | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used when have_partition is true | | is_partition_field_write_in_file | boolean | no | false | Only used when have_partition is true | | sink_columns | array | no | | When this parameter is empty, all fields are sink columns | | is_enable_transaction | boolean | no | true | | | batch_size | int | no | 1000000 | | | compress_codec | string | no | none | | | common-options | object | no | - | | | max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | | sheet_max_rows | int | no | 1048576 | Only used when file_format_type is excel. | | sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. | | csv_string_quote_mode | enum | no | MINIMAL | Only used when file_format is csv. | | xml_root_tag | string | no | RECORDS | Only used when file_format is xml. | | xml_row_tag | string | no | RECORD | Only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | | create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet.
| | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | | encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | | merge_update_event | boolean | no | false | Only used when file_format_type is canal_json,debezium_json or maxwell_json. When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data | ### path [string] The target dir path is required. ### bucket [string] The bucket address of oss file system, for example: `oss://tyrantlucifer-image-bed` ### access_key [string] The access key of oss file system. ### access_secret [string] The access secret of oss file system. ### endpoint [string] The endpoint of oss file system. ### custom_filename [boolean] Whether custom the filename ### file_name_expression [string] Only used when `custom_filename` is `true` `file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, `${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. ### filename_time_format [string] Only used when `custom_filename` is `true` When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . 
The commonly used time formats are listed as follows: | Symbol | Description | |--------|--------------------| | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | ### file_format_type [string] We supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `canal_json` `debezium_json` `maxwell_json` Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. ### field_delimiter [string] The separator between columns in a row of data. Only needed by `text` and `csv` file format. ### row_delimiter [string] The separator between rows in a file. Only needed by `text`, `csv` and `json` file format. ### have_partition [boolean] Whether you need processing partitions. ### partition_by [array] Only used when `have_partition` is `true`. Partition data based on selected fields. ### partition_dir_expression [string] Only used when `have_partition` is `true`. If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. ### is_partition_field_write_in_file [boolean] Only used when `have_partition` is `true`. If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. For example, if you want to write a Hive Data File, Its value should be `false`. ### sink_columns [array] Which columns need be written to file, default value is all the columns get from `Transform` or `Source`. The order of the fields determines the order in which the file is actually written. 
### is_enable_transaction [boolean] If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. Only support `true` now. ### batch_size [int] The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is jointly determined by `batch_size` and `checkpoint.interval`. If the value of `checkpoint.interval` is large enough, the sink writer will write rows to a file until the number of rows in the file exceeds `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint is triggered. ### compress_codec [string] The compression codec of files. The supported codecs are as follows: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc: `lzo` `snappy` `lz4` `zlib` `none` - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` Tips: excel type does not support any compression format ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ### max_rows_in_memory [int] When file format is Excel, the maximum number of data items that can be cached in the memory. ### sheet_max_rows [int] When file format is Excel, the maximum number of rows per sheet. ### sheet_name [string] Write the sheet of the workbook ### csv_string_quote_mode [string] When file format is CSV, the string quote mode of CSV. - ALL: All String fields will be quoted. - MINIMAL: Quotes fields which contain special characters such as the field delimiter, quote character or any of the characters in the line separator string. - NONE: Never quotes fields. When the delimiter occurs in data, the printer prefixes it with the escape character. If the escape character is not set, format validation throws an exception. 
### xml_root_tag [string] Specifies the tag name of the root element within the XML file. ### xml_row_tag [string] Specifies the tag name of the data rows within the XML file. ### xml_use_attr_format [boolean] Specifies Whether to process data using the tag attribute format. ### parquet_avro_write_timestamp_as_int96 [boolean] Support writing Parquet INT96 from a timestamp, only valid for parquet files. ### parquet_avro_write_fixed_as_int96 [array] Support writing Parquet INT96 from a 12-byte field, only valid for parquet files. ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`. ### merge_update_event [boolean] Only used when file_format_type is canal_json,debezium_json or maxwell_json. When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data ## Example For text file format with `have_partition` and `custom_filename` and `sink_columns` ```hocon OssJindoFile { path="/seatunnel/sink" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true } ``` For parquet file format with `sink_columns` ```hocon OssJindoFile { path = "/seatunnel/sink" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "parquet" sink_columns = ["name","age"] } ``` For orc file format simple config ```bash OssJindoFile { path="/seatunnel/sink" bucket = "oss://tyrantlucifer-image-bed" access_key = 
"xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "orc" } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Paimon.md ================================================ import ChangeLog from '../changelog/connector-paimon.md'; # Paimon > Paimon sink connector ## Description Sink connector for Apache Paimon. It can support cdc mode 、auto create table. ### Comparison between SeaTunnel and Paimon version | Seatunnel Version | Paimon Version | |-------------------|------------------| | 2.3.2 - 2.3.3 | 0.4-SNAPSHOT | | 2.3.4 | 0.6-SNAPSHOT | | 2.3.5 - 2.3.11 | 0.7.0-incubating | | 2.3.12 - 2.3.13 | 1.1.1 | ### Key Considerations for Upgrading Paimon from `0.7.0-incubating` to `1.1.1` 1. **Backup Recommendations** Although compatibility is ensured, it is strongly recommended to backup critical data, especially the metadata directory, before initiating the upgrade. 2. **Gradual Upgrade Process** - **Test Environment Validation**: First validate the upgrade process in a staging environment. - **Update JAR Files**: Replace Paimon JAR files with version 1.1.1. - **Automatic Format Upgrade**: The system will automatically detect and upgrade older file formats. 3. **Configuration Check** Review your configurations to ensure no deprecated options are in use. While most configurations remain backward-compatible, deprecated settings may require updates. 4. **Post-Upgrade Validation** Verify the following after upgrading: - **Read/Write Operations**: Ensure data ingestion and retrieval workflows function normally. - **Query Performance**: Confirm that query response times meet expectations. - **New Feature Verification**: Test all newly introduced features (e.g., time travel, enhanced compaction) to ensure proper functionality. **Note**: These steps help minimize risks and ensure a smooth transition to the stable version 1.1.1. 
## Supported DataSource Info | Datasource | Dependent | Maven | |------------|-----------|---------------------------------------------------------------------------| | Paimon | hive-exec | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) | | Paimon | libfb303 | [Download](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) | ## Database Dependency > In order to be compatible with different versions of Hadoop and Hive, the scope of hive-exec in the project pom file are provided, so if you use the Flink engine, first you may need to add the following Jar packages to /lib directory, if you are using the Spark engine and integrated with Hadoop, then you do not need to add the following Jar packages. ``` hive-exec-xxx.jar libfb303-xxx.jar ``` > Some versions of the hive-exec package do not have libfb303-xxx.jar, so you also need to manually import the Jar package. ## Key features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | Description | |------------------------------|---------|----------|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| | warehouse | String | Yes | - | Paimon warehouse path | | catalog_type | String | No | filesystem | Catalog type of Paimon, support filesystem and hive | | catalog_uri | String | No | - | Catalog uri of Paimon, only needed when catalog_type is hive | | database | String | Yes | - | The database you want to access | | table | String | Yes | - | The table you want to access | | user | String | No | - | Paimon user to access table | | password | String | No | - | Paimon user password to access table | | hdfs_site_path | String | No | - | The path of hdfs-site.xml | | schema_save_mode | 
Enum | No | CREATE_SCHEMA_WHEN_NOT_EXIST | The schema save mode | | data_save_mode | Enum | No | APPEND_DATA | The data save mode | | paimon.table.primary-keys | String | No | - | Default comma-separated list of columns (primary key) that identify a row in tables.(Notice: The partition field needs to be included in the primary key fields) | | paimon.table.partition-keys | String | No | - | Default comma-separated list of partition fields to use when creating tables. | | paimon.table.write-props | Map | No | - | Properties passed through to paimon table initialization, [reference](https://paimon.apache.org/docs/master/maintenance/configurations/#coreoptions). | | paimon.hadoop.conf | Map | No | - | Properties in hadoop conf | | paimon.hadoop.conf-path | String | No | - | The specified loading path for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files | | paimon.table.non-primary-key | Boolean | No | false | Switch to create `table with PK` or `table without PK`. true : `table without PK`, false : `table with PK` | | branch | String | No | main | The branch name of Paimon table to write data to. If the branch does not exist, an exception will be thrown. | ## Checkpoint in batch mode When you set `checkpoint.interval` to a value greater than 0 in batch mode, the paimon connector will commit the data to the paimon table when the checkpoint triggers after a certain number of records have been written. At this moment, the written data in paimon is visible. However, if you do not set `checkpoint.interval` in batch mode, the paimon sink connector will commit the data after all records are written. The written data in paimon is not visible until the batch task completes. ## Changelog You must configure the `changelog-producer=input` option to enable the changelog producer mode of the paimon table. If you use the auto-create table function of paimon sink, you can configure this property in `paimon.table.write-props`. 
The changelog producer mode of the paimon table has [four modes](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/), which are `none`, `input`, `lookup` and `full-compaction`. All `changelog-producer` modes are currently supported. The default is `none`. * [`none`](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/#none) * [`input`](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/#input) * [`lookup`](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/#lookup) * [`full-compaction`](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/#full-compaction) > note: > When you use a streaming mode to read paimon table, different modes will produce [different results](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/source/Paimon.md#changelog). ## Filesystems The Paimon connector supports writing data to multiple file systems. Currently, the supported file systems are hdfs and s3. If you use the s3 filesystem, you can configure the `fs.s3a.access-key`, `fs.s3a.secret-key`, `fs.s3a.endpoint`, `fs.s3a.path.style.access`, `fs.s3a.aws.credentials.provider` properties in the `paimon.hadoop.conf` option. Besides, the warehouse should start with `s3a://`. ## Schema Evolution Cdc Ingestion supports a limited number of schema changes. Currently supported schema changes include: * Adding columns. * Modify column. More specifically, If you modify the column type, the following changes are supported: * altering from a string type (char, varchar, text) to another string type with longer length, * altering from a binary type (binary, varbinary, blob) to another binary type with longer length, * altering from an integer type (tinyint, smallint, int, bigint) to another integer type with wider range, * altering from a floating-point type (float, double) to another floating-point type with wider range, are supported. 
> Note: > > If {oldType} and {newType} belong to the same type family but the old type has higher precision than the new type, the conversion is ignored. * Drop columns. * Change columns. ## Examples ### Schema evolution ```hocon env { # You can set engine configuration here parallelism = 5 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { Paimon { warehouse = "file:///tmp/paimon" database = "mysql_to_paimon" table = "products" } } ``` ### Single table ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role"] } } transform { } sink { Paimon { catalog_name="seatunnel_test" warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" database="seatunnel" table="role" } } ``` ### Single table with s3 filesystem ```hocon env { execution.parallelism = 1 job.mode = "BATCH" } source { FakeSource { schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp } } } } sink { Paimon { warehouse = "s3a://test/" database = "seatunnel_namespace11" table = "st_test" paimon.hadoop.conf = { fs.s3a.access-key=G52pnxg67819khOZ9ezX fs.s3a.secret-key=SHJuAQqHsLrgZWikvMa3lJf5T0NfM5LMFliJh9HF fs.s3a.endpoint="http://minio4:9000" fs.s3a.path.style.access=true fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider } } } ``` ### Single table(Specify hadoop HA config and kerberos config) ```hocon env { parallelism = 1 job.mode = 
"STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role"] } } transform { } sink { Paimon { catalog_name="seatunnel_test" warehouse="hdfs:///tmp/seatunnel/paimon/hadoop-sink/" database="seatunnel" table="role" paimon.hadoop.conf = { fs.defaultFS = "hdfs://nameservice1" dfs.nameservices = "nameservice1" dfs.ha.namenodes.nameservice1 = "nn1,nn2" dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" dfs.client.use.datanode.hostname = "true" security.kerberos.login.principal = "your-kerberos-principal" security.kerberos.login.keytab = "your-kerberos-keytab-path" } } } ``` ### Single table(Specify hadoop HA config with hadoop_user_name) ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role"] } } transform { } sink { Paimon { catalog_name="seatunnel_test" warehouse="hdfs:///tmp/seatunnel/paimon/hadoop-sink/" database="seatunnel" table="role" paimon.hadoop.conf = { hadoop_user_name = "hdfs" fs.defaultFS = "hdfs://nameservice1" dfs.nameservices = "nameservice1" dfs.ha.namenodes.nameservice1 = "nn1,nn2" dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" dfs.client.use.datanode.hostname = "true" security.kerberos.login.principal = "your-kerberos-principal" security.kerberos.login.keytab = "your-kerberos-keytab-path" } } } ``` ### Single table(Hive catalog) ```hocon env { parallelism = 1 job.mode = "BATCH" } source 
{ FakeSource { schema = { fields { pk_id = bigint name = string score = int } primaryKey { name = "pk_id" columnNames = [pk_id] } } rows = [ { kind = INSERT fields = [1, "A", 100] }, { kind = INSERT fields = [2, "B", 100] }, { kind = INSERT fields = [3, "C", 100] }, { kind = INSERT fields = [3, "C", 100] }, { kind = INSERT fields = [3, "C", 100] }, { kind = INSERT fields = [3, "C", 100] } { kind = UPDATE_BEFORE fields = [1, "A", 100] }, { kind = UPDATE_AFTER fields = [1, "A_1", 100] }, { kind = DELETE fields = [2, "B", 100] } ] } } sink { Paimon { schema_save_mode = "RECREATE_SCHEMA" catalog_name="seatunnel_test" catalog_type="hive" catalog_uri="thrift://hadoop04:9083" warehouse="hdfs:///tmp/seatunnel" database="seatunnel_test" table="st_test3" paimon.hadoop.conf = { fs.defaultFS = "hdfs://nameservice1" dfs.nameservices = "nameservice1" dfs.ha.namenodes.nameservice1 = "nn1,nn2" dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" dfs.client.use.datanode.hostname = "true" } } } ``` ### Single table with write props of paimon ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role"] } } sink { Paimon { catalog_name="seatunnel_test" warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" database="seatunnel" table="role" paimon.table.write-props = { bucket = 2 file.format = "parquet" } paimon.table.partition-keys = "dt" paimon.table.primary-keys = "pk_id,dt" } } ``` #### Write with the `changelog-producer` attribute ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = 
["seatunnel.role"] } } sink { Paimon { catalog_name = "seatunnel_test" warehouse = "file:///tmp/seatunnel/paimon/hadoop-sink/" database = "seatunnel" table = "role" paimon.table.write-props = { changelog-producer = full-compaction changelog-tmp-path = /tmp/paimon/changelog } } } ``` ### Write to dynamic bucket table Single dynamic bucket table with write props of paimon,operates on the primary key table and bucket is -1. > Notes: > - Currently only the ordinary dynamic bucket mode is supported (the primary key must include all partition fields). > - When running in a cluster environment, `parallelism` must be set to `1`; otherwise, data duplication may occur. #### core options Please [reference](https://paimon.apache.org/docs/master/primary-key-table/data-distribution/#dynamic-bucket) | name | type | required | default values | Description | |--------------------------------|------|----------|----------------|------------------------------------------------| | dynamic-bucket.target-row-num | long | yes | 2000000L | controls the target row number for one bucket. | | dynamic-bucket.initial-buckets | int | no | | controls the number of initialized bucket. 
| ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role"] } } sink { Paimon { catalog_name="seatunnel_test" warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" database="seatunnel" table="role" paimon.table.write-props = { bucket = -1 dynamic-bucket.target-row-num = 50000 } paimon.table.partition-keys = "dt" paimon.table.primary-keys = "pk_id,dt" } } ``` ### Multiple table #### example1 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { Paimon { catalog_name="seatunnel_test" warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" database="${database_name}_test" table="${table_name}_test" } } ``` #### example2 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = oracle.jdbc.driver.OracleDriver url = "jdbc:oracle:thin:@localhost:1521/XE" user = testUser password = testPassword table_list = [ { table_path = "TESTSCHEMA.TABLE_1" }, { table_path = "TESTSCHEMA.TABLE_2" } ] } } transform { } sink { Paimon { catalog_name="seatunnel_test" warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" database="${schema_name}_test" table="${table_name}_test" } } ``` ### paimon enable privilege #### example1 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { Paimon { catalog_name = "seatunnel_test" warehouse = "file:///tmp/seatunnel/paimon/hadoop-sink/" database = "${database_name}" table = "${table_name}" user = "paimon" password = "******" } } ``` 
## Changelog ================================================ FILE: docs/en/connectors/sink/Phoenix.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Phoenix > Phoenix sink connector ## Description Write Phoenix data through [Jdbc connector](Jdbc.md). Support Batch mode and Streaming mode. The tested Phoenix versions are 4.xx and 5.xx. On the underlying implementation, through the jdbc driver of Phoenix, execute the upsert statement to write data to HBase. There are two ways of connecting to Phoenix with Java JDBC. One is to connect to zookeeper through JDBC, and the other is to connect to queryserver through JDBC thin client. > Tips: By default, the (thin) driver jar is used. If you want to use the (thick) driver or other versions of Phoenix (thin) driver, you need to recompile the jdbc connector module > > Tips: Not support exactly-once semantics (XA transaction is not yet supported in Phoenix). ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options ### driver [string] If you use the phoenix (thick) driver, the value is `org.apache.phoenix.jdbc.PhoenixDriver`; if you use the (thin) driver, the value is `org.apache.phoenix.queryserver.client.Driver` ### url [string] If you use the phoenix (thick) driver, the value is `jdbc:phoenix:localhost:2182/hbase`; if you use the (thin) driver, the value is `jdbc:phoenix:thin:url=http://localhost:8765;serialization=PROTOBUF` ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## Example Use thick client driver ``` Jdbc { driver = org.apache.phoenix.jdbc.PhoenixDriver url = "jdbc:phoenix:localhost:2182/hbase" query = "upsert into test.sink(age, name) values(?, ?)" } ``` Use thin client driver ``` Jdbc { driver = org.apache.phoenix.queryserver.client.Driver url = "jdbc:phoenix:thin:url=http://spark_e2e_phoenix_sink:8765;serialization=PROTOBUF" query = "upsert into 
test.sink(age, name) values(?, ?)" } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/PostgreSql.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # PostgreSql > JDBC PostgreSql Sink Connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once semantics (using XA transaction guarantee). ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) > Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is > support `Xa transactions`. You can set `is_exactly_once=true` to enable it. ## Supported DataSource Info | Datasource | Supported Versions | Driver | Url | Maven | |------------|------------------------------------------------------------|-----------------------|---------------------------------------|--------------------------------------------------------------------------| | PostgreSQL | Different dependency version has different driver class. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/org.postgresql/postgresql) | | PostgreSQL | If you want to manipulate the GEOMETRY/GEOGRAPHY type in PostgreSQL. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/net.postgis/postgis-jdbc) | ## Database Dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
> For example PostgreSQL datasource: cp postgresql-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/
> If you want to manipulate the GEOMETRY type in PostgreSQL, add postgresql-xxx.jar and postgis-jdbc-xxx.jar to $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping | PostgreSQL Data Type | SeaTunnel Data Type | |--------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| | BOOL
| BOOLEAN | | _BOOL
| ARRAY<BOOLEAN> | | BYTEA
| BYTES | | _BYTEA
| ARRAY<TINYINT> | | INT2
SMALLSERIAL
INT4
SERIAL
| INT | | _INT2
_INT4
| ARRAY<INT> | | INT8
BIGSERIAL
| BIGINT | | _INT8
| ARRAY<BIGINT> | | FLOAT4
| FLOAT | | _FLOAT4
| ARRAY<FLOAT> | | FLOAT8
| DOUBLE | | _FLOAT8
| ARRAY<DOUBLE> | | NUMERIC(Get the designated column's specified column size>0) | DECIMAL(Get the designated column's specified column size,Gets the number of digits in the specified column to the right of the decimal point) | | NUMERIC(Get the designated column's specified column size<0) | DECIMAL(38, 18) | | BPCHAR
CHARACTER
VARCHAR
TEXT
GEOMETRY
GEOGRAPHY
JSON
JSONB
UUID | STRING | | _BPCHAR
_CHARACTER
_VARCHAR
_TEXT | ARRAY<STRING> | | TIMESTAMP
| TIMESTAMP | | TIME
| TIME | | DATE
| DATE | | OTHER DATA TYPES | NOT SUPPORTED YET | ## Options | Name | Type | Required | Default | Description | |-------------------------------------------|---------|----------|------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:postgresql://localhost:5432/test
if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use PostgreSQL the value is `org.postgresql.Driver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority | | database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. The table parameter can be set to any table name, which will eventually be used as the name of the created table, and supports variables (`${table_name}`, `${schema_name}`). Replacement rules: `${schema_name}` will replace the SCHEMA name passed to the target side, and `${table_name}` will replace the name of the table passed to the table at the target side. | | primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | | max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | | batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | | is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | | generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to. | | xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, PostgreSQL is `org.postgresql.xa.PGXADataSource`, and
please refer to appendix for other data sources | | max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | | transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | | auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | | field_ide | String | No | - | Identify whether the field needs to be converted when synchronizing from the source to the sink. `ORIGINAL` indicates no conversion is needed;`UPPERCASE` indicates conversion to uppercase;`LOWERCASE` indicates conversion to lowercase. | | properties | Map | No | - | Additional connection configuration parameters,when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | | schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. | | data_save_mode | Enum | no | APPEND_DATA | Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. | | custom_sql | String | no | - | When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. | | enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import | ### table [string] Use `database` and this `table-name` auto-generate sql and receive upstream input datas write to database. This option is mutually exclusive with `query` and has a higher priority. The table parameter can fill in the name of an unwilling table, which will eventually be used as the table name of the creation table, and supports variables (`${table_name}`, `${schema_name}`). Replacement rules: `${schema_name}` will replace the SCHEMA name passed to the target side, and `${table_name}` will replace the name of the table passed to the table at the target side. for example: 1. ${schema_name}.${table_name}_test 2. dbo.tt_${table_name}_sink 3. public.sink_table ### schema_save_mode [Enum] Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. 
Option introduction: `RECREATE_SCHEMA`: Creates the table when it does not exist; drops and recreates it when it already exists `CREATE_SCHEMA_WHEN_NOT_EXIST`: Creates the table when it does not exist; skips creation when it already exists `ERROR_WHEN_SCHEMA_NOT_EXIST`: Reports an error when the table does not exist `IGNORE`: Ignores the treatment of the table ### data_save_mode [Enum] Before the synchronous task is turned on, different processing schemes are selected for the existing data on the target side. Option introduction: `DROP_DATA`: Preserve database structure and delete data `APPEND_DATA`: Preserve database structure, preserve data `CUSTOM_PROCESSING`: User defined processing `ERROR_WHEN_DATA_EXISTS`: When there is data, an error is reported ### custom_sql [String] When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. ### Tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. ## Task Example ### Simple > This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table, which will also contain 16 rows of data. Before running this job, you need to create the database test and the table test_table in your PostgreSQL. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../getting-started/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) to run this job. 
``` # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } # If you would like to get more information about how to configure seatunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/source } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2 } sink { jdbc { # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option url = "jdbc:postgresql://localhost:5432/test" driver = "org.postgresql.Driver" username = root password = 123456 query = "insert into test_table(name,age) values(?,?)" } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink } ``` ### Generate Sink SQL > This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you ``` sink { Jdbc { # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option url = "jdbc:postgresql://localhost:5432/test" driver = org.postgresql.Driver username = root password = 123456 generate_sink_sql = true database = test table = "public.test_table" } } ``` ### Exactly-once > For accurate write scene we guarantee accurate once ``` sink { jdbc { # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option url = "jdbc:postgresql://localhost:5432/test" driver = "org.postgresql.Driver" max_retries = 0 username = root password = 123456 query = "insert into test_table(name,age) values(?,?)" is_exactly_once = "true" xa_data_source_class_name = "org.postgresql.xa.PGXADataSource" } } ``` ### 
CDC(Change Data Capture) Event > CDC change data is also supported by us In this case, you need config database, table and primary_keys. ``` sink { jdbc { # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option url = "jdbc:postgresql://localhost:5432/test" driver = "org.postgresql.Driver" username = root password = 123456 generate_sink_sql = true # You need to configure both database and table database = test table = sink_table primary_keys = ["id","name"] field_ide = UPPERCASE } } ``` ### Save mode function ``` sink { Jdbc { # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option url = "jdbc:postgresql://localhost:5432/test" driver = org.postgresql.Driver username = root password = 123456 generate_sink_sql = true database = test table = "public.test_table" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Prometheus.md ================================================ import ChangeLog from '../changelog/connector-prometheus.md'; # Prometheus > Prometheus sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## Description Used to launch web hooks using data. > For example, if the data from upstream is [`label: {"__name__": "test1"}, value: 1.23, time: 2024-08-15T17:00:00`], the body content is the following: `{"label":{"__name__": "test1"}, "value":"1.23","time":"2024-08-15T17:00:00"}` **Tips: Prometheus sink only supports `post json` webhook and the data from source will be treated as body content in web hook. It does not support passing past data** ## Supported DataSource Info In order to use the Prometheus connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Dependency | |------------|--------------------|------------------------------------------------------------------------------------------------------------------| | Http | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-prometheus) | ## Sink Options | Name | Type | Required | Default | Description | |-----------------------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | Http request url | | headers | Map | No | - | Http headers | | retry | Int | No | - | The maximum number of retries if the HTTP request throws an `IOException` | | retry_backoff_multiplier_ms | Int | No | 100 | The retry-backoff times(millis) multiplier if request http failed | | retry_backoff_max_ms | Int | No | 10000 | The maximum retry-backoff times(millis) if request http failed | | connect_timeout_ms | Int | No | 12000 | Connection timeout setting, default 12s. 
| | socket_timeout_ms | Int | No | 60000 | Socket timeout setting, default 60s. | | key_timestamp | Int | No | - | Prometheus timestamp key | | key_label | String | Yes | - | Prometheus label key | | key_value | Double | Yes | - | Prometheus value | | batch_size | Int | No | 1024 | Prometheus batch size write | | flush_interval | Long | No | 300000L | Prometheus flush commit interval | | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | ## Example simple: ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { schema = { fields { c_map = "map<string, string>" c_double = double c_timestamp = timestamp } } plugin_output = "fake" rows = [ { kind = INSERT fields = [{"__name__": "test1"}, 1.23, "2024-08-15T17:00:00"] }, { kind = INSERT fields = [{"__name__": "test2"}, 1.23, "2024-08-15T17:00:00"] } ] } } sink { Prometheus { url = "http://prometheus:9090/api/v1/write" key_label = "c_map" key_value = "c_double" key_timestamp = "c_timestamp" batch_size = 1 } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Pulsar.md ================================================ import ChangeLog from '../changelog/connector-pulsar.md'; # Pulsar > Pulsar sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Description Sink connector for Apache Pulsar. ## Supported DataSource Info | Datasource | Supported Versions | |------------|--------------------| | Pulsar | Universal | ## Sink Options | Name | Type | Required | Default | Description | |----------------------|--------|----------|---------------------|------------------------------------------------------------------------------------------------------------------| | topic | String | Yes | - | sink pulsar topic | | client.service-url | String | Yes | - | Service URL provider for Pulsar service. | | admin.service-url | String | Yes | - | The Pulsar service HTTP URL for the admin endpoint. | | auth.plugin-class | String | No | - | Name of the authentication plugin. | | auth.params | String | No | - | Parameters for the authentication plugin. | | format | String | No | json | Data format. The default format is json. Optional text format. | | field_delimiter | String | No | , | Customize the field delimiter for data format. | | semantics | Enum | No | AT_LEAST_ONCE | Consistency semantics for writing to pulsar. | | transaction_timeout | Int | No | 600 | The transaction timeout is specified as 10 minutes by default. | | pulsar.config | Map | No | - | In addition to the above parameters that must be specified by the Pulsar producer client, the user can also specify non-mandatory parameters for the producer client. | | message.routing.mode | Enum | No | RoundRobinPartition | Default routing mode for messages to partition. | | partition_key_fields | array | No | - | Configure which fields are used as the key of the pulsar message. | | common-options | config | no | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. | ## Parameter Interpretation ### client.service-url [String] Service URL provider for Pulsar service. To connect to Pulsar using client libraries, you need to specify a Pulsar protocol URL. 
You can assign Pulsar protocol URLs to specific clusters and use the Pulsar scheme. For example, `localhost`: `pulsar://localhost:6650,localhost:6651`. ### admin.service-url [String] The Pulsar service HTTP URL for the admin endpoint. For example, `http://my-broker.example.com:8080`, or `https://my-broker.example.com:8443` for TLS. ### auth.plugin-class [String] Name of the authentication plugin. ### auth.params [String] Parameters for the authentication plugin. For example, `key1:val1,key2:val2` ### format [String] Data format. The default format is json. Optional text format. The default field separator is ",". If you customize the delimiter, add the "field_delimiter" option. ### field_delimiter [String] Customize the field delimiter for data format. The default field_delimiter is ','. ### semantics [Enum] Consistency semantics for writing to pulsar. Available options are EXACTLY_ONCE, NON, AT_LEAST_ONCE; the default is AT_LEAST_ONCE. If semantics is specified as EXACTLY_ONCE, we will use 2pc to guarantee the message is sent to pulsar exactly once. If semantics is specified as NON, we will directly send the message to pulsar, and the data may be duplicated or lost if the job restarts/retries or a network error occurs. ### transaction_timeout [Int] The transaction timeout is specified as 10 minutes by default. If the transaction does not commit within the specified timeout, the transaction will be automatically aborted. So you need to ensure that the timeout is greater than the checkpoint interval. ### pulsar.config [Map] In addition to the above parameters that must be specified by the Pulsar producer client, the user can also specify multiple non-mandatory parameters for the producer client, covering all the producer parameters specified in the official Pulsar document. ### message.routing.mode [Enum] Default routing mode for messages to partition. Available options are SinglePartition,RoundRobinPartition. 
If you choose SinglePartition: if no key is provided, the partitioned producer will randomly pick one single partition and publish all the messages into that partition; if a key is provided on the message, the partitioned producer will hash the key and assign the message to a particular partition. If you choose RoundRobinPartition: if no key is provided, the producer will publish messages across all partitions in round-robin fashion to achieve maximum throughput. Please note that round-robin is not done per individual message but rather it's set to the same boundary of batching delay, to ensure batching is effective. ### partition_key_fields [array] Configure which fields are used as the key of the pulsar message. For example, if you want to use value of fields from upstream data as key, you can assign field names to this property. Upstream data is the following: | name | age | data | |------|-----|---------------| | Jack | 16 | data-example1 | | Mary | 23 | data-example2 | If name is set as the key, then the hash value of the name column will determine which partition the message is sent to. If partition key fields are not set, a null message key will be sent. The format of the message key is json; for example, if name is set as the key, the message key is '{"name":"Jack"}'. The selected field must be an existing field in the upstream. ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ## Task Example ### Simple > This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to Pulsar Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target topic is test_topic, which will also contain 16 rows of data. 
And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../getting-started/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) to run this job. ```hocon # Defining the runtime environment env { # You can set flink configuration here execution.parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } sink { Pulsar { topic = "example" client.service-url = "pulsar://localhost:6650" admin.service-url = "http://my-broker.example.com:8080" plugin_output = "test" pulsar.config = { sendTimeoutMs = 30000 } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Qdrant.md ================================================ import ChangeLog from '../changelog/connector-qdrant.md'; # Qdrant > Qdrant Sink Connector ## Description [Qdrant](https://qdrant.tech/) is a high-performance vector search engine and vector database. This connector can be used to write data into a Qdrant collection. ## Data Type Mapping | SeaTunnel Data Type | Qdrant Data Type | |---------------------|------------------| | TINYINT | INTEGER | | SMALLINT | INTEGER | | INT | INTEGER | | BIGINT | INTEGER | | FLOAT | DOUBLE | | DOUBLE | DOUBLE | | BOOLEAN | BOOL | | STRING | STRING | | ARRAY | LIST | | FLOAT_VECTOR | DENSE_VECTOR | | BINARY_VECTOR | DENSE_VECTOR | | FLOAT16_VECTOR | DENSE_VECTOR | | BFLOAT16_VECTOR | DENSE_VECTOR | | SPARSE_FLOAT_VECTOR | SPARSE_VECTOR | The value of the primary key column will be used as point ID in Qdrant. If no primary key is present, a random UUID will be used. 
## Options | name | type | required | default value | |-----------------|--------|----------|---------------| | collection_name | string | yes | - | | batch_size | int | no | 64 | | host | string | no | localhost | | port | int | no | 6334 | | api_key | string | no | - | | use_tls | bool | no | false | | common-options | | no | - | ### collection_name [string] The name of the Qdrant collection to write data to. ### batch_size [int] The batch size of each upsert request to Qdrant. ### host [string] The host name of the Qdrant instance. Defaults to "localhost". ### port [int] The gRPC port of the Qdrant instance. ### api_key [string] The API key to use for authentication if set. ### use_tls [bool] Whether to use TLS(SSL) connection. Required if using Qdrant cloud(https). ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ## Changelog ================================================ FILE: docs/en/connectors/sink/Rabbitmq.md ================================================ import ChangeLog from '../changelog/connector-rabbitmq.md'; # Rabbitmq > Rabbitmq sink connector ## Description Used to write data to Rabbitmq. 
## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |----------------------------|---------|----------|---------------| | host | string | yes | - | | port | int | yes | - | | virtual_host | string | yes | - | | username | string | yes | - | | password | string | yes | - | | queue_name | string | yes | - | | url | string | no | - | | network_recovery_interval | int | no | - | | topology_recovery_enabled | boolean | no | - | | automatic_recovery_enabled | boolean | no | - | | use_correlation_id | boolean | no | false | | connection_timeout | int | no | - | | rabbitmq.config | map | no | - | | common-options | | no | - | | durable | boolean | no | true | | exclusive | boolean | no | false | | auto_delete | boolean | no | false | ### host [string] the default host to use for connections ### port [int] the default port to use for connections ### virtual_host [string] virtual host – the virtual host to use when connecting to the broker ### username [string] the AMQP user name to use when connecting to the broker ### password [string] the password to use when connecting to the broker ### url [string] convenience method for setting the fields in an AMQP URI: host, port, username, password and virtual host ### queue_name [string] the queue to write the message to ### durable [boolean] true: The queue will survive a server restart. false: The queue will be deleted on server restart. ### exclusive [boolean] true: The queue is used only by the current connection and will be deleted when the connection closes. false: The queue can be used by multiple connections. ### auto_delete [boolean] true: The queue will be deleted automatically when the last consumer unsubscribes. false: The queue will not be automatically deleted. ### schema [Config] #### fields [Config] the schema fields of upstream data. 
### network_recovery_interval [int] how long will automatic recovery wait before attempting to reconnect, in ms ### topology_recovery_enabled [boolean] if true, enables topology recovery ### automatic_recovery_enabled [boolean] if true, enables connection recovery ### use_correlation_id [boolean] whether the messages received are supplied with a unique id to deduplicate messages (in case of failed acknowledgments). ### connection_timeout [int] connection TCP establishment timeout in milliseconds; zero for infinite ### rabbitmq.config [map] In addition to the above parameters that must be specified by the RabbitMQ client, the user can also specify multiple non-mandatory parameters for the client, covering [all the parameters specified in the official RabbitMQ document](https://www.rabbitmq.com/configure.html). ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ### durable - true: The queue will survive on server restart. - false: The queue will be deleted on server restart. ### exclusive - true: The queue is used only by the current connection and will be deleted when the connection closes. - false: The queue can be used by multiple connections. ### auto-delete - true: The queue will be deleted automatically when the last consumer unsubscribes. - false: The queue will not be automatically deleted. 
## Example simple: ```hocon sink { RabbitMQ { host = "rabbitmq-e2e" port = 5672 virtual_host = "/" username = "guest" password = "guest" queue_name = "test1" rabbitmq.config = { requested-heartbeat = 10 connection-timeout = 10 } } } ``` ### Example 2 queue with durable, exclusive, auto_delete: ```hocon sink { RabbitMQ { host = "rabbitmq-e2e" port = 5672 virtual_host = "/" username = "guest" password = "guest" queue_name = "test1" durable = "true" exclusive = "false" auto_delete = "false" rabbitmq.config = { requested-heartbeat = 10 connection-timeout = 10 } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Redis.md ================================================ import ChangeLog from '../changelog/connector-redis.md'; # Redis > Redis sink connector ## Description Used to write data to Redis. ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |--------------------|---------|-----------------------|---------------| | host | string | yes when mode=single | - | | port | int | no | 6379 | | key | string | yes | - | | data_type | string | yes | - | | batch_size | int | no | 10 | | user | string | no | - | | auth | string | no | - | | db_num | int | no | 0 | | mode | string | no | single | | nodes | list | yes when mode=cluster | - | | format | string | no | json | | expire | long | no | -1 | | support_custom_key | boolean | no | false | | value_field | string | no | - | | hash_key_field | string | no | - | | hash_value_field | string | no | - | | field_delimiter | string | no | ',' | | common-options | | no | - | ### host [string] Redis host ### port [int] Redis port ### key [string] The value of key you want to write to redis. For example, if you want to use value of a field from upstream data as key, you can assign it to the field name. 
Upstream data is the following: | code | data | success | |------|----------------|---------| | 200 | get success | true | | 500 | internal error | false | If you assign field name to `code` and data_type to `key`, two records will be written to redis: 1. `200 -> {code: 200, data: get success, success: true}` 2. `500 -> {code: 500, data: internal error, success: false}` If you assign field name to `value` and data_type to `key`, only one record will be written to redis because `value` does not exist in the upstream data's fields: 1. `value -> {code: 500, data: internal error, success: false}` Please see the data_type section for specific writing rules. Note that JSON is used here only as an example; the actual format of the written data is determined by the configured `format`. ### data_type [string] Redis data types, support `key` `hash` `list` `set` `zset` - key > Each data from upstream will be updated to the configured key, which means the later data will overwrite the earlier data, and only the last data will be stored in the key. - hash > Each data from upstream will be split according to the field and written to the hash key, also the data after will overwrite the data before. - list > Each data from upstream will be added to the configured list key. - set > Each data from upstream will be added to the configured set key. - zset > Each data from upstream will be added to the configured zset key with a weight of 1. So the order of data in zset is based on the order of data consumption. > ### batch_size [int] ensure the batch write size in single-machine mode; no guarantees in cluster mode. ### user [string] redis authentication user, you need it when you connect to an encrypted cluster ### auth [string] Redis authentication password, you need it when you connect to an encrypted cluster ### db_num [int] Redis database index ID. 
It is connected to db 0 by default ### mode [string] redis mode, `single` or `cluster`, default is `single` ### nodes [list] redis nodes information, used in cluster mode, must be in the following format: ["host1:port1", "host2:port2"] ### format [string] The format of upstream data, currently supports `json` and `text`, default `json`. When format is set to `json`, for example: Upstream data is the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | Connector will generate data as the following and write it to redis: ```json {"code": 200, "data": "get success", "success": "true"} ``` When format is set to `text` and field_delimiter is set to `#`, connector will generate data as the following and write it to redis: ```text 200#get success#true ``` ### field_delimiter [string] Field delimiter, used to tell connector how to slice and dice fields. Currently, it only needs to be configured when format is `text`. The default is ",". ### expire [long] Set redis expiration time, the unit is second. The default value is -1, keys do not automatically expire by default. ### support_custom_key [boolean] if true, the key can be customized by the field value in the upstream data. Upstream data is the following: | code | data | success | |------|----------------|---------| | 200 | get success | true | | 500 | internal error | false | You can customize the Redis key using '{' and '}', and the field name in '{}' will be parsed and replaced by the field value in the upstream data. For example, If you assign field name to `{code}` and data_type to `key`, two records will be written to redis: 1. `200 -> {code: 200, data: get success, success: true}` 2. `500 -> {code: 500, data: internal error, success: false}` Redis key can be composed of fixed and variable parts, connected by ':'. For example, If you assign field name to `code:{code}` and data_type to `key`, two records will be written to redis: 1. 
`code:200 -> {code: 200, data: get success, success: true}` 2. `code:500 -> {code: 500, data: internal error, success: false}` ### value_field [string] The field of value you want to write to redis, `data_type` support `key` `list` `set` `zset`. When you assign field name to `value` and value_field is `data` and data_type to `key`, for example: Upstream data is the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | The following data will be written to redis: 1. `value -> get success` ### hash_key_field [string] The field of hash key you want to write to redis, `data_type` support `hash` ### hash_value_field [string] The field of hash value you want to write to redis, `data_type` support `hash` When you assign field name to `value` and hash_key_field is `data` and hash_value_field is `success` and data_type to `hash`, for example: Upstream data is the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | Connector will generate data as the following and write it to redis: The following data will be written to redis: 1. 
`value -> get success | true` ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## Example simple: ```hocon Redis { host = localhost port = 6379 key = age data_type = list } ``` custom key: ```hocon Redis { host = localhost port = 6379 key = "name:{name}" support_custom_key = true data_type = key } ``` custom value: ```hocon Redis { host = localhost port = 6379 key = person value_field = "name" data_type = key } ``` custom HashKey and HashValue: ```hocon Redis { host = localhost port = 6379 key = person hash_key_field = "name" hash_value_field = "age" data_type = hash } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Redshift.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Redshift > JDBC Redshift sink Connector ## Support those engines > Spark
> Flink
> SeaTunnel Zeta
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) > Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is > support `Xa transactions`. You can set `is_exactly_once=true` to enable it. ## Description Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once semantics (using XA transaction guarantee). ## Supported DataSource list | datasource | supported versions | driver | url | maven | |------------|----------------------------------------------------------|---------------------------------|-----------------------------------------|------------------------------------------------------------------------------------| | redshift | Different dependency version has different driver class. | com.amazon.redshift.jdbc.Driver | jdbc:redshift://localhost:5439/database | [Download](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) | ## Database dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Data Type Mapping | SeaTunnel Data type | Redshift Data type | |-------------------------|--------------------| | BOOLEAN | BOOLEAN | | TINYINT
SMALLINT | SMALLINT | | INT | INTEGER | | BIGINT | BIGINT | | FLOAT | REAL | | DOUBLE | DOUBLE PRECISION | | DECIMAL | NUMERIC | | STRING(<=65535) | CHARACTER VARYING | | STRING(>65535) | SUPER | | BYTES | BINARY VARYING | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | MAP
ARRAY
ROW | SUPER | ## Task Example ### Simple ``` sink { jdbc { url = "jdbc:redshift://localhost:5439/mydatabase" driver = "com.amazon.redshift.jdbc.Driver" username = "myUser" password = "myPassword" generate_sink_sql = true schema = "public" table = "sink_table" } } ``` ### CDC(Change data capture) event > CDC change data is also supported. In this case, you need to configure the database, table and primary_keys. ``` sink { jdbc { url = "jdbc:redshift://localhost:5439/mydatabase" driver = "com.amazon.redshift.jdbc.Driver" username = "myUser" password = "mypassword" generate_sink_sql = true schema = "public" table = "sink_table" # config update/delete primary keys primary_keys = ["id","name"] } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/RocketMQ.md ================================================ import ChangeLog from '../changelog/connector-rocketmq.md'; # RocketMQ > RocketMQ sink connector ## Support Apache RocketMQ Version - 4.9.0 (Or a newer version, for reference) ## Support These Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) By default, we will use 2pc to guarantee the message is sent to RocketMQ exactly once. ## Description Write Rows to a Apache RocketMQ topic. ## Sink Options | Name | Type | Required | Default | Description | |----------------------|---------|----------|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------| | topic | string | yes | - | `RocketMQ topic` name. | | name.srv.addr | string | yes | - | `RocketMQ` name server cluster address. | | acl.enabled | Boolean | no | false | false | | access.key | String | no | | When ACL_ENABLED is true, access key cannot be empty | | secret.key | String | no | | When ACL_ENABLED is true, secret key cannot be empty | | producer.group | String | no | SeaTunnel-producer-Group | SeaTunnel-producer-Group | | tag | String | no | - | `RocketMQ` message tag. | | partition.key.fields | array | no | - | - | | format | String | no | json | Data format. The default format is json. Optional text format. The default field separator is ",".If you customize the delimiter, add the "field_delimiter" option. | | field.delimiter | String | no | , | Customize the field delimiter for data format. | | producer.send.sync | Boolean | no | false | If true, the message will be sync sent. | | common-options | config | no | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. | ### partition.key.fields [array] Configure which fields are used as the key of the RocketMQ message. For example, if you want to use value of fields from upstream data as key, you can assign field names to this property. 
Upstream data is the following: | name | age | data | |------|-----|---------------| | Jack | 16 | data-example1 | | Mary | 23 | data-example2 | If name is set as the key, then the hash value of the name column will determine which partition the message is sent to. ## Task Example ### Fake to Rocketmq Simple > The data is randomly generated and asynchronously sent to the test topic ```hocon env { parallelism = 1 } source { FakeSource { schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/category/transform } sink { Rocketmq { name.srv.addr = "localhost:9876" topic = "test_topic" } } ``` ### Rocketmq To Rocketmq Simple > Data is consumed from RocketMQ and written to another topic; the hash of the `c_int` field determines which partition each message is written to. Messages are sent asynchronously, which is the default. ```hocon env { parallelism = 1 } source { Rocketmq { name.srv.addr = "localhost:9876" topics = "test_topic" plugin_output = "rocketmq_table" schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } sink { Rocketmq { name.srv.addr = "localhost:9876" topic = "test_topic_sink" partition.key.fields = ["c_int"] } } ``` ### Timestamp consumption write Simple > This is a streaming job that starts consuming from a specified position. When new partitions are added, the program periodically discovers them and starts consuming from them, and the data is written to another topic. ```hocon env { parallelism
= 1 job.mode = "STREAMING" } source { Rocketmq { name.srv.addr = "localhost:9876" topics = "test_topic" plugin_output = "rocketmq_table" start.mode = "CONSUME_FROM_FIRST_OFFSET" batch.size = "400" consumer.group = "test_topic_group" format = "json" format = json schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/category/transform } sink { Rocketmq { name.srv.addr = "localhost:9876" topic = "test_topic" partition.key.fields = ["c_int"] producer.send.sync = true } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/S3-Redshift.md ================================================ import ChangeLog from '../changelog/connector-s3-redshift.md'; # S3Redshift > The way of S3Redshift is to write data into S3, and then use Redshift's COPY command to import data from S3 to Redshift. ## Description Output data to AWS Redshift. > Tips: > We based on the [S3File](S3File.md) to implement this connector. So you can use the same configuration as S3File. > We made some trade-offs in order to support more file types, so we used the HDFS protocol for internal access to S3 and this connector need some hadoop dependencies. > It's only support hadoop version **2.6.5+**. 
## Key features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) By default, we use 2PC commit to ensure `exactly-once` - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json ## Options | name | type | required | default value | |----------------------------------|---------|----------|-----------------------------------------------------------| | jdbc_url | string | yes | - | | jdbc_user | string | yes | - | | jdbc_password | string | yes | - | | execute_sql | string | yes | - | | path | string | yes | - | | bucket | string | yes | - | | access_key | string | no | - | | access_secret | string | no | - | | hadoop_s3_properties | map | no | - | | file_name_expression | string | no | "${transactionId}" | | file_format_type | string | no | "text" | | filename_time_format | string | no | "yyyy.MM.dd" | | field_delimiter | string | no | '\001' | | row_delimiter | string | no | "\n" | | partition_by | array | no | - | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | | is_partition_field_write_in_file | boolean | no | false | | sink_columns | array | no | When this parameter is empty, all fields are sink columns | | is_enable_transaction | boolean | no | true | | batch_size | int | no | 1000000 | | common-options | | no | - | ### jdbc_url The JDBC URL to connect to the Redshift database. ### jdbc_user The JDBC user to connect to the Redshift database. ### jdbc_password The JDBC password to connect to the Redshift database. ### execute_sql The SQL to execute after the data is written to S3. eg: ```sql COPY target_table FROM 's3://yourbucket${path}' IAM_ROLE 'arn:XXX' REGION 'your region' format as json 'auto'; ``` `target_table` is the table name in Redshift. `${path}` is the path of the file written to S3. please confirm your sql include this variable. and don't need replace it. we will replace it when execute sql. IAM_ROLE is the role that has permission to access S3. 
format is the format of the file written to S3. please confirm this format is same as the file format you set in the configuration. please refer to [Redshift COPY](https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html) for more details. please confirm that the role has permission to access S3. ### path [string] The target dir path is required. ### bucket [string] The bucket address of s3 file system, for example: `s3n://seatunnel-test`, if you use `s3a` protocol, this parameter should be `s3a://seatunnel-test`. ### access_key [string] The access key of s3 file system. If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) ### access_secret [string] The access secret of s3 file system. If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) ### hadoop_s3_properties [map] If you need to add a other option, you could add it here and refer to this [Hadoop-AWS](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) ``` hadoop_s3_properties { "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" } ``` ### file_name_expression [string] `file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, `${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. 
### file_format_type [string] We supported as the following file types: `text` `csv` `parquet` `orc` `json` Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. ### filename_time_format [string] When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows: | Symbol | Description | |--------|--------------------| | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | See [Java SimpleDateFormat](https://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html) for detailed time format syntax. ### field_delimiter [string] The separator between columns in a row of data. Only needed by `text` and `csv` file format. ### row_delimiter [string] The separator between rows in a file. Only needed by `text` and `csv` file format. ### partition_by [array] Partition data based on selected fields ### partition_dir_expression [string] If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. ### is_partition_field_write_in_file [boolean] If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be written into data file. For example, if you want to write a Hive Data File, Its value should be `false`. ### sink_columns [array] Which columns need be written to file, default value is all the columns get from `Transform` or `Source`. The order of the fields determines the order in which the file is actually written. 
### is_enable_transaction [boolean] If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. Only support `true` now. ### batch_size [int] The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ## Example For text file format ```hocon S3Redshift { jdbc_url = "jdbc:redshift://xxx.amazonaws.com.cn:5439/xxx" jdbc_user = "xxx" jdbc_password = "xxxx" execute_sql="COPY table_name FROM 's3://test${path}' IAM_ROLE 'arn:aws-cn:iam::xxx' REGION 'cn-north-1' removequotes emptyasnull blanksasnull maxerror 100 delimiter '|' ;" access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel" path="/seatunnel/text" row_delimiter="\n" partition_dir_expression="${k0}=${v0}" is_partition_field_write_in_file=true file_name_expression="${transactionId}_${now}" file_format_type = "text" filename_time_format="yyyy.MM.dd" is_enable_transaction=true hadoop_s3_properties { "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" } } ``` For parquet file format ```hocon S3Redshift { jdbc_url = "jdbc:redshift://xxx.amazonaws.com.cn:5439/xxx" jdbc_user = "xxx" jdbc_password = "xxxx" execute_sql="COPY table_name FROM 's3://test${path}' IAM_ROLE 'arn:aws-cn:iam::xxx' REGION 'cn-north-1' format as PARQUET;" 
access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel" path="/seatunnel/parquet" row_delimiter="\n" partition_dir_expression="${k0}=${v0}" is_partition_field_write_in_file=true file_name_expression="${transactionId}_${now}" file_format_type = "parquet" filename_time_format="yyyy.MM.dd" is_enable_transaction=true hadoop_s3_properties { "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" } } ``` For orc file format ```hocon S3Redshift { jdbc_url = "jdbc:redshift://xxx.amazonaws.com.cn:5439/xxx" jdbc_user = "xxx" jdbc_password = "xxxx" execute_sql="COPY table_name FROM 's3://test${path}' IAM_ROLE 'arn:aws-cn:iam::xxx' REGION 'cn-north-1' format as ORC;" access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel" path="/seatunnel/orc" row_delimiter="\n" partition_dir_expression="${k0}=${v0}" is_partition_field_write_in_file=true file_name_expression="${transactionId}_${now}" file_format_type = "orc" filename_time_format="yyyy.MM.dd" is_enable_transaction=true hadoop_s3_properties { "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/S3File.md ================================================ import ChangeLog from '../changelog/connector-file-s3.md'; # S3File > S3 File Sink Connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) By default, we use 2PC commit to ensure `exactly-once` - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] canal_json - [x] debezium_json - [x] maxwell_json ## Description Output data to aws s3 file system. ## Supported DataSource Info | Datasource | Supported Versions | |------------|--------------------| | S3 | current | ## Database Dependency > If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. > > If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under `${SEATUNNEL_HOME}/lib` to confirm this. > To use this connector you need put `hadoop-aws-3.1.4.jar` and `aws-java-sdk-bundle-1.12.692.jar` in `${SEATUNNEL_HOME}/lib` dir. ## Data Type Mapping If write to `csv`, `text` file type, All column will be string. ### Orc File Type | SeaTunnel Data type | Orc Data type | |----------------------|-----------------------| | STRING | STRING | | BOOLEAN | BOOLEAN | | TINYINT | BYTE | | SMALLINT | SHORT | | INT | INT | | BIGINT | LONG | | FLOAT | FLOAT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | BYTES | BINARY | | DATE | DATE | | TIME
TIMESTAMP | TIMESTAMP | | ROW | STRUCT | | NULL | UNSUPPORTED DATA TYPE | | ARRAY | LIST | | Map | Map | ### Parquet File Type | SeaTunnel Data type | Parquet Data type | |----------------------|-----------------------| | STRING | STRING | | BOOLEAN | BOOLEAN | | TINYINT | INT_8 | | SMALLINT | INT_16 | | INT | INT32 | | BIGINT | INT64 | | FLOAT | FLOAT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | BYTES | BINARY | | DATE | DATE | | TIME
TIMESTAMP | TIMESTAMP_MILLIS | | ROW | GroupType | | NULL | UNSUPPORTED DATA TYPE | | ARRAY | LIST | | Map | Map | ## Sink Options | name | type | required | default value | Description | |---------------------------------------|---------|----------|-------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | path | string | yes | - | | | tmp_path | string | no | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. Need a S3 dir. | | bucket | string | yes | - | | | fs.s3a.endpoint | string | yes | - | | | fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider | The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now. | | access_key | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider | | secret_key | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider | | custom_filename | boolean | no | false | Whether you need custom the filename | | file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | | filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | | file_format_type | string | no | "csv" | | | filename_extension | string | no | - | Override the default file name extensions with custom file name extensions. E.g. 
`.xml`, `.json`, `dat`, `.customtype` | | field_delimiter | string | no | '\001' for text and ',' for csv | Only used when file_format is text and csv | | row_delimiter | string | no | "\n" | Only used when file_format is `text`, `csv` and `json` | | have_partition | boolean | no | false | Whether you need processing partitions. | | partition_by | array | no | - | Only used when have_partition is true | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used when have_partition is true | | is_partition_field_write_in_file | boolean | no | false | Only used when have_partition is true | | sink_columns | array | no | | When this parameter is empty, all fields are sink columns | | is_enable_transaction | boolean | no | true | | | batch_size | int | no | 1000000 | | | compress_codec | string | no | none | | | common-options | object | no | - | | | max_rows_in_memory | int | no | - | Only used when file_format is excel. | | sheet_max_rows | int | no | 1048576 | Only used when file_format is excel. | | sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. | | csv_string_quote_mode | enum | no | MINIMAL | Only used when file_format is csv. | | xml_root_tag | string | no | RECORDS | Only used when file_format is xml, specifies the tag name of the root element within the XML file. | | xml_row_tag | string | no | RECORD | Only used when file_format is xml, specifies the tag name of the data rows within the XML file | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml, specifies Whether to process data using the tag attribute format. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. 
| | create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet. | | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | | hadoop_s3_properties | map | no | | If you need to add a other option, you could add it here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) | | schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Before turning on the synchronous task, do different treatment of the target path | | data_save_mode | Enum | no | APPEND_DATA | Before opening the synchronous task, the data file in the target path is differently processed | | enable_header_write | boolean | no | false | Only used when file_format_type is text,csv.
false:don't write header,true:write header. | | encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | | merge_update_event | boolean | no | false | Only used when file_format_type is canal_json,debezium_json or maxwell_json. When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data | ### path [string] Store the path of the data file to support variable replacement. For example: path=/test/${database_name}/${schema_name}/${table_name} ### hadoop_s3_properties [map] If you need to add a other option, you could add it here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) ``` hadoop_s3_properties { "fs.s3a.buffer.dir" = "/data/st_test/s3a" "fs.s3a.fast.upload.buffer" = "disk" } ``` ### custom_filename [boolean] Whether custom the filename ### file_name_expression [string] Only used when `custom_filename` is `true` `file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, `${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. ### filename_time_format [string] Only used when `custom_filename` is `true` When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . 
The commonly used time formats are listed as follows: | Symbol | Description | |--------|--------------------| | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | ### file_format_type [string] We supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `canal_json` `debezium_json` `maxwell_json` Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. ### field_delimiter [string] The separator between columns in a row of data. Only needed by `text` and `csv` file format. ### row_delimiter [string] The separator between rows in a file. Only needed by `text`, `csv` and `json` file format. ### have_partition [boolean] Whether you need processing partitions. ### partition_by [array] Only used when `have_partition` is `true`. Partition data based on selected fields. ### partition_dir_expression [string] Only used when `have_partition` is `true`. If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. ### is_partition_field_write_in_file [boolean] Only used when `have_partition` is `true`. If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. For example, if you want to write a Hive Data File, Its value should be `false`. ### sink_columns [array] Which columns need be written to file, default value is all the columns get from `Transform` or `Source`. The order of the fields determines the order in which the file is actually written. 
### is_enable_transaction [boolean] If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. Please note that if `is_enable_transaction` is `true`, we will automatically add `${transactionId}_` to the head of the file name. Only `true` is supported now. ### batch_size [int] The maximum number of rows in a file. For SeaTunnel Engine, the number of rows in a file is jointly determined by `batch_size` and `checkpoint.interval`. If the value of `checkpoint.interval` is large enough, the sink writer will keep writing rows to a file until the number of rows in the file exceeds `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file each time a new checkpoint is triggered. ### compress_codec [string] The compression codec of the files. The codecs supported for each file type are as follows: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc: `lzo` `snappy` `lz4` `zlib` `none` - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` Tips: the excel type does not support any compression format. ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ### max_rows_in_memory [int] When the file format is Excel, the maximum number of data items that can be cached in memory. ### sheet_max_rows [int] When file format is Excel, the maximum number of rows per sheet. ### sheet_name [string] The name of the sheet to write in the workbook. ### csv_string_quote_mode [string] When the file format is CSV, the string quote mode of CSV. - ALL: All String fields will be quoted. - MINIMAL: Quotes fields which contain special characters such as the field delimiter, quote character or any of the characters in the line separator string. - NONE: Never quotes fields. When the delimiter occurs in data, the printer prefixes it with the escape character. If the escape character is not set, format validation throws an exception.
### xml_root_tag [string] Specifies the tag name of the root element within the XML file. ### xml_row_tag [string] Specifies the tag name of the data rows within the XML file. ### xml_use_attr_format [boolean] Specifies whether to process data using the tag attribute format. ### parquet_avro_write_timestamp_as_int96 [boolean] Support writing Parquet INT96 from a timestamp, only valid for parquet files. ### parquet_avro_write_fixed_as_int96 [array] Support writing Parquet INT96 from a 12-byte field, only valid for parquet files. ### schema_save_mode [Enum] Specifies how the target path is handled before the synchronization task starts. Option introduction: `RECREATE_SCHEMA`: The path will be created when it does not exist. If the path already exists, it is deleted and recreated. `CREATE_SCHEMA_WHEN_NOT_EXIST`: The path will be created when it does not exist; if the path already exists, it is used as is. `ERROR_WHEN_SCHEMA_NOT_EXIST`: An error will be reported when the path does not exist. `IGNORE`: Ignore the treatment of the table. ### data_save_mode [Enum] Specifies how existing data files in the target path are handled before the synchronization task starts. Option introduction: `DROP_DATA`: Use the path but delete the data files in the path. `APPEND_DATA`: Use the path, and add new files in the path for writing data. `ERROR_WHEN_DATA_EXISTS`: When there are data files in the path, an error will be reported. ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`. ### merge_update_event [boolean] Only used when file_format_type is canal_json,debezium_json or maxwell_json. When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data ## Example ### Simple > This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to S3File Sink.
FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target s3 dir will also create a file and all of the data in write in it. > Before run this job, you need create s3 path: /seatunnel/text. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../getting-started/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) to run this job. ``` # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { c_map = "map>" c_array = "array" name = string c_boolean = boolean age = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(16, 1)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } # If you would like to get more information about how to configure seatunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/source } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2 } sink { S3File { bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel" path="/seatunnel/text" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = 
true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction=true hadoop_s3_properties { "fs.s3a.buffer.dir" = "/data/st_test/s3a" "fs.s3a.fast.upload.buffer" = "disk" } } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink } ``` For text file format with `have_partition` and `custom_filename` and `sink_columns` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` ```hocon S3File { bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel" path="/seatunnel/text" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction=true hadoop_s3_properties { "fs.s3a.buffer.dir" = "/data/st_test/s3a" "fs.s3a.fast.upload.buffer" = "disk" } } ``` For parquet file format simple config with `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` ```hocon S3File { bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel" path="/seatunnel/parquet" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" file_format_type = "parquet" hadoop_s3_properties { "fs.s3a.buffer.dir" = "/data/st_test/s3a" "fs.s3a.fast.upload.buffer" = "disk" } } ``` For orc file format simple config with `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` ```hocon S3File { bucket = "s3a://seatunnel-test" tmp_path = 
"/tmp/seatunnel" path="/seatunnel/orc" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" file_format_type = "orc" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } ``` Multi-table writing and saveMode ```hocon env { "job.name"="SeaTunnel_job" "job.mode"=STREAMING } source { MySQL-CDC { database-names=[ "wls_t1" ] table-names=[ "wls_t1.mysqlcdc_to_s3_t3", "wls_t1.mysqlcdc_to_s3_t4", "wls_t1.mysqlcdc_to_s3_t5", "wls_t1.mysqlcdc_to_s3_t1", "wls_t1.mysqlcdc_to_s3_t2" ] password="xxxxxx" username="xxxxxxxxxxxxx" url="jdbc:mysql://localhost:3306/qa_source" } } transform { } sink { S3File { bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel/${table_name}" path="/test/${table_name}" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" file_format_type = "orc" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` ### enable_header_write [boolean] Only used when file_format_type is text,csv.false:don't write header,true:write header. ## Changelog ================================================ FILE: docs/en/connectors/sink/SelectDB-Cloud.md ================================================ import ChangeLog from '../changelog/connector-selectdb-cloud.md'; # SelectDB Cloud > SelectDB Cloud sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) ## Description Used to send data to SelectDB Cloud. Both support streaming and batch mode. The internal implementation of SelectDB Cloud sink connector upload after batch caching and commit the CopyInto sql to load data into the table. ## Supported DataSource Info :::tip Version Supported * supported `SelectDB Cloud version is >= 2.2.x` ::: ## Sink Options | Name | Type | Required | Default | Description | |--------------------|--------|----------|------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | load-url | String | Yes | - | `SelectDB Cloud` warehouse http address, the format is `warehouse_ip:http_port` | | jdbc-url | String | Yes | - | `SelectDB Cloud` warehouse jdbc address, the format is `warehouse_ip:mysql_port` | | cluster-name | String | Yes | - | `SelectDB Cloud` cluster name | | username | String | Yes | - | `SelectDB Cloud` user username | | password | String | Yes | - | `SelectDB Cloud` user password | | sink.enable-2pc | bool | No | true | Whether to enable two-phase commit (2pc), the default is true, to ensure Exactly-Once semantics. SelectDB uses cache files to load data. When the amount of data is large, cached data may become invalid (the default expiration time is 1 hour). If you encounter a large amount of data write loss, please configure sink.enable-2pc to false. | | table.identifier | String | Yes | - | The name of `SelectDB Cloud` table, the format is `database.table` | | sink.enable-delete | bool | No | false | Whether to enable deletion. 
This option requires SelectDB Cloud table to enable batch delete function, and only supports Unique model. | | sink.max-retries | int | No | 3 | the max retry times if writing records to database failed | | sink.buffer-size | int | No | 10 * 1024 * 1024 (10MB) | the buffer size to cache data for stream load. | | sink.buffer-count | int | No | 10000 | the buffer count to cache data for stream load. | | selectdb.config | map | yes | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generating sql, and supported formats. | ## Data Type Mapping | SelectDB Cloud Data type | SeaTunnel Data type | |--------------------------|-----------------------------------------| | BOOLEAN | BOOLEAN | | TINYINT | TINYINT | | SMALLINT | SMALLINT
TINYINT | | INT | INT
SMALLINT
TINYINT | | BIGINT | BIGINT
INT
SMALLINT
TINYINT | | LARGEINT | BIGINT
INT
SMALLINT
TINYINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE
FLOAT | | DECIMAL | DECIMAL
DOUBLE
FLOAT | | DATE | DATE | | DATETIME | TIMESTAMP | | CHAR | STRING | | VARCHAR | STRING | | STRING | STRING | | ARRAY | ARRAY | | MAP | MAP | | JSON | STRING | | HLL | Not supported yet | | BITMAP | Not supported yet | | QUANTILE_STATE | Not supported yet | | STRUCT | Not supported yet | #### Supported import data formats The supported formats include CSV and JSON ## Task Example ### Simple > The following example describes writing multiple data types to SelectDBCloud, and users need to create corresponding tables downstream ```hocon env { parallelism = 1 job.mode = "BATCH" checkpoint.interval = 10000 } source { FakeSource { row.num = 10 map.size = 10 array.size = 10 bytes.length = 10 string.length = 10 schema = { fields { c_map = "map>" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(16, 1)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } sink { SelectDBCloud { load-url = "warehouse_ip:http_port" jdbc-url = "warehouse_ip:mysql_port" cluster-name = "Cluster" table.identifier = "test.test" username = "admin" password = "******" selectdb.config { file.type = "json" } } } ``` ### Use JSON format to import data ``` sink { SelectDBCloud { load-url = "warehouse_ip:http_port" jdbc-url = "warehouse_ip:mysql_port" cluster-name = "Cluster" table.identifier = "test.test" username = "admin" password = "******" selectdb.config { file.type = "json" } } } ``` ### Use CSV format to import data ``` sink { SelectDBCloud { load-url = "warehouse_ip:http_port" jdbc-url = "warehouse_ip:mysql_port" cluster-name = "Cluster" table.identifier = "test.test" username = "admin" password = "******" selectdb.config { file.type = "csv" file.column_separator = "," file.line_delimiter = "\n" } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/SensorsData.md 
================================================ import ChangeLog from '../changelog/connector-sensorsdata.md'; # SensorsData > SensorsData sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) ## Description A sink plugin which use SensorsData SDK send data records. ## Sink Options | name | type | required | default value | |---------------------------|---------|----------|---------------| | server_url | string | yes | - | | bulk_size | int | no | 50 | | max_cache_row_size | int | no | 0 | | consumer | string | no | batch | | entity_name | string | yes | users | | record_type | string | yes | users | | schema | string | yes | users | | distinct_id_column | string | yes | - | | identity_fields | array | yes | - | | property_fields | array | yes | - | | event_name | string | yes | - | | time_column | string | yes | - | | time_free | boolean | no | false | | detail_id_column | string | no | - | | item_id_column | string | no | - | | item_type_column | string | no | - | | skip_error_record | boolean | no | false | | instant_events | array | no | - | | distinct_id_by_identities | boolean | no | false | | null_as_profile_unset | boolean | no | false | | common-options | | no | - | ## Parameter Interpretation ### server_url [string] SensorsData data sink address, the format is `https://${host}:8106/sa?project=${project}` ### bulk_size [int] Threshold for the triggering flush operation in SensorsData SDK. When the memory cache queue reaches this value, the data in the cache will be sent. The default value is 50. ### max_cache_row_size [int] Maximum cache refresh size for SensorsData SDK. If it exceeds this value, the flush operation will be triggered immediately. The default value is 0, which depends on bulkSize. ### consumer [string] When consumer is set to "console", the data will be output to console instead of send to the server. ### entity_name [string] The entity name of the SensorsData entity data model to receive the data records. 
### record_type [string] The record type of the SensorsData entity data model. ### schema [string] The schema name of the SensorsData entity data model. ### distinct_id_column [string] The distinct id column of the user entity. ### identity_fields [array] The identity fields of the user entity. ### property_fields [array] The property fields of the data record. Supported types: - BOOLEAN - DECIMAL - INT - BIGINT - FLOAT - DOUBLE - NUMBER - STRING - DATE - TIMESTAMP - LIST - LIST_COMMA - LIST_SEMICOLON ### event_name [string] Currently, two formats are supported: 1. Fill in the name of the event record. 2. Use the value of a field from upstream data as the event name, the format is `${your field name}`, where event name is the value of the columns of the upstream data. For example, upstream data is the following: | name | prop1 | prop2 | |----------|-------|---------------| | Purchase | 16 | data-example1 | | Order | 23 | data-example2 | If `${name}` is set as the event name, the event name of the first row is "Purchase", and the event name of the second row is "Order". ### time_column [string] The time column of the event record. ### time_free [boolean] Enable historical data mode. ### detail_id_column [string] The detail id column of the user entity. ### item_id_column [string] The item id column of the item entity. ### item_type_column [string] The item type column of the item entity. ### skip_error_record [boolean] Whether to ignore the error in translating the data record. ### instant_events [array] Given a list of event names, mark the event as an instant event. ### distinct_id_by_identities [boolean] When enabled, this option automatically fills the distinct_id using the values from identity_fields columns when the distinct_id_column value is null. This ensures that SensorsData receives a non-null distinct_id value as required.
### null_as_profile_unset [boolean] When enabled, null values in profile properties will be converted to profile unset operations, effectively removing the existing value from the profile. ### common options Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details ## Examples ### Basic Event Tracking ```hocon sink { SensorsData { server_url = "http://10.1.136.63:8106/sa?project=default" time_free = true record_type = events schema = events event_name = "$AppStart" time_column = col_date distinct_id_column = col_id identity_fields = [ { source = col_id, target = "$identity_login_id" } { source = col_id, target = "$identity_distinct_id" } ] property_fields = [ { target = prop1, source = col1, type = INT } { target = prop2, source = col2, type = BIGINT } { target = prop3, source = col3, type = STRING } { target = prop4, source = col4, type = BOOLEAN } ] skip_error_record = true } } ``` ### Dynamic Event Names ```hocon sink { SensorsData { server_url = "http://10.1.136.63:8106/sa?project=default" time_free = true record_type = events schema = events event_name = "${event_type}" # Use dynamic event name from data time_column = event_timestamp distinct_id_column = user_id identity_fields = [ { source = user_id, target = "$identity_login_id" } { source = user_id, target = "$identity_distinct_id" } ] property_fields = [ { target = "price", source = amount, type = DECIMAL } { target = "category", source = product_category, type = STRING } { target = "device", source = device_type, type = STRING } ] instant_events = ["$AppStart", "$AppEnd"] # Mark specific events as instant } } ``` ### Profile Property Updates ```hocon sink { SensorsData { server_url = "http://10.1.136.63:8106/sa?project=default" time_free = true entity_name = users record_type = profile schema = users distinct_id_column = user_id identity_fields = [ { source = email, target = "$identity_email" } { source = phone, target = "$identity_phone" } ] property_fields = [ { 
target = "name", source = full_name, type = STRING } { target = "age", source = user_age, type = INT } { target = "gender", source = user_gender, type = STRING } { target = "location", source = user_location, type = STRING } ] null_as_profile_unset = true # Remove properties when null } } ``` ### Item Tracking ```hocon sink { SensorsData { server_url = "http://10.1.136.63:8106/sa?project=default" time_free = true record_type = items schema = items event_name = "$ItemViewed" time_column = view_time distinct_id_column = user_id identity_fields = [ { source = user_id, target = "$identity_login_id" } ] property_fields = [ { target = "view_duration", source = duration, type = INT } { target = "referrer", source = referrer_url, type = STRING } ] item_id_column = product_id item_type_column = product_type } } ``` ### Console Output (for Testing) ```hocon sink { SensorsData { server_url = "http://10.1.136.63:8106/sa?project=default" consumer = "console" # Output to console instead of sending to server record_type = events schema = events event_name = "$TestEvent" time_column = timestamp distinct_id_column = test_id property_fields = [ { target = "test", source = test_field, type = STRING } ] } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Sentry.md ================================================ import ChangeLog from '../changelog/connector-sentry.md'; # Sentry ## Description Write message to Sentry. 
## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------------|---------|----------|---------------| | dsn | string | yes | - | | env | string | no | - | | release | string | no | - | | cacheDirPath | string | no | - | | enableExternalConfiguration | boolean | no | - | | maxCacheItems | number | no | - | | flushTimeoutMillis | number | no | - | | maxQueueSize | number | no | - | | common-options | | no | - | ### dsn [string] The DSN tells the SDK where to send the events to. ### env [string] specify the environment ### release [string] specify the release ### cacheDirPath [string] the cache dir path for caching offline events ### enableExternalConfiguration [boolean] if loading properties from external sources is enabled. ### maxCacheItems [number] The max cache items for capping the number of events Default is 30 ### flushTimeoutMillis [number] Controls how many milliseconds to wait before flushing down. Sentry SDKs cache events from a background queue and this queue is given a certain amount of time to drain pending events Default is 15000 = 15s ### maxQueueSize [number] Max queue size before flushing events/envelopes to the disk ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details ## Example ``` Sentry { dsn = "https://xxx@sentry.xxx.com:9999/6" enableExternalConfiguration = true maxCacheItems = 1000 env = prod } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/SftpFile.md ================================================ import ChangeLog from '../changelog/connector-file-sftp.md'; # SftpFile > Sftp file sink connector ## Description Output data to Sftp. :::tip If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x.
If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. ::: ## Key features - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) By default, we use 2PC commit to ensure `exactly-once` - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] canal_json - [x] debezium_json - [x] maxwell_json ## Options | name | type | required | default value | remarks | |---------------------------------------|---------|----------|--------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | host | string | yes | - | | | port | int | yes | - | | | user | string | yes | - | | | password | string | yes | - | | | path | string | yes | - | | | tmp_path | string | yes | /tmp/seatunnel | The result file will write to a tmp path first and then use `mv` to submit tmp dir to target dir. Need a FTP dir. | | custom_filename | boolean | no | false | Whether you need custom the filename | | file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | | filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | | file_format_type | string | no | "csv" | | | filename_extension | string | no | - | Override the default file name extensions with custom file name extensions. E.g. 
`.xml`, `.json`, `.dat`, `.customtype` | | field_delimiter | string | no | '\001' for text and ',' for csv | Only used when file_format_type is text or csv | | row_delimiter | string | no | "\n" | Only used when file_format_type is `text`, `csv` and `json` | | have_partition | boolean | no | false | Whether you need processing partitions. | | partition_by | array | no | - | Only used when have_partition is true | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used when have_partition is true | | is_partition_field_write_in_file | boolean | no | false | Only used when have_partition is true | | sink_columns | array | no | | When this parameter is empty, all fields are sink columns | | is_enable_transaction | boolean | no | true | | | batch_size | int | no | 1000000 | | | compress_codec | string | no | none | | | common-options | object | no | - | | | max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | | sheet_max_rows | int | no | 1048576 | Only used when file_format_type is excel. | | sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. | | csv_string_quote_mode | enum | no | MINIMAL | Only used when file_format is csv. | | xml_root_tag | string | no | RECORDS | Only used when file_format is xml. | | xml_row_tag | string | no | RECORD | Only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Only used when file_format is xml. | | single_file_mode | boolean | no | false | Each parallelism will only output one file. When this parameter is turned on, batch_size will not take effect. The output file name does not have a file block suffix. | | create_empty_file_when_no_data | boolean | no | false | When there is no data synchronization upstream, the corresponding data files are still generated. | | parquet_avro_write_timestamp_as_int96 | boolean | no | false | Only used when file_format is parquet.
| | enable_header_write | boolean | no | false | Only used when file_format_type is text,csv.
false:don't write header,true:write header. | | parquet_avro_write_fixed_as_int96 | array | no | - | Only used when file_format is parquet. | | encoding | string | no | "UTF-8" | Only used when file_format_type is json,text,csv,xml. | | schema_save_mode | string | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Existing dir processing method | | data_save_mode | string | no | APPEND_DATA | Existing data processing method | | merge_update_event | boolean | no | false | Only used when file_format_type is canal_json,debezium_json or maxwell_json. When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data | ### host [string] The target sftp host is required ### port [int] The target sftp port is required ### user [string] The target sftp user is required ### password [string] The target sftp password is required ### path [string] The target dir path is required. ### custom_filename [boolean] Whether custom the filename ### file_name_expression [string] Only used when `custom_filename` is `true` `file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, `${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. ### filename_time_format [string] Only used when `custom_filename` is `true` When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . 
The commonly used time formats are listed as follows: | Symbol | Description | |--------|--------------------| | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | ### file_format_type [string] We supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `canal_json` `debezium_json` `maxwell_json` Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. ### field_delimiter [string] The separator between columns in a row of data. Only needed by `text` and `csv` file format. ### row_delimiter [string] The separator between rows in a file. Only needed by `text`, `csv` and `json` file format. ### have_partition [boolean] Whether you need processing partitions. ### partition_by [array] Only used when `have_partition` is `true`. Partition data based on selected fields. ### partition_dir_expression [string] Only used when `have_partition` is `true`. If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. ### is_partition_field_write_in_file [boolean] Only used when `have_partition` is `true`. If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. For example, if you want to write a Hive Data File, Its value should be `false`. ### sink_columns [array] Which columns need be wrote to file, default value is all the columns get from `Transform` or `Source`. The order of the fields determines the order in which the file is actually written. 
### is_enable_transaction [boolean] If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. Only support `true` now. ### batch_size [int] The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. ### compress_codec [string] The compress codec of files and the details that supported as the following shown: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc: `lzo` `snappy` `lz4` `zlib` `none` - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` Tips: excel type does not support any compression format ### common options Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ### max_rows_in_memory When File Format is Excel,The maximum number of data items that can be cached in the memory. ### sheet_max_rows When file format is Excel, the maximum number of rows per sheet. ### sheet_name Writer the sheet of the workbook ### csv_string_quote_mode [string] When File Format is CSV,The string quote mode of CSV. - ALL: All String fields will be quoted. - MINIMAL: Quotes fields which contain special characters such as a the field delimiter, quote character or any of the characters in the line separator string. - NONE: Never quotes fields. When the delimiter occurs in data, the printer prefixes it with the escape character. If the escape character is not set, format validation throws an exception. 
### xml_root_tag [string] Specifies the tag name of the root element within the XML file. ### xml_row_tag [string] Specifies the tag name of the data rows within the XML file. ### xml_use_attr_format [boolean] Specifies Whether to process data using the tag attribute format. ### parquet_avro_write_timestamp_as_int96 [boolean] Support writing Parquet INT96 from a timestamp, only valid for parquet files. ### parquet_avro_write_fixed_as_int96 [array] Support writing Parquet INT96 from a 12-byte field, only valid for parquet files. ### enable_header_write [boolean] Only used when file_format_type is text,csv.false:don't write header,true:write header. ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to write. This param will be parsed by `Charset.forName(encoding)`. ### schema_save_mode [string] Existing dir processing method. - RECREATE_SCHEMA: will create when the dir does not exist, delete and recreate when the dir is exist - CREATE_SCHEMA_WHEN_NOT_EXIST: will create when the dir does not exist, skipped when the dir is exist - ERROR_WHEN_SCHEMA_NOT_EXIST: error will be reported when the dir does not exist - IGNORE :Ignore the treatment of the table ### data_save_mode [string] Existing data processing method. - DROP_DATA: preserve dir and delete data files - APPEND_DATA: preserve dir, preserve data files - ERROR_WHEN_DATA_EXISTS: when there is data files, an error is reported ### merge_update_event [boolean] Only used when file_format_type is canal_json,debezium_json or maxwell_json. 
When value is true, the UPDATE_AFTER and UPDATE_BEFORE event will be merged into UPDATE event data ## Example For text file format with `have_partition` and `custom_filename` and `sink_columns` ```bash SftpFile { host = "xxx.xxx.xxx.xxx" port = 22 user = "username" password = "password" path = "/data/sftp/seatunnel/job1" tmp_path = "/data/sftp/seatunnel/tmp" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true } ``` When our source end is multiple tables, and wants different expressions to different directory, we can configure this way ```hocon SftpFile { host = "xxx.xxx.xxx.xxx" port = 22 user = "username" password = "password" path = "/data/sftp/seatunnel/job1/${table_name}" tmp_path = "/data/sftp/seatunnel/tmp" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true schema_save_mode=RECREATE_SCHEMA data_save_mode=DROP_DATA } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Slack.md ================================================ import ChangeLog from '../changelog/connector-slack.md'; # Slack > Slack sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) ## Description Used to send data to Slack Channel. Supports both streaming and batch mode. > For example, if the data from upstream is [`age: 12, name: huan`], the content sent to the Slack channel is the following: `{"name":"huan","age":12}` ## Data Type Mapping All data types are mapped to string. ## Options | Name | Type | Required | Default | Description | |----------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------| | webhooks_url | String | Yes | - | Slack webhook url | | oauth_token | String | Yes | - | Slack oauth token used for the actual authentication | | slack_channel | String | Yes | - | slack channel for data write | | common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | ## Task Example ### Simple ```hocon sink { SlackSink { webhooks_url = "https://hooks.slack.com/services/xxxxxxxxxxxx/xxxxxxxxxxxx/xxxxxxxxxxxxxxxx" oauth_token = "xoxp-xxxxxxxxxx-xxxxxxxx-xxxxxxxxx-xxxxxxxxxxx" slack_channel = "channel name" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Sls.md ================================================ import ChangeLog from '../changelog/connector-sls.md'; # Sls > Sls sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) ## Description Sink connector for Aliyun Sls. ## Supported DataSource Info In order to use the Sls connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Maven | |------------|--------------------|-----------------------------------------------------------------------------------| | Sls | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-sls) | ## Sink Options | Name | Type | Required | Default | Description | |-------------------------------------|---------|----------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------| | project | String | Yes | - | [Aliyun Sls Project](https://help.aliyun.com/zh/sls/user-guide/manage-a-project?spm=a2c4g.11186623.0.0.6f9755ebyfaYSl) | | logstore | String | Yes | - | [Aliyun Sls Logstore](https://help.aliyun.com/zh/sls/user-guide/manage-a-logstore?spm=a2c4g.11186623.0.0.13137c08nfuiBC) | | endpoint | String | Yes | - | [Aliyun Access Endpoint](https://help.aliyun.com/zh/sls/developer-reference/api-sls-2020-12-30-endpoint?spm=a2c4g.11186623.0.0.548945a8UyJULa) | | access_key_id | String | Yes | - | [Aliyun AccessKey ID](https://help.aliyun.com/zh/ram/user-guide/create-an-accesskey-pair?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#task-2245479) | | access_key_secret | String | Yes | - | [Aliyun AccessKey Secret](https://help.aliyun.com/zh/ram/user-guide/create-an-accesskey-pair?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#task-2245479) | | source | String | No | SeaTunnel-Source | Data Source marking in sls | | topic | String | No | SeaTunnel-Topic | Data topic marking in sls | ## Task Example ### Simple > This example writes
data to the sls's logstore1. If you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../getting-started/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) to run this job. [Create RAM user and authorization](https://help.aliyun.com/zh/sls/create-a-ram-user-and-authorize-the-ram-user-to-access-log-service?spm=a2c4g.11186623.0.i4). Please ensure the RAM user has sufficient permissions to perform these operations; for reference, see [RAM Custom Authorization Example](https://help.aliyun.com/zh/sls/use-custom-policies-to-grant-permissions-to-a-ram-user?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#reference-s3z-m1l-z2b) ```hocon # Defining the runtime environment env { parallelism = 2 job.mode = "STREAMING" checkpoint.interval = 30000 } source { FakeSource { row.num = 10 map.size = 10 array.size = 10 bytes.length = 10 string.length = 10 schema = { fields = { id = "int" name = "string" description = "string" weight = "string" } } } } sink { Sls { endpoint = "cn-hangzhou-intranet.log.aliyuncs.com" project = "project1" logstore = "logstore1" access_key_id = "xxxxxxxxxxxxxxxxxxxxxxxx" access_key_secret = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Snowflake.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Snowflake > JDBC Snowflake Sink Connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) ## Description Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing. ## Supported DataSource list | Datasource | Supported Versions | Driver | Url | Maven | |------------|----------------------------------------------------------|-------------------------------------------|------------------------------------------------------------|-----------------------------------------------------------------------------| | snowflake | Different dependency version has different driver class. | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | [Download](https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc) | ## Database dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
> For example Snowflake datasource: cp snowflake-connector-java-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping | Snowflake Data Type | SeaTunnel Data Type | |-----------------------------------------------------------------------------|---------------------| | BOOLEAN | BOOLEAN | | TINYINT
SMALLINT
BYTEINT
| SHORT_TYPE | | INT
INTEGER
| INT | | BIGINT | LONG | | DECIMAL
NUMERIC
NUMBER
| DECIMAL(x,y) | | DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | | REAL
FLOAT4 | FLOAT | | DOUBLE
DOUBLE PRECISION
FLOAT8
FLOAT
| DOUBLE | | CHAR
CHARACTER
VARCHAR
STRING
TEXT
VARIANT
OBJECT | STRING | | DATE | DATE | | TIME | TIME | | DATETIME
TIMESTAMP
TIMESTAMP_LTZ
TIMESTAMP_NTZ
TIMESTAMP_TZ | TIMESTAMP | | BINARY
VARBINARY
GEOGRAPHY
GEOMETRY | BYTES | ## Options | Name | Type | Required | Default | Description | |-------------------------------------------|---------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:snowflake://.snowflakecomputing.com | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use Snowflake the value is `net.snowflake.client.jdbc.SnowflakeDriver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority | | database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | | max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | | batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | | max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | | transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | | auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | | properties | Map | No | - | Additional connection configuration parameters,when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | | enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import | ## tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. > ## Task Example ### simple > This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your snowflake database. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../getting-started/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) to run this job. 
``` # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } # If you would like to get more information about how to configure seatunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/source } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2 } sink { jdbc { url = "jdbc:snowflake://.snowflakecomputing.com" driver = "net.snowflake.client.jdbc.SnowflakeDriver" username = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink } ``` ### CDC(Change data capture) event > CDC change data is also supported by us In this case, you need config database, table and primary_keys. ``` sink { jdbc { url = "jdbc:snowflake://.snowflakecomputing.com" driver = "net.snowflake.client.jdbc.SnowflakeDriver" username = "root" password = "123456" generate_sink_sql = true # You need to configure both database and table database = test table = sink_table primary_keys = ["id","name"] } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Socket.md ================================================ import ChangeLog from '../changelog/connector-socket.md'; # Socket > Socket sink connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Description Used to send data to Socket Server. Supports both streaming and batch mode. > For example, if the data from upstream is [`age: 17, name: jared`], the content sent to the socket server is the following: `{"name":"jared","age":17}` ## Sink Options | Name | Type | Required | Default | Description | |----------------|---------|----------|---------|-----------------------------------------------------------------------------------------------------------------| | host | String | Yes | | socket server host | | port | Integer | Yes | | socket server port | | max_retries | Integer | No | 3 | The number of retries to send record failed | | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | ## Task Example > This is randomly generated data written to the Socket side ```hocon env { parallelism = 1 job.mode = "STREAMING" } source { FakeSource { plugin_output = "fake" schema = { fields { name = "string" age = "int" } } } } sink { Socket { host = "localhost" port = 9999 } } ``` * Start a port listening ```shell nc -l -v 9999 ``` * Start a SeaTunnel task * Socket Server Console print data ```text {"name":"jared","age":17} ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/SqlServer.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # SQL Server > JDBC SQL Server Sink Connector ## Support SQL Server Version - server:2008 (Or later version for information only) ## Support Those engines > Spark
> Flink
> SeaTunnel Zeta
## Description Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once semantics (using XA transaction guarantee). ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) > Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is > support `Xa transactions`. You can set `is_exactly_once=true` to enable it. ## Supported DataSource Info | Datasource | Supported Versions | Driver | Url | Maven | |------------|-------------------------|----------------------------------------------|---------------------------------|-----------------------------------------------------------------------------------| | SQL Server | support version >= 2008 | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433 | [Download](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) | ## Database dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
> For example SQL Server datasource: cp mssql-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping | SQLserver Data Type | SeaTunnel Data Type | |-----------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------| | BIT | BOOLEAN | | TINYINT
SMALLINT | SHORT | | INTEGER | INT | | BIGINT | LONG | | DECIMAL
NUMERIC
MONEY
SMALLMONEY | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to right of the
decimal point.))) | | REAL | FLOAT | | FLOAT | DOUBLE | | CHAR
NCHAR
VARCHAR
NTEXT
NVARCHAR
TEXT | STRING | | DATE | LOCAL_DATE | | TIME | LOCAL_TIME | | DATETIME
DATETIME2
SMALLDATETIME
DATETIMEOFFSET | LOCAL_DATE_TIME | | TIMESTAMP
BINARY
VARBINARY
IMAGE
UNKNOWN | Not supported yet | ## Sink Options | Name | Type | Required | Default | Description | |-------------------------------------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:sqlserver://localhost:1433;databaseName=mydatabase | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use sqlServer the value is `com.microsoft.sqlserver.jdbc.SQLServerDriver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority | | database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | | max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | | batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | | is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | | generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | | xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, SqlServer is `com.microsoft.sqlserver.jdbc.SQLServerXADataSource`, and
please refer to appendix for other data sources | | max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | | transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | | auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | | common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | | enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import | ## tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. ## Task Example ### simple > This is one that reads Sqlserver data and inserts it directly into another table ``` env { # You can set engine configuration here parallelism = 10 } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** Jdbc { driver = com.microsoft.sqlserver.jdbc.SQLServerDriver url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = SA password = "Y.sa123456" query = "select * from column_type_test.dbo.full_types_jdbc" # Parallel sharding reads fields partition_column = "id" # Number of fragments partition_num = 10 } # If you would like to get more information about how to configure seatunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/source/Jdbc } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2/sql } sink { Jdbc { driver = com.microsoft.sqlserver.jdbc.SQLServerDriver url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = SA password = "Y.sa123456" query = "insert into full_types_jdbc_sink( id, val_char, val_varchar, val_text, val_nchar, val_nvarchar, val_ntext, val_decimal, val_numeric, val_float, val_real, val_smallmoney, val_money, 
val_bit, val_tinyint, val_smallint, val_int, val_bigint, val_date, val_time, val_datetime2, val_datetime, val_smalldatetime ) values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )" } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink/Jdbc } ``` ### CDC(Change data capture) event > CDC change data is also supported by us In this case, you need config database, table and primary_keys. ``` Jdbc { plugin_input = "customers" driver = com.microsoft.sqlserver.jdbc.SQLServerDriver url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = SA password = "Y.sa123456" generate_sink_sql = true database = "column_type_test" table = "dbo.full_types_sink" batch_size = 100 primary_keys = ["id"] } ``` ### Exactly Once Sink > Transactional writes may be slower but more accurate to the data ``` Jdbc { driver = com.microsoft.sqlserver.jdbc.SQLServerDriver url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = SA password = "Y.sa123456" query = "insert into full_types_jdbc_sink( id, val_char, val_varchar, val_text, val_nchar, val_nvarchar, val_ntext, val_decimal, val_numeric, val_float, val_real, val_smallmoney, val_money, val_bit, val_tinyint, val_smallint, val_int, val_bigint, val_date, val_time, val_datetime2, val_datetime, val_smalldatetime ) values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? 
)" is_exactly_once = "true" xa_data_source_class_name = "com.microsoft.sqlserver.jdbc.SQLServerXADataSource" } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink/Jdbc ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/StarRocks.md ================================================ import ChangeLog from '../changelog/connector-starrocks.md'; # StarRocks > StarRocks sink connector ## Support These Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## Description Used to send data to StarRocks. Both support streaming and batch mode. The internal implementation of StarRocks sink connector is cached and imported by stream load in batches. ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Sink Options | Name | Type | Required | Default | Description | |-----------------------------|---------|----------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | nodeUrls | list | yes | - | `StarRocks` cluster address, the format is `["fe_ip:fe_http_port", ...]` | | base-url | string | yes | - | The JDBC URL like `jdbc:mysql://localhost:9030/` or `jdbc:mysql://localhost:9030` or `jdbc:mysql://localhost:9030/db` | | username | string | yes | - | `StarRocks` user username | | password | string | yes | - | `StarRocks` user password | | database | string | yes | - | The name of StarRocks database | | table | string | no | - | The name of StarRocks table, If not set, the table name will be the name of the upstream table | | labelPrefix | string | no | - | The prefix of StarRocks stream load label | | batch_max_rows | long | no | 1024 | For batch writing, when the number of buffers reaches the number of 
`batch_max_rows` or the byte size of `batch_max_bytes` or the time reaches `checkpoint.interval`, the data will be flushed into the StarRocks | | batch_max_bytes | int | no | 5 * 1024 * 1024 | For batch writing, when the number of buffers reaches the number of `batch_max_rows` or the byte size of `batch_max_bytes` or the time reaches `checkpoint.interval`, the data will be flushed into the StarRocks | | max_retries | int | no | - | The number of retries to flush failed | | retry_backoff_multiplier_ms | int | no | - | Using as a multiplier for generating the next delay for backoff | | max_retry_backoff_ms | int | no | - | The amount of time to wait before attempting to retry a request to `StarRocks` | | enable_upsert_delete | boolean | no | false | Whether to enable upsert/delete, only supports PrimaryKey model. | | save_mode_create_template | string | no | see below | see below | | starrocks.config | map | no | - | The parameter of the stream load `data_desc` | | http_socket_timeout_ms | int | no | 180000 | Set http socket timeout, default is 3 minutes. | | schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. | | data_save_mode | Enum | no | APPEND_DATA | Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. | | custom_sql | String | no | - | When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. | ### save_mode_create_template We use templates to automatically create starrocks tables, which will create corresponding table creation statements based on the type of upstream data and schema type, and the default template can be modified according to the situation. Only work on multi-table mode at now. 
Default template: ```sql CREATE TABLE IF NOT EXISTS `${database}`.`${table}` ( ${rowtype_primary_key}, ${rowtype_fields} ) ENGINE=OLAP PRIMARY KEY (${rowtype_primary_key}) COMMENT '${comment}' DISTRIBUTED BY HASH (${rowtype_primary_key})PROPERTIES ( "replication_num" = "1" ) ``` If a custom field is filled in the template, such as adding an `id` field ```sql CREATE TABLE IF NOT EXISTS `${database}`.`${table}` ( id, ${rowtype_fields} ) ENGINE = OLAP COMMENT '${comment}' DISTRIBUTED BY HASH (${rowtype_primary_key}) PROPERTIES ( "replication_num" = "1" ); ``` The connector will automatically obtain the corresponding type from the upstream to complete the filling, and remove the id field from `rowtype_fields`. This method can be used to customize the modification of field types and attributes. You can use the following placeholders - database: Used to get the database in the upstream schema - table_name: Used to get the table name in the upstream schema - rowtype_fields: Used to get all the fields in the upstream schema, we will automatically map to the field description of StarRocks - rowtype_primary_key: Used to get the primary key in the upstream schema (maybe a list) - rowtype_unique_key: Used to get the unique key in the upstream schema (maybe a list) - comment: Used to get the table comment in the upstream schema ### table [string] Use `database` and this `table-name` auto-generate sql and receive upstream input datas write to database. This option is mutually exclusive with `query` and has a higher priority. The table parameter can fill in the name of an unwilling table, which will eventually be used as the table name of the creation table, and supports variables (`${table_name}`, `${schema_name}`). Replacement rules: `${schema_name}` will replace the SCHEMA name passed to the target side, and `${table_name}` will replace the name of the table passed to the table at the target side. for example: 1. test_${schema_name}_${table_name}_test 2. sink_sinktable 3. 
ss_${table_name} ### schema_save_mode [Enum] Before the synchronous task is turned on, different treatment schemes are selected for the existing surface structure of the target side. Option introduction: `RECREATE_SCHEMA` :Will create when the table does not exist, delete and rebuild when the table is saved `CREATE_SCHEMA_WHEN_NOT_EXIST` :Will Created when the table does not exist, skipped when the table is saved `ERROR_WHEN_SCHEMA_NOT_EXIST` :Error will be reported when the table does not exist `IGNORE` :Ignore the treatment of the table ### data_save_mode [Enum] Before the synchronous task is turned on, different processing schemes are selected for data existing data on the target side. Option introduction: `DROP_DATA`: Preserve database structure and delete data `APPEND_DATA`:Preserve database structure, preserve data `CUSTOM_PROCESSING`:User defined processing `ERROR_WHEN_DATA_EXISTS`:When there is data, an error is reported ### custom_sql [String] When data_save_mode selects CUSTOM_PROCESSING, you should fill in the CUSTOM_SQL parameter. This parameter usually fills in a SQL that can be executed. SQL will be executed before synchronization tasks. 
## Data Type Mapping | StarRocks Data type | SeaTunnel Data type | |---------------------|---------------------| | BOOLEAN | BOOLEAN | | TINYINT | TINYINT | | SMALLINT | SMALLINT | | INT | INT | | BIGINT | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | DATE | STRING | | TIME | STRING | | DATETIME | STRING | | STRING | STRING | | ARRAY | STRING | | MAP | STRING | | BYTES | STRING | #### Supported import data formats The supported formats include CSV and JSON ## Task Example ### Simple > The following example describes writing multiple data types to StarRocks, and users need to create corresponding tables downstream ```hocon env { parallelism = 1 job.mode = "BATCH" checkpoint.interval = 10000 } source { FakeSource { row.num = 10 map.size = 10 array.size = 10 bytes.length = 10 string.length = 10 schema = { fields { c_map = "map>" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(16, 1)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] username = root password = "" database = "test" table = "e2e_table_sink" batch_max_rows = 10 starrocks.config = { format = "JSON" strip_outer_array = true } } } ``` ### Support write cdc changelog event(INSERT/UPDATE/DELETE) ```hocon sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] username = root password = "" database = "test" table = "e2e_table_sink" ... // Support upsert/delete event synchronization (enable_upsert_delete=true), only supports PrimaryKey model. 
enable_upsert_delete = true } } ``` ### Use JSON format to import data ``` sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] base-url = "jdbc:mysql://e2e_starRocksdb:9030/" username = root password = "" database = "test" table = "e2e_table_sink" batch_max_rows = 10 starrocks.config = { format = "JSON" strip_outer_array = true } } } ``` ### Use CSV format to import data ``` sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] base-url = "jdbc:mysql://e2e_starRocksdb:9030/" username = root password = "" database = "test" table = "e2e_table_sink" batch_max_rows = 10 starrocks.config = { format = "CSV" column_separator = "\\x01" row_delimiter = "\\x02" } } } ``` ### Use save_mode function ``` sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] base-url = "jdbc:mysql://e2e_starRocksdb:9030/" username = root password = "" database = "test" table = "test_${schema_name}_${table_name}" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" batch_max_rows = 10 starrocks.config = { format = "CSV" column_separator = "\\x01" row_delimiter = "\\x02" } } } ``` ### Multiple table #### example1 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { base-url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] base-url = "jdbc:mysql://e2e_starRocksdb:9030/" username = root password = "" database = "${database_name}_test" table = "${table_name}_test" ... // Support upsert/delete event synchronization (enable_upsert_delete=true), only supports PrimaryKey model. 
enable_upsert_delete = true } } ``` #### example2 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = oracle.jdbc.driver.OracleDriver url = "jdbc:oracle:thin:@localhost:1521/XE" user = testUser password = testPassword table_list = [ { table_path = "TESTSCHEMA.TABLE_1" }, { table_path = "TESTSCHEMA.TABLE_2" } ] } } transform { } sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] base-url = "jdbc:mysql://e2e_starRocksdb:9030/" username = root password = "" database = "${schema_name}_test" table = "${table_name}_test" ... // Support upsert/delete event synchronization (enable_upsert_delete=true), only supports PrimaryKey model. enable_upsert_delete = true } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/TDengine.md ================================================ import ChangeLog from '../changelog/connector-tdengine.md'; # TDengine > TDengine sink connector ## Description Used to write data to TDengine. You need to create stable before running seatunnel task ## Key features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |--------------|--------|----------|---------------| | url | string | yes | - | | username | string | yes | - | | password | string | yes | - | | database | string | yes | | | stable | string | yes | - | | timezone | string | no | UTC | | write_columns| list | no | - | ### url [string] the url of the TDengine when you select the TDengine e.g. 
``` jdbc:TAOS-RS://localhost:6041/ ``` ### username [string] the username of the TDengine when you select ### password [string] the password of the TDengine when you select ### database [string] the database of the TDengine when you select ### stable [string] the stable of the TDengine when you select ### timezone [string] the timezone of the TDengine server; it is important for the ts field ### write_columns [list] The field names to be inserted into TDengine. If not set, all fields will be written. The plugin will automatically append TAGS columns, so please do not include TAGS columns in this option. ## Example ### sink ```hocon sink { TDengine { url : "jdbc:TAOS-RS://localhost:6041/" username : "root" password : "taosdata" database : "power2" stable : "meters2" timezone: UTC write_columns: ["ts", "voltage", "current", "power"] } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Tablestore.md ================================================ import ChangeLog from '../changelog/connector-tablestore.md'; # Tablestore > Tablestore sink connector ## Description Write data to `Tablestore` ## Key features - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-------------------|--------|----------|---------------| | end_point | string | yes | - | | instance_name | string | yes | - | | access_key_id | string | yes | - | | access_key_secret | string | yes | - | | table | string | yes | - | | primary_keys | array | yes | - | | batch_size | string | no | 25 | | common-options | config | no | - | ### end_point [string] endPoint to write to Tablestore. ### instance_name [string] The instance name of Tablestore. ### access_key_id [string] The access id of Tablestore. ### access_key_secret [string] The access secret of Tablestore. ### table [string] The table of Tablestore. ### primary_keys [array] The primary keys of Tablestore. 
### common options [ config ] Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details. ## Example ```bash Tablestore { end_point = "xxxx" instance_name = "xxxx" access_key_id = "xxxx" access_key_secret = "xxxx" table = "sink" primary_keys = ["pk_1","pk_2","pk_3","pk_4"] } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Typesense.md ================================================ import ChangeLog from '../changelog/connector-typesense.md'; # Typesense ## Description Outputs data to `Typesense`. ## Key Features - [ ] [Exactly Once](../../introduction/concepts/connector-v2-features.md) - [x] [CDC](../../introduction/concepts/connector-v2-features.md) ## Options | Name | Type | Required | Default Value | |------------------|--------|----------|------------------------------| | hosts | array | Yes | - | | collection | string | Yes | - | | schema_save_mode | string | Yes | CREATE_SCHEMA_WHEN_NOT_EXIST | | data_save_mode | string | Yes | APPEND_DATA | | primary_keys | array | No | | | key_delimiter | string | No | `_` | | api_key | string | No | | | max_retry_count | int | No | 3 | | max_batch_size | int | No | 10 | | common-options | | No | - | ### hosts [array] The access address for Typesense, formatted as `host:port`, e.g., `["typesense-01:8108"]`. ### collection [string] The name of the collection to write to, e.g., "seatunnel". ### primary_keys [array] Primary key fields used to generate the document `id`. ### key_delimiter [string] Sets the delimiter for composite keys (default is `_`). ### api_key [string] The `api_key` for secure access to Typesense. ### max_retry_count [int] The maximum number of retry attempts for batch requests. ### max_batch_size [int] The maximum size of document batches. ### common options Common parameters for Sink plugins. Refer to [Common Sink Options](../common-options/sink-common-options.md) for more details. 
### schema_save_mode Choose how to handle the target-side schema before starting the synchronization task: - `RECREATE_SCHEMA`: Creates the table if it doesn’t exist, and deletes and recreates it if it does. - `CREATE_SCHEMA_WHEN_NOT_EXIST`: Creates the table if it doesn’t exist, skips creation if it does. - `ERROR_WHEN_SCHEMA_NOT_EXIST`: Throws an error if the table doesn’t exist. ### data_save_mode Choose how to handle existing data on the target side before starting the synchronization task: - `DROP_DATA`: Retains the database structure but deletes the data. - `APPEND_DATA`: Retains both the database structure and the data. - `ERROR_WHEN_DATA_EXISTS`: Throws an error if data exists. ## Example Simple example: ```bash sink { Typesense { plugin_input = "typesense_test_table" hosts = ["localhost:8108"] collection = "typesense_to_typesense_sink_with_query" max_retry_count = 3 max_batch_size = 10 api_key = "xyz" primary_keys = ["num_employees","id"] key_delimiter = "=" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/sink/Vertica.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Vertica > JDBC Vertica Sink Connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once semantics (using XA transaction guarantee). ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://www.vertica.com/download/vertica/client-drivers/) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://www.vertica.com/download/vertica/client-drivers/) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) > Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is > support `Xa transactions`. You can set `is_exactly_once=true` to enable it. ## Supported DataSource Info | Datasource | Supported Versions | Driver | Url | Maven | |------------|----------------------------------------------------------|-------------------------|---------------------------------------|----------------------------------------------------------------------| | Vertica | Different dependency version has different driver class. | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433/vertica | [Download](https://www.vertica.com/download/vertica/client-drivers/) | ## Database Dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
> For example Vertica datasource: cp vertica-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping | Vertica Data Type | SeaTunnel Data Type | |-----------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| | BIT(1)
INT UNSIGNED | BOOLEAN | | TINYINT
TINYINT UNSIGNED
SMALLINT
SMALLINT UNSIGNED
MEDIUMINT
MEDIUMINT UNSIGNED
INT
INTEGER
YEAR | INT | | INT UNSIGNED
INTEGER UNSIGNED
BIGINT | BIGINT | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | | DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | | DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to right of the decimal point.))) | | FLOAT
FLOAT UNSIGNED | FLOAT | | DOUBLE
DOUBLE UNSIGNED | DOUBLE | | CHAR
VARCHAR
TINYTEXT
MEDIUMTEXT
TEXT
LONGTEXT
JSON | STRING | | DATE | DATE | | TIME | TIME | | DATETIME
TIMESTAMP | TIMESTAMP | | TINYBLOB
MEDIUMBLOB
BLOB
LONGBLOB
BINARY
VARBINARY
BIT(n) | BYTES | | GEOMETRY
UNKNOWN | Not supported yet | ## Sink Options | Name | Type | Required | Default | Description | |-------------------------------------------|---------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:vertica://localhost:5433/vertica | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use Vertica the value is `com.vertica.jdbc.Driver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | No | - | Use this SQL to write upstream input data to the database, e.g. `INSERT ...`. `query` has the higher priority | | database | String | No | - | Use this `database` and `table-name` to auto-generate SQL and write the received upstream input data to the database.
This option is mutually exclusive with `query` and has a higher priority. | | table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | | primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | | max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | | batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | | is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | | generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | | xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, vertical is `com.vertical.cj.jdbc.VerticalXADataSource`, and
please refer to appendix for other data sources | | max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | | transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | | auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | | properties | Map | No | - | Additional connection configuration parameters,when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | | enable_upsert | Boolean | No | true | Enable upsert by primary_keys exist, If the task has no key duplicate data, setting this parameter to `false` can speed up data import | ### Tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. ## Task Example ### Simple > This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table, test_table, will also contain 16 rows of data. Before running this job, you need to create the database test and the table test_table in Vertica. If you have not yet installed and deployed SeaTunnel, follow the instructions in [Install SeaTunnel](../../getting-started/locally/deployment.md) to install and deploy SeaTunnel. Then follow the instructions in [Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) to run this job. 
``` # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } # If you would like to get more information about how to configure seatunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/source } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2 } sink { jdbc { url = "jdbc:vertica://localhost:5433/vertica" driver = "com.vertica.jdbc.Driver" username = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink } ``` ### Generate Sink SQL > This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you ``` sink { jdbc { url = "jdbc:vertica://localhost:5433/vertica" driver = "com.vertica.jdbc.Driver" username = "root" password = "123456" # Automatically generate sql statements based on database table names generate_sink_sql = true database = test table = test_table } } ``` ### Exactly-once > For accurate write scene we guarantee accurate once ``` sink { jdbc { url = "jdbc:vertica://localhost:5433/vertica" driver = "com.vertica.jdbc.Driver" max_retries = 0 username = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" is_exactly_once = "true" xa_data_source_class_name = "com.vertical.cj.jdbc.VerticalXADataSource" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Airtable.md 
================================================ import ChangeLog from '../changelog/connector-http-airtable.md'; # Airtable > Airtable source connector ## Description Used to read data from Airtable. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------------|---------|----------|---------------| | token | String | Yes | - | | base_id | String | Yes | - | | table | String | Yes | - | | api_base_url | String | No | https://api.airtable.com | | view | String | No | - | | fields | List | No | - | | filter_by_formula | String | No | - | | max_records | int | No | - | | page_size | int | No | - | | sort | String | No | - | | cell_format | String | No | - | | return_fields_by_field_id | boolean | No | - | | record_metadata | List | No | - | | time_zone | String | No | - | | user_locale | String | No | - | | request_interval_ms | int | No | 220 | | rate_limit_backoff_ms | int | No | 30000 | | rate_limit_max_retries | int | No | 3 | | schema | Config | No | - | | schema.fields | Config | No | - | | format | String | No | text | | content_field | String | No | - | | json_field | Config | No | - | | common-options | config | No | - | ### token [String] Airtable personal access token. You can create one at https://airtable.com/create/tokens. ### base_id [String] The ID of the Airtable base (starts with `app`). ### table [String] The table name or table ID to read from. ### api_base_url [String] Airtable API base URL. Default is `https://api.airtable.com`. 
### view [String] The name or ID of a view in the table. Only records visible in this view will be returned. ### fields [List] A list of field names to include in the response. ### filter_by_formula [String] An Airtable formula to filter records. See [Airtable formula reference](https://support.airtable.com/docs/formula-field-reference). ### max_records [int] Maximum total number of records to return. ### page_size [int] Number of records per page (1-100). ### sort [String] Sort definition as a JSON array, e.g. `[{"field":"Name","direction":"asc"}]`. ### cell_format [String] The format for cell values, either `json` or `string`. ### return_fields_by_field_id [boolean] If true, field keys in the response will be field IDs instead of field names. ### record_metadata [List] Additional record metadata to return, e.g. `["commentCount"]`. ### time_zone [String] The time zone for formatting date/time values. ### user_locale [String] The user locale for formatting values. ### request_interval_ms [int] Minimum interval in milliseconds between API requests. Default 220ms (to stay within Airtable's 5 requests/second limit). ### rate_limit_backoff_ms [int] Base backoff time in milliseconds when receiving a 429 (rate limit) response. Default 30000ms. ### rate_limit_max_retries [int] Maximum number of retries after receiving a 429 response. Default 3. ### schema [Config] #### fields [Config] The schema fields of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). ### format [String] The format of upstream data, supports `json` and `text`, default `text`. ### content_field [String] JsonPath expression to extract data from the response. For Airtable, you typically use `$.records[*].fields` to extract the fields from each record. ### json_field [Config] This parameter helps you configure the schema and must be used with schema. 
### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. ## Example Read from an Airtable table and output raw text: ```hocon source { Airtable { token = "patXXXXXXXX.XXXXXXXX" base_id = "appXXXXXXXX" table = "Shipments" format = "text" max_records = 10 } } ``` Read with schema and extract record fields: ```hocon source { Airtable { token = "patXXXXXXXX.XXXXXXXX" base_id = "appXXXXXXXX" table = "Shipments" content_field = "$.records[*].fields" filter_by_formula = "{Status} = 'Shipped'" schema = { fields { Name = string Status = string Weight = float } } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/AmazonDynamoDB.md ================================================ import ChangeLog from '../changelog/connector-amazondynamodb.md'; # AmazonDynamoDB > AmazonDynamoDB source connector ## Description Read data from Amazon DynamoDB. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------|--------|----------|---------------| | url | string | yes | - | | region | string | yes | - | | access_key_id | string | yes | - | | secret_access_key | string | yes | - | | table | string | yes | - | | schema | config | yes | - | | common-options | | yes | - | | scan_item_limit | | false | - | | parallel_scan_threads | | false | - | ### url [string] The URL to read to Amazon Dynamodb. ### region [string] The region of Amazon Dynamodb. 
### access_key_id [string] The access id of Amazon DynamoDB. ### secret_access_key [string] The access secret of Amazon DynamoDB. ### table [string] The table of Amazon DynamoDB. ### schema [Config] #### fields [config] Amazon DynamoDB is a NoSQL database service that supports key-value storage and document data structures, so there is no way to get the data types automatically. Therefore, the schema must be configured, for example: ``` schema { fields { id = int key_aa = string key_bb = string } } ``` ### common options Source Plugin common parameters, refer to [Source Plugin](../common-options/source-common-options.md) for details ### scan_item_limit The number of items each scan request should return. ### parallel_scan_threads The number of logical segments for a parallel scan. ## Example ```bash Amazondynamodb { url = "http://127.0.0.1:8000" region = "us-east-1" access_key_id = "dummy-key" secret_access_key = "dummy-secret" table = "TableName" schema = { fields { artist = string c_map = "map>" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/AmazonSqs.md ================================================ import ChangeLog from '../changelog/connector-amazonsqs.md'; # AmazonSqs > AmazonSqs source connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description Read data from Amazon SQS. ## Source Options | Name | Type | Required | Default | Description | |-------------------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The Queue URL to read from Amazon SQS. | | region | String | No | - | The AWS region for the SQS service | | schema | Config | No | - | The structure of the data, including field names and field types. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). | | format | String | No | json | Data format. The default format is json. Optional text format, canal-json and debezium-json.If you use json or text format. The default field separator is ", ". If you customize the delimiter, add the "field_delimiter" option.If you use canal format, please refer to [canal-json](../formats/canal-json.md) for details.If you use debezium format, please refer to [debezium-json](../formats/debezium-json.md) for details. | | format_error_handle_way | String | No | fail | The processing method of data format error. 
The default value is fail, and the optional value is (fail, skip). When fail is selected, data format error will block and an exception will be thrown. When skip is selected, data format error will skip this line data. | | field_delimiter | String | No | , | Customize the field delimiter for data format. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ## Task Example ```bash source { AmazonSqs { url = "http://127.0.0.1:4566" region = "us-east-1" format = text field_delimiter = "#" schema = { fields { artist = string c_map = "map>" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2/sql } sink { Console {} } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Cassandra.md ================================================ import ChangeLog from '../changelog/connector-cassandra.md'; # Cassandra > Cassandra source connector ## Description Read data from Apache Cassandra. 
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-------------------|--------|----------|---------------| | host | String | Yes | - | | keyspace | String | Yes | - | | cql | String | Yes | - | | username | String | No | - | | password | String | No | - | | datacenter | String | No | datacenter1 | | consistency_level | String | No | LOCAL_ONE | ### host [string] `Cassandra` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"cassandra1:9042,cassandra2:9042"`. ### keyspace [string] The `Cassandra` keyspace. ### cql [String] The CQL query used to read data through the Cassandra session. ### username [string] `Cassandra` user username. ### password [string] `Cassandra` user password. ### datacenter [String] The `Cassandra` datacenter, default is `datacenter1`. ### consistency_level [String] The `Cassandra` write consistency level, default is `LOCAL_ONE`. ## Examples ```hocon source { Cassandra { host = "localhost:9042" username = "cassandra" password = "cassandra" datacenter = "datacenter1" keyspace = "test" cql = "select * from source_table" plugin_output = "source_table" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Clickhouse.md ================================================ import ChangeLog from '../changelog/connector-clickhouse.md'; # Clickhouse > Clickhouse source connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table read](../../introduction/concepts/connector-v2-features.md) > supports query SQL and can achieve projection effect. ## Description Used to read data from Clickhouse. ## Supported DataSource Info In order to use the Clickhouse connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Dependency | |------------|--------------------|------------------------------------------------------------------------------------------| | Clickhouse | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-clickhouse) | ## Data Type Mapping | Clickhouse Data Type | SeaTunnel Data Type | |-----------------------------------------------------------------------------------------------------------------------------------------------|---------------------| | String / Int128 / UInt128 / Int256 / UInt256 / Point / Ring / Polygon MultiPolygon | STRING | | Int8 / UInt8 / Int16 / UInt16 / Int32 | INT | | UInt64 / Int64 / IntervalYear / IntervalQuarter / IntervalMonth / IntervalWeek / IntervalDay / IntervalHour / IntervalMinute / IntervalSecond | BIGINT | | Float64 | DOUBLE | | Decimal | DECIMAL | | Float32 | FLOAT | | Date | DATE | | DateTime | TIME | | Array | ARRAY | | Map | MAP | ## Source Options | Name | Type | Required | Default | Description | 
|-------------------|--------|----------|------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | host | String | Yes | - | `ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"` . | | username | String | Yes | - | `ClickHouse` user username. | | password | String | Yes | - | `ClickHouse` user password. | | table_list | Array | NO | - | The list of tables to be read. | | clickhouse.config | Map | No | - | In addition to the above mandatory parameters that must be specified by `clickhouse-jdbc` , users can also specify multiple optional parameters, which cover all the [parameters](https://github.com/ClickHouse/clickhouse-jdbc/tree/master/clickhouse-client#configuration) provided by `clickhouse-jdbc`. | | server_time_zone | String | No | ZoneId.systemDefault() | The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. 
| Table list configuration: | Name | Type | Required | Default | Description | |-------------------|--------|----------|------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | table_path | String | NO | - | The path to the full path of table, example: `default.table` | | sql | String | NO | - | The query sql used to search data though Clickhouse server. | | filter_query | String | NO | - | Data filtering in Clickhouse. the format is "field = value", example : filter_query = "id > 2 and type = 1" | | partition_list | Array | NO | - | Table partition list to filter the specified partition. If it is a partitioned table, this field can be configured to filter the data of the specified partition. example: partition_list = ["20250615", "20250616"] | | batch_size | int | NO | 1024 | The maximum rows of data that can be obtained by reading from Clickhouse once. | Note: When this configuration corresponds to a single table, you can flatten the configuration items in table_list to the outer layer. ## Parallel Reader The Clickhouse source connector supports parallel reading of data. For query table mode, the `table_path` parameter is set and the parallel reading is implemented based on the part file of table, which is obtained from the `system.parts` table. For sql mode, the parallel reading is implemented based on the parallelism execution of local table-based queries on each shard of the cluster. If the `sql` parameter specifies a distributed table, the corresponding local table will be automatically converted to execute the query. If the `sql` specifies a local table, the node configured by the `host` parameter will be used as the shard to perform parallelism reading. 
If both the `table_path` and `sql` parameters are set, it will be executed in sql mode, and the `table_path` parameter can be used to better identify the metadata of the table. ## Tips In query table mode, if you don't want to read the entire table, you can specify the `partition_list` or `filter_query` parameter. * `partition_list`: filter the data of the specified partition * `filter_query`: filter the data based on the specified conditions The `batch_size` parameter can be used to control the amount of data read each time to avoid OOM exception when reading a large amount of data. Appropriately increasing this value will help to improve the performance of the reading process. Use `table_path` to replace `sql` for single table reading. ## How to Create a Clickhouse Data Synchronization Jobs ### Single Table The following example demonstrates how to create a data synchronization job that reads data from Clickhouse and prints it on the local client: **Case 1: Parallel reading based on the part read strategy** ```hocon env { job.mode = "BATCH" parallelism = 5 } source { Clickhouse { host = "localhost:8123" username = "xxx" password = "xxx" table_path = "default.table" server_time_zone = "UTC" partition_list = ["20250615", "20250616"] filter_query = "id > 2 and type = 1" batch_size = 1024 clickhouse.config = { "socket_timeout": "300000" } } } # Console printing of the read Clickhouse data sink { Console { parallelism = 1 } } ``` **Case 2: Parallel reading based on the SQL read strategy** > Parallel execution in SQL mode currently only supports single-table and WHERE-condition queries ```hocon env { job.mode = "BATCH" parallelism = 5 } source { Clickhouse { host = "localhost:8123" username = "xxx" password = "xxx" table_path = "default.table" server_time_zone = "UTC" sql = "select * from default.table where id > 2 and type = 1" batch_size = 1024 clickhouse.config = { "socket_timeout": "300000" } } } # Console printing of the read Clickhouse data sink { Console { 
parallelism = 1 } } ``` **Case 3: Complex SQL with single parallelism execution** When using complex SQL queries (such as queries with join, group by, subqueries, etc.), the connector will automatically switch to single parallel execution mode, even if a higher parallelism value is configured. ```hocon env { job.mode = "BATCH" parallelism = 1 } source { Clickhouse { host = "localhost:8123" username = "xxx" password = "xxx" server_time_zone = "UTC" sql = "select t1.id, t2.category from default.table1 t1 global join default.table2 t2 on t1.id = t2.id where t1.age > 18" batch_size = 1024 clickhouse.config = { "socket_timeout": "300000" } } } # Console printing of the read Clickhouse data sink { Console { parallelism = 1 } } ``` ### Multiple table ```hocon env { job.mode = "BATCH" parallelism = 5 } source { Clickhouse { host = "localhost:8123" username = "xxx" password = "xxx" table_list = [ { table_path = "default.table1" sql = "select * from default.table1 where id > 2 and type = 1" }, { table_path = "default.table2" sql = "select * from default.table2 where age > 18" } ] server_time_zone = "UTC" clickhouse.config = { "socket_timeout": "300000" } } } # Console printing of the read Clickhouse data sink { Console { parallelism = 1 } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Cloudberry.md ================================================ import ChangeLog from '../changelog/connector-cloudberry.md'; # Cloudberry > JDBC Cloudberry Source Connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) > supports query SQL and can achieve projection effect. ## Description Read external data source data through JDBC. Cloudberry currently does not have its own native JDBC driver, using PostgreSQL's drivers and implementation. ## Supported DataSource Info | Datasource | Supported Versions | Driver | Url | Maven | |------------|------------------------------------------|------------------------|---------------------------------------|--------------------------------------------------------------------------| | Cloudberry | Uses PostgreSQL driver implementation | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/org.postgresql/postgresql) | ## Database Dependency > Please download the PostgreSQL driver jar and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
> For example: cp postgresql-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping Cloudberry uses PostgreSQL's data type implementation. Please refer to PostgreSQL documentation for data type compatibility and mappings. ## Options Cloudberry connector uses the same options as PostgreSQL. For detailed configuration options, please refer to the PostgreSQL documentation. Key options include: - url (required): The JDBC connection URL - driver (required): The driver class name (org.postgresql.Driver) - user/password: Authentication credentials - query or table_path: What data to read - partition options for parallel reading ## Parallel Reader Cloudberry supports parallel reading following the same rules as PostgreSQL connector. For detailed information on split strategies and parallel reading options, please refer to the PostgreSQL connector documentation. ## Task Example ### Simple ```hocon env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" query = "select * from mytable limit 100" } } sink { Console {} } ``` ### Parallel reading with table_path ```hocon env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" table_path = "public.mytable" split.size = 10000 } } sink { Console {} } ``` ### Multiple table read ```hocon env { job.mode = "BATCH" parallelism = 4 } source { Jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" "table_list" = [ { "table_path" = "public.table1" }, { "table_path" = "public.table2" } ] split.size = 10000 } } sink { Console {} } ``` For more detailed examples and configurations, please refer to the PostgreSQL connector documentation. 
## Changelog ================================================ FILE: docs/en/connectors/source/CosFile.md ================================================ import ChangeLog from '../changelog/connector-file-cos.md'; # CosFile > Cos file source connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## Description Read data from the Tencent Cloud COS file system. :::tip If you use Spark/Flink, you must ensure that your Spark/Flink cluster has already integrated Hadoop in order to use this connector. The tested Hadoop version is 2.x. If you use SeaTunnel Engine, the Hadoop jar is integrated automatically when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. To use this connector you need to put hadoop-cos-{hadoop.version}-{version}.jar and cos_api-bundle-{version}.jar in the ${SEATUNNEL_HOME}/lib directory, download: [Hadoop-Cos-release](https://github.com/tencentyun/hadoop-cos/releases). It only supports hadoop version 2.6.5+ and version 8.0.2+. 
::: ## Options | name | type | required | default value | |----------------------------|---------|----------|-----------------------------| | path | string | yes | - | | file_format_type | string | yes | - | | bucket | string | yes | - | | secret_id | string | yes | - | | secret_key | string | yes | - | | region | string | yes | - | | read_columns | list | yes | - | | delimiter/field_delimiter | string | no | \001 for text and , for csv | | row_delimiter | string | no | \n | | parse_partition_from_path | boolean | no | true | | skip_header_row_number | long | no | 0 | | date_format | string | no | yyyy-MM-dd | | datetime_format | string | no | yyyy-MM-dd HH:mm:ss | | time_format | string | no | HH:mm:ss | | schema | config | no | - | | sheet_name | string | no | - | | xml_row_tag | string | no | - | | xml_use_attr_format | boolean | no | - | | csv_use_header_line | boolean | no | false | | file_filter_pattern | string | no | - | | filename_extension | string | no | - | | compress_codec | string | no | none | | archive_compress_codec | string | no | none | | encoding | string | no | UTF-8 | | binary_chunk_size | int | no | 1024 | | binary_complete_file_mode | boolean | no | false | | common-options | | no | - | | file_filter_modified_start | string | no | - | | file_filter_modified_end | string | no | - | | quote_char | string | no | " | | escape_char | string | no | - | ### path [string] The source file path. ### file_format_type [string] File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. 
For example: upstream data is the following: ```json {"code": 200, "data": "get success", "success": true} ``` You can also save multiple pieces of data in one file and split them by newline: ```json lines {"code": 200, "data": "get success", "success": true} {"code": 300, "data": "get failed", "success": false} ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. If you assign file type to `text` `csv`, you can choose to specify the schema information or not. For example, upstream data is the following: ```text tyrantlucifer#26#male ``` If you do not assign data schema connector will treat the upstream data as the following: | content | |-----------------------| | tyrantlucifer#26#male | If you assign data schema, you should also assign the option `field_delimiter` too except CSV file type you should assign schema and delimiter as the following: ```hocon field_delimiter = "#" schema { fields { name = string age = int gender = string } } ``` connector will generate data as the following: | name | age | gender | |---------------|-----|--------| | tyrantlucifer | 26 | male | If you assign file type to `binary`, SeaTunnel can synchronize files in any format, such as compressed packages, pictures, etc. In short, any files can be synchronized to the target place. Under this requirement, you need to ensure that the source and sink use `binary` format for file synchronization at the same time. You can find the specific usage in the example below. If you assign file type to `markdown`, SeaTunnel can parse markdown files and extract structured data. 
The markdown parser extracts various elements including headings, paragraphs, lists, code blocks, tables, and more. Each element is converted to a row with the following schema: - `element_id`: Unique identifier for the element - `element_type`: Type of the element (Heading, Paragraph, ListItem, etc.) - `heading_level`: Level of heading (1-6, null for non-heading elements) - `text`: Text content of the element - `page_number`: Page number (default: 1) - `position_index`: Position index within the document - `parent_id`: ID of the parent element - `child_ids`: Comma-separated list of child element IDs Note: Markdown format only supports reading, not writing. ### bucket [string] The bucket address of the COS file system, for example: `cosn://tyrantlucifer-image-bed` ### secret_id [string] The secret id of the COS file system. ### secret_key [string] The secret key of the COS file system. ### region [string] The region of the COS file system. ### read_columns [list] The read column list of the data source; users can use it to implement field projection. ### delimiter/field_delimiter [string] The **delimiter** parameter will be deprecated after version 2.3.5, please use **field_delimiter** instead. Only needs to be configured when file_format_type is text. 
Field delimiter, used to tell connector how to slice and dice fields default `\001`, the same as hive's default delimiter ### row_delimiter [string] Only need to be configured when file_format is text Row delimiter, used to tell connector how to slice and dice rows default `\n` ### parse_partition_from_path [boolean] Control whether parse the partition keys and values from file path For example if you read a file from path `cosn://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` Every record data from file will be added these two fields: | name | age | |---------------|-----| | tyrantlucifer | 26 | Tips: **Do not define partition fields in schema option** ### skip_header_row_number [long] Skip the first few lines, but only for the txt and csv. For example, set like following: `skip_header_row_number = 2` then SeaTunnel will skip the first 2 lines from source files ### date_format [string] Date type format, used to tell connector how to convert string to date, supported as the following formats: `yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` default `yyyy-MM-dd` ### datetime_format [string] Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats: `yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` default `yyyy-MM-dd HH:mm:ss` ### time_format [string] Time type format, used to tell connector how to convert string to time, supported as the following formats: `HH:mm:ss` `HH:mm:ss.SSS` default `HH:mm:ss` ### schema [config] Only need to be configured when the file_format_type are text, json, excel, xml or csv ( Or other format we can't read the schema from metadata). #### fields [Config] The schema of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). ### sheet_name [string] Only need to be configured when file_format is excel. Reader the sheet of the workbook. 
### xml_row_tag [string] Only need to be configured when file_format is xml. Specifies the tag name of the data rows within the XML file. ### xml_use_attr_format [boolean] Only need to be configured when file_format is xml. Specifies Whether to process data using the tag attribute format. ### csv_use_header_line [boolean] Whether to use the header line to parse the file, only used when the file_format is `csv` and the file contains the header line that match RFC 4180 ### file_filter_pattern [string] Filter pattern, which used for filtering files. If you only want to filter based on file names, simply write the regular file names; If you want to filter based on the file directory at the same time, the expression needs to start with `path`. The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. There are some examples. If the `path` is `/data/seatunnel`, and the file structure example is: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv /data/seatunnel/20241012/logo.png ``` Matching Rules Example: **Example 1**: *Match all .txt files*,Regular Expression: ``` .*.txt ``` The result of this example matching is: ``` /data/seatunnel/20241001/report.txt ``` **Example 2**: *Match all file starting with abc*,Regular Expression: ``` abc.* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **Example 3**: *Match all files starting with abc in folder 20241007,And the fourth character is either h or g*, the Regular Expression: ``` /data/seatunnel/20241007/abc[h,g].* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv ``` **Example 4**: *Match third level folders starting with 202410 and files ending with .csv*, the Regular Expression: ``` /data/seatunnel/202410\d*/.*.csv ``` The result 
of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### filename_extension [string] Filter filename extension, which used for filtering files with specific extension. Example: `csv` `.txt` `json` `.xml`. ### compress_codec [string] The compress codec of files and the details that supported as the following shown: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: automatically recognizes the compression type, no additional settings required. ### archive_compress_codec [string] The compress codec of archive files and the details that supported as the following shown: | archive_compress_codec | file_format | archive_compress_suffix | |------------------------|--------------------|-------------------------| | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | | GZ | txt,json,excel,xml | .gz | | NONE | all | .* | Note: gz compressed excel file needs to compress the original file or specify the file suffix, such as e2e.xls ->e2e_test.xls.gz ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. ### binary_chunk_size [int] Only used when file_format_type is binary. The chunk size (in bytes) for reading binary files. Default is 1024 bytes. Larger values may improve performance for large files but use more memory. ### binary_complete_file_mode [boolean] Only used when file_format_type is binary. Whether to read the complete file as a single chunk instead of splitting into chunks. When enabled, the entire file content will be read into memory at once. Default is false. ### file_filter_modified_start [string] File modification time filter. The connector will filter some files base on the last modification start time (include start time). 
The default data format is `yyyy-MM-dd HH:mm:ss`. ### file_filter_modified_end [string] File modification time filter. The connector will filter some files base on the last modification end time (not include end time). The default data format is `yyyy-MM-dd HH:mm:ss`. ### quote_char [string] A single character that encloses CSV fields, allowing fields with commas, line breaks, or quotes to be read correctly. ### escape_char [string] A single character that allows the quote or other special characters to appear inside a CSV field without ending the field. ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. ## Example ```hocon CosFile { path = "/seatunnel/orc" bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" file_format_type = "orc" } ``` ```hocon CosFile { path = "/seatunnel/json" bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" file_format_type = "json" schema { fields { id = int name = string } } } ``` ### Transfer Binary File ```hocon env { parallelism = 1 job.mode = "BATCH" } source { CosFile { bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" path = "/seatunnel/read/binary/" file_format_type = "binary" binary_chunk_size = 2048 binary_complete_file_mode = false } } sink { // you can transfer local file to s3/hdfs/oss etc. 
CosFile { bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" path = "/seatunnel/read/binary2/" file_format_type = "binary" } } ``` ### Filter File ```hocon env { parallelism = 1 job.mode = "BATCH" } source { CosFile { bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" path = "/seatunnel/read/binary/" file_format_type = "binary" // file example abcD2024.csv file_filter_pattern = "abc[DX]*.*" } } sink { Console { } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/DB2.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # DB2 > JDBC DB2 Source Connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Read external data source data through JDBC. ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) > supports query SQL and can achieve projection effect. ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------|----------------------------------------------------------|--------------------------------|-----------------------------------|-----------------------------------------------------------------------| | DB2 | Different dependency version has different driver class. | com.ibm.db2.jdbc.app.DB2Driver | jdbc:db2://127.0.0.1:50000/dbname | [Download](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) | ## Database Dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
> For example DB2 datasource: cp db2jcc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping | DB2 Data Type | SeaTunnel Data Type | |------------------------------------------------------------------------------------------------------|---------------------| | BOOLEAN | BOOLEAN | | SMALLINT | SHORT | | INT
INTEGER
| INTEGER | | BIGINT | LONG | | DECIMAL
DEC
NUMERIC
NUM | DECIMAL(38,18) | | REAL | FLOAT | | FLOAT
DOUBLE
DOUBLE PRECISION
DECFLOAT | DOUBLE | | CHAR
VARCHAR
LONG VARCHAR
CLOB
GRAPHIC
VARGRAPHIC
LONG VARGRAPHIC
DBCLOB | STRING | | BLOB | BYTES | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | ROWID
XML | Not supported yet | ## Source Options | Name | Type | Required | Default | Description | |------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:db2://127.0.0.1:50000/dbname | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use db2 the value is `com.ibm.db2.jdbc.app.DB2Driver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | Yes | - | Query statement | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | | partition_column | String | No | - | The column name used to partition the data for parallel reading. Only a numeric-type primary key column is supported, and only one column can be configured. | | partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan; if not set, SeaTunnel will query the database to get the min value. | | partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan; if not set, SeaTunnel will query the database to get the max value. | | partition_num | Int | No | job parallelism | The number of partitions; only positive integers are supported. The default value is the job parallelism. | | fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure
the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.
Zero means use the JDBC default value. | | properties | Map | No | - | Additional connection configuration parameters. When properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ### Tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. ## Task Example ### Simple > This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. ``` # Defining the runtime environment env { parallelism = 2 job.mode = "BATCH" } source{ Jdbc { url = "jdbc:db2://127.0.0.1:50000/dbname" driver = "com.ibm.db2.jdbc.app.DB2Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" query = "select * from table_xxx" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2/sql } sink { Console {} } ``` ### Parallel > Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table ``` source { Jdbc { url = "jdbc:db2://127.0.0.1:50000/dbname" driver = "com.ibm.db2.jdbc.app.DB2Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" # Define query logic as required query = "select * from type_bin" # Parallel sharding reads fields partition_column = "id" # Number of fragments partition_num = 10 } } ``` ### Parallel Boundary > It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured ``` source { Jdbc { url = "jdbc:db2://127.0.0.1:50000/dbname" driver = 
"com.ibm.db2.jdbc.app.DB2Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" # Define query logic as required query = "select * from type_bin" partition_column = "id" # Read start boundary partition_lower_bound = 1 # Read end boundary partition_upper_bound = 500 partition_num = 10 } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Databend.md ================================================ import ChangeLog from '../changelog/connector-databend.md'; # Databend > Databend source connector ## Supported Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [Batch Processing](../../introduction/concepts/connector-v2-features.md) - [ ] [Stream Processing](../../introduction/concepts/connector-v2-features.md) - [x] [Parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [Support User-defined Sharding](../../introduction/concepts/connector-v2-features.md) - [ ] [Support Multi-table Reading](../../introduction/concepts/connector-v2-features.md) ## Description A source connector for reading data from Databend. ## Dependencies ### For Spark/Flink > 1. You need to download the [Databend JDBC driver jar package](https://github.com/databendlabs/databend-jdbc/) and add it to the directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta > 1. You need to download the [Databend JDBC driver jar package](https://github.com/databendlabs/databend-jdbc/) and add it to the directory `${SEATUNNEL_HOME}/lib/`. ## Supported Data Source Information | Data Source | Supported Version | Driver | URL | Maven | |-------------|-------------------|--------|-----|-------| | Databend | 1.2.x and above | - | - | - | ## Data Type Mapping | Databend Data Type | SeaTunnel Data Type | |-------------------|-------------------| | BOOLEAN | BOOLEAN | | TINYINT | TINYINT | | SMALLINT | SMALLINT | | INT | INT | | BIGINT | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | STRING | STRING | | VARCHAR | STRING | | CHAR | STRING | | TIMESTAMP | TIMESTAMP | | DATE | DATE | | TIME | TIME | | BINARY | BYTES | ## Source Options Basic Configuration: | Name | Type | Required | Default Value | Description | |------|------|----------|---------------|-------------| | url | String | Yes | - | Databend JDBC connection URL | | username | String | Yes | - | Databend database username | | password | String | Yes | - | Databend database password | | database | String | No | - | Databend database name, defaults to the database name specified in the connection URL | | table | String | No | - | Databend table name | 
| query | String | No | - | Databend query statement, if set will override database and table settings | | fetch_size | Integer | No | 0 | Number of records to fetch from database at once, set to 0 to use JDBC driver default value | | jdbc_config | Map | No | - | Additional JDBC connection configuration, such as load balancing strategies | Table List Configuration: | Name | Type | Required | Default Value | Description | |------|------|----------|---------------|-------------| | database | String | Yes | - | Database name | | table | String | Yes | - | Table name | | query | String | No | - | Custom query statement | | fetch_size | Integer | No | 0 | Number of records to fetch from database at once | Note: When this configuration corresponds to a single table, you can flatten the configuration items from table_list to the outer level. ## Task Examples ### Single Table Reading ```hocon env { parallelism = 2 job.mode = "BATCH" } source { Databend { url = "jdbc:databend://localhost:8000" username = "root" password = "" database = "default" table = "users" } } sink { Console {} } ``` ### Using Custom Query ```hocon source { Databend { url = "jdbc:databend://localhost:8000" username = "root" password = "" query = "SELECT id, name, age FROM default.users WHERE age > 18" } } ``` ## Related Links - [Databend Official Website](https://databend.rs/) - [Databend JDBC Driver](https://github.com/databendlabs/databend-jdbc/) ## Changelog ================================================ FILE: docs/en/connectors/source/Doris.md ================================================ import ChangeLog from '../changelog/connector-doris.md'; # Doris > Doris source connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table read](../../introduction/concepts/connector-v2-features.md) ## Description Used to read data from Apache Doris. ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------|--------------------------------------|--------|-----|-------| | Doris | Only Doris2.0 or later is supported. | - | - | - | ## Data Type Mapping | Doris Data type | SeaTunnel Data type | |--------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| | INT | INT | | TINYINT | TINYINT | | SMALLINT | SMALLINT | | BIGINT | BIGINT | | LARGEINT | STRING | | BOOLEAN | BOOLEAN | | DECIMAL | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to right of the decimal point.))) | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | CHAR
VARCHAR
STRING
TEXT | STRING | | DATE | DATE | | DATETIME
DATETIME(p) | TIMESTAMP | | ARRAY | ARRAY | ## Source Options Base configuration: | Name | Type | Required | Default | Description | |----------------------------------|--------|----------|------------|-----------------------------------------------------------------------------------------------------| | fenodes | string | yes | - | FE address, the format is `"fe_host:fe_http_port"` | | username | string | yes | - | User username | | password | string | yes | - | User password | | doris.request.retries | int | no | 3 | Number of retries to send requests to Doris FE. | | doris.request.read.timeout.ms | int | no | 30000 | | | doris.request.connect.timeout.ms | int | no | 30000 | | | query-port | string | no | 9030 | Doris QueryPort | | doris.request.query.timeout.s | int | no | 3600 | Timeout period of Doris scan data, expressed in seconds. | | table_list | string | no | - | table list | Table list configuration: | Name | Type | Required | Default | Description | |----------------------------------|--------|----------|------------|-----------------------------------------------------------------------------------------------------| | database | string | yes | - | The name of Doris database | | table | string | yes | - | The name of Doris table | | doris.read.field | string | no | - | Use the 'doris.read.field' parameter to select the doris table columns to read | | doris.filter.query | string | no | - | Data filtering in doris. the format is "field = value",example : doris.filter.query = "F_ID > 2" | | doris.batch.size | int | no | 1024 | The maximum value that can be obtained by reading Doris BE once. | | doris.exec.mem.limit | long | no | 2147483648 | Maximum memory that can be used by a single be scan request. The default memory is 2G (2147483648). | Note: When this configuration corresponds to a single table, you can flatten the configuration items in table_list to the outer layer. 
### Tips > It is not recommended to modify advanced parameters at will ## Example ### single table > This is an example of reading a Doris table and writing to Console. ``` env { parallelism = 2 job.mode = "BATCH" } source{ Doris { fenodes = "doris_e2e:8030" username = root password = "" database = "e2e_source" table = "doris_e2e_table" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform/sql } sink { Console {} } ``` Use the 'doris.read.field' parameter to select the doris table columns to read ``` env { parallelism = 2 job.mode = "BATCH" } source{ Doris { fenodes = "doris_e2e:8030" username = root password = "" database = "e2e_source" table = "doris_e2e_table" doris.read.field = "F_ID,F_INT,F_BIGINT,F_TINYINT,F_SMALLINT" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform/sql } sink { Console {} } ``` Use 'doris.filter.query' to filter the data, and the parameter values are passed directly to doris ``` env { parallelism = 2 job.mode = "BATCH" } source{ Doris { fenodes = "doris_e2e:8030" username = root password = "" database = "e2e_source" table = "doris_e2e_table" doris.filter.query = "F_ID > 2" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform/sql } sink { Console {} } ``` ### Multiple table ``` env{ parallelism = 1 job.mode = "BATCH" } source{ Doris { fenodes = "xxxx:8030" username = root password = "" table_list = [ { database = "st_source_0" table = "doris_table_0" doris.read.field = "F_ID,F_INT,F_BIGINT,F_TINYINT" doris.filter.query = "F_ID >= 50" }, { database = "st_source_1" table = "doris_table_1" } ] } } transform {} sink{ Doris { fenodes = 
"xxxx:8030" schema_save_mode = "RECREATE_SCHEMA" username = root password = "" database = "st_sink" table = "${table_name}" sink.enable-2pc = "true" sink.label-prefix = "test_json" doris.config = { format="json" read_json_by_line="true" } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/DuckDB.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # DuckDB > JDBC DuckDB Source Connector ## Description Read external data source data through JDBC. ## Support DuckDB Version - 0.8.x/0.9.x/0.10.x/1.x ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.duckdb/duckdb_jdbc) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.duckdb/duckdb_jdbc) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) > supports query SQL and can achieve projection effect. ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------|----------------------------------------------------------|-------------------------|----------------------------------|-----------------------------------------------------------------------| | DuckDB | Different dependency version has different driver class. | org.duckdb.DuckDBDriver | jdbc:duckdb:/path/to/database.db | [Download](https://mvnrepository.com/artifact/org.duckdb/duckdb_jdbc) | ## Data Type Mapping | DuckDB Data Type | SeaTunnel Data Type | |---------------------------------------------------------------------|---------------------| | BOOLEAN | BOOLEAN | | TINYINT | TINYINT | | UTINYINT
SMALLINT | SMALLINT | | USMALLINT
INTEGER | INT | | UINTEGER
BIGINT | BIGINT | | UBIGINT | DECIMAL(20,0) | | HUGEINT | DECIMAL(38,0) | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | | DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | | VARCHAR
CHAR
TEXT
JSON
UUID
INTERVAL | STRING | | DATE | DATE | | TIME | TIME | | TIMESTAMP
TIMESTAMP WITH TIME ZONE | TIMESTAMP | | BLOB
ARRAY
STRUCT
MAP | BYTES | ## Source Options | Name | Type | Required | Default | Description | |------------------------------|------------|----------|-----------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:duckdb:/path/to/database.db | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use DuckDB the value is `org.duckdb.DuckDBDriver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | Yes | - | Query statement | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | | partition_column | String | No | - | The column name for parallelism's partition, only support numeric type primary key, and only can config one column. | | partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | | partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | | partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | | fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure
the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.
Zero means use jdbc default value. | | properties | Map | No | - | Additional connection configuration parameters, when properties and URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in DuckDB, properties take precedence over the URL. | | table_path | String | No | - | The full path of the table; you can use this configuration instead of `query`.
examples:
duckdb: "main.table1"
| | table_list | Array | No | - | The list of tables to be read, you can use this configuration instead of `table_path` example: ```[{ table_path = "main.table1"}, {table_path = "main.table2", query = "select id, name from main.table2"}]``` | | where_condition | String | No | - | Common row filter conditions for all tables/queries, must start with `where`. for example `where id > 100` | | split.size | Int | No | 8096 | The split size (number of rows) of table, captured tables are split into multiple splits when read of table. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details | ## Parallel Reader The JDBC Source connector supports parallel reading of data from tables. SeaTunnel will use certain rules to split the data in the table, which will be handed over to readers for reading. The number of readers is determined by the `parallelism` option. **Split Key Rules:** 1. If `partition_column` is not null, it will be used to calculate split. The column must be in **Supported split data type**. 2. If `partition_column` is null, seatunnel will read the schema from table and get the Primary Key and Unique Index. If there is more than one column in Primary Key and Unique Index, the first column which is in the **supported split data type** will be used to split data. For example, the table has Primary Key(nn guid, name varchar); because `guid` is not in **supported split data type**, the column `name` will be used to split data. **Supported split data type:** * String * Number(int, bigint, decimal, ...) * Date ### Options Related To Split #### split.size How many rows in one split, captured tables are split into multiple splits when read of table. #### partition_column [string] The column name for split data. #### partition_upper_bound [BigDecimal] The partition_column max value for scan, if not set SeaTunnel will query database get max value. 
#### partition_lower_bound [BigDecimal] The partition_column min value for scan, if not set SeaTunnel will query database get min value. #### partition_num [int] > Not recommended for use, The correct approach is to control the number of split through `split.size` How many splits do we need to split into, only support positive integer. default value is job parallelism. ## tips > If the table can not be split(for example, table have no Primary Key or Unique Index, and `partition_column` is not set), it will run in single concurrency. > > Use `table_path` to replace `query` for single table reading. If you need to read multiple tables, use `table_list`. ## Task Example ### Simple > This example queries 'user_events' table in your test database in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. ``` # Defining the runtime environment env { parallelism = 4 job.mode = "BATCH" } source{ Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" connection_check_timeout_sec = 100 username = "duckdb" password = "" query = "select * from user_events limit 16" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2/sql } sink { Console {} } ``` ### parallel by partition_column ``` env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" connection_check_timeout_sec = 100 username = "duckdb" password = "" query = "select * from user_events" partition_column = "id" split.size = 10000 # Read start boundary #partition_lower_bound = ... # Read end boundary #partition_upper_bound = ... 
} } sink { Console {} } ``` ### parallel by Primary Key or Unique Index > Configuring `table_path` will turn on auto split, you can configure `split.*` to adjust the split strategy ``` env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" connection_check_timeout_sec = 100 username = "" password = "" table_path = "main.user_events" query = "select * from main.user_events" split.size = 10000 } } sink { Console {} } ``` ### Parallel Boundary > It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured ``` source { Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" connection_check_timeout_sec = 100 username = "duckdb" password = "" # Define query logic as required query = "select * from user_events" partition_column = "id" # Read start boundary partition_lower_bound = 1 # Read end boundary partition_upper_bound = 500 partition_num = 10 properties { threads=4 memory_limit="4GB" } } } ``` ### Multiple table read ***Configuring `table_list` will turn on auto split, you can configure `split.*` to adjust the split strategy*** ```hocon env { job.mode = "BATCH" parallelism = 4 } source { Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" connection_check_timeout_sec = 100 username = "duckdb" password = "" table_list = [ { table_path = "main.table1" }, { table_path = "main.table2" # Use query filetr rows & columns query = "select id, name from main.table2 where id > 100" } ] #where_condition= "where id > 100" #split.size = 8096 } } sink { Console {} } ``` ## Change Log ================================================ FILE: docs/en/connectors/source/Easysearch.md ================================================ import ChangeLog from '../changelog/connector-easysearch.md'; # Easysearch > Easysearch source connector ## Support 
Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Used to read data from INFINI Easysearch. ## Using Dependency > Dependency [easysearch-client](https://central.sonatype.com/artifact/com.infinilabs/easysearch-client) ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) :::tip Engine Supported * Supports all versions released by [INFINI Easysearch](https://www.infini.com/download/?product=easysearch). ::: ## Data Type Mapping | Easysearch Data Type | SeaTunnel Data Type | |-----------------------------|----------------------| | STRING
KEYWORD
TEXT | STRING | | BOOLEAN | BOOLEAN | | BYTE | BYTE | | SHORT | SHORT | | INTEGER | INT | | LONG | LONG | | FLOAT
HALF_FLOAT | FLOAT | | DOUBLE | DOUBLE | | Date | LOCAL_DATE_TIME_TYPE | ### hosts [array] Easysearch cluster http address, the format is `host:port`, allowing multiple hosts to be specified. Such as `["host1:9200", "host2:9200"]`. ### username [string] security username. ### password [string] security password. ### index [string] Easysearch index name, support * fuzzy matching. ### source [array] The fields of index. You can get the document id by specifying the field `_id`.If sink _id to other index,you need specify an alias for _id due to the Easysearch limit. If you don't config source, you must config `schema`. ### query [json] Easysearch DSL. You can control the range of data read. ### scroll_time [String] Amount of time Easysearch will keep the search context alive for scroll requests. ### scroll_size [int] Maximum number of hits to be returned with each Easysearch scroll request. ### schema The structure of the data, including field names and field types. If you don't config schema, you must config `source`. ### tls_verify_certificate [boolean] Enable certificates validation for HTTPS endpoints ### tls_verify_hostname [boolean] Enable hostname validation for HTTPS endpoints ### tls_keystore_path [string] The path to the PEM or JKS key store. This file must be readable by the operating system user running SeaTunnel. ### tls_keystore_password [string] The key password for the key store specified ### tls_truststore_path [string] The path to PEM or JKS trust store. This file must be readable by the operating system user running SeaTunnel. 
### tls_truststore_password [string] The key password for the trust store specified ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Examples simple ```hocon Easysearch { hosts = ["localhost:9200"] index = "seatunnel-*" source = ["_id","name","age"] query = {"range":{"firstPacket":{"gte":1700407367588,"lte":1700407367588}}} } ``` complex ```hocon Easysearch { hosts = ["Easysearch:9200"] index = "st_index" schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(2, 1)" c_bytes = bytes c_date = date c_timestamp = timestamp } } query = {"range":{"firstPacket":{"gte":1700407367588,"lte":1700407367588}}} } ``` SSL (Disable certificates validation) ```hocon source { Easysearch { hosts = ["https://localhost:9200"] username = "admin" password = "admin" tls_verify_certificate = false } } ``` SSL (Disable hostname validation) ```hocon source { Easysearch { hosts = ["https://localhost:9200"] username = "admin" password = "admin" tls_verify_hostname = false } } ``` SSL (Enable certificates validation) ```hocon source { Easysearch { hosts = ["https://localhost:9200"] username = "admin" password = "admin" tls_keystore_path = "${your Easysearch home}/config/certs/http.p12" tls_keystore_password = "${your password}" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Elasticsearch.md ================================================ import ChangeLog from '../changelog/connector-elasticsearch.md'; # Elasticsearch > Elasticsearch source connector ## Description Used to read data from Elasticsearch. support version >= 2.x and <= 8.x. 
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-------------------------|---------|----------|----------------------------------------------------------------| | hosts | array | yes | - | | auth_type | string | no | basic | | username | string | no | - | | password | string | no | - | | auth.api_key_id | string | no | - | | auth.api_key | string | no | - | | auth.api_key_encoded | string | no | - | | index | string | no | If the index list does not exist, the index must be configured | | index_list | array | no | used to define a multiple table task | | source | array | no | - | | query | json | no | {"match_all": {}} | | search_type | enum | no | Query type, SQL or DSL, default DSL | | search_api_type | enum | no | Pagination API type, SCROLL or PIT, default SCROLL | | sql_query | json | no | SQL query, required when search_type is SQL | | scroll_time | string | no | 1m | | scroll_size | int | no | 100 | | tls_verify_certificate | boolean | no | true | | tls_verify_hostname | boolean | no | true | | array_column | map | no | | | tls_keystore_path | string | no | - | | tls_keystore_password | string | no | - | | tls_truststore_path | string | no | - | | tls_truststore_password | string | no | - | | pit_keep_alive | long | no | 60000 (1 minute) | | pit_batch_size | int | no | 100 | | runtime_fields | array | no | - | | common-options | | no | - | ### hosts [array] Elasticsearch cluster http address, the format is `host:port`, allowing multiple hosts to be specified. 
Such as `["host1:9200", "host2:9200"]`. ## Authentication The Elasticsearch connector supports multiple authentication methods to connect to secured Elasticsearch clusters. You can choose the appropriate authentication method based on your Elasticsearch security configuration. ### auth_type [enum] Specifies the authentication method to use. Supported values: - `basic` (default): HTTP Basic Authentication using username and password - `api_key`: Elasticsearch API Key authentication using separate ID and key - `api_key_encoded`: Elasticsearch API Key authentication using encoded key If not specified, defaults to `basic` for backward compatibility. ### Basic Authentication Basic authentication uses HTTP Basic Authentication with username and password credentials. #### username [string] Username for basic authentication (x-pack username). #### password [string] Password for basic authentication (x-pack password). **Example:** ```hocon source { Elasticsearch { hosts = ["https://localhost:9200"] auth_type = "basic" username = "elastic" password = "your_password" index = "my_index" } } ``` ### API Key Authentication API Key authentication provides a more secure way to authenticate with Elasticsearch using API keys. #### auth.api_key_id [string] The API key ID generated by Elasticsearch. #### auth.api_key [string] The API key secret generated by Elasticsearch. #### auth.api_key_encoded [string] Base64 encoded API key in the format `base64(id:api_key)`. This is an alternative to specifying `auth.api_key_id` and `auth.api_key` separately. **Note:** You can use either `auth.api_key_id` + `auth.api_key` OR `auth.api_key_encoded`, but not both. 
**Example with separate ID and key:** ```hocon source { Elasticsearch { hosts = ["https://localhost:9200"] auth_type = "api_key" auth.api_key_id = "your_api_key_id" auth.api_key = "your_api_key_secret" index = "my_index" } } ``` **Example with encoded key:** ```hocon source { Elasticsearch { hosts = ["https://localhost:9200"] auth_type = "api_key_encoded" auth.api_key_encoded = "eW91cl9hcGlfa2V5X2lkOnlvdXJfYXBpX2tleV9zZWNyZXQ=" index = "my_index" } } ``` ### index [string] Elasticsearch index name, support * fuzzy matching. ### source [array] The fields of index. You can get the document id by specifying the field `_id`.If sink _id to other index,you need specify an alias for _id due to the Elasticsearch limit. If you don't config source, it is automatically retrieved from the mapping of the index. ### array_column [map] The fields of array type. Since there is no array index in es,so need assign array type,just like `{c_array = "array"}`. ### query [json] Elasticsearch DSL. You can control the range of data read. ### scroll_time [String] Amount of time Elasticsearch will keep the search context alive for scroll requests. ### scroll_size [int] Maximum number of hits to be returned with each Elasticsearch scroll request. ### index_list [array] The `index_list` is used to define multi-index synchronization tasks. It is an array that contains the parameters required for single-table synchronization, such as `query`, `source/schema`, `scroll_size`, and `scroll_time`. It is recommended that `index_list` and `query` should not be configured at the same level simultaneously. Please refer to the upcoming multi-table synchronization example for more details. ### tls_verify_certificate [boolean] Enable certificates validation for HTTPS endpoints ### tls_verify_hostname [boolean] Enable hostname validation for HTTPS endpoints ### tls_keystore_path [string] The path to the PEM or JKS key store. This file must be readable by the operating system user running SeaTunnel. 
### tls_keystore_password [string] The key password for the key store specified ### tls_truststore_path [string] The path to PEM or JKS trust store. This file must be readable by the operating system user running SeaTunnel. ### tls_truststore_password [string] The key password for the trust store specified ### search_type Query type, available values: - DSL: Use Domain Specific Language query (default) - SQL: Use SQL query ### search_api_type Pagination API type, available values: - SCROLL: Use Scroll API for pagination (default) - PIT: Use Point in Time (PIT) API for pagination ### pit_keep_alive [long] The amount of time (in milliseconds) for which the PIT should be keep alive ### pit_batch_size [int] Maximum number of hits to be returned with each PIT search request ### runtime_fields [array] Runtime fields to be computed at query time (Elasticsearch 7.11+). Each runtime field should contain: - **name**: The name of the runtime field - **type**: The data type (boolean, date, double, geo_point, ip, keyword, long) - **script**: Painless script to compute the field value - **script_lang** (optional): Script language (default: painless) - **script_params** (optional): Script parameters Example: ```hocon runtime_fields = [ { name = "day_of_week" type = "keyword" script = "emit(doc['timestamp'].value.dayOfWeekEnum.toString())" }, { name = "total_price" type = "double" script = "emit(doc['quantity'].value * doc['price'].value)" } ] ``` **Runtime Fields Use Cases:** 1. **Date Extraction**: Extract day of week, month, year from timestamps 2. **Calculations**: Compute derived values like total price, tax amount 3. **String Operations**: Concatenate fields, extract substrings 4. **Conditional Logic**: Categorize data based on conditions 5. 
**Data Transformation**: Convert units, format values on-the-fly **Performance Considerations:** - Runtime fields are computed at query time, which may impact performance for large datasets - Best suited for ad-hoc analysis, prototyping, and infrequent queries - Keep scripts simple to minimize performance impact - Consider indexing frequently used computed fields **Limitations:** - Requires Elasticsearch 7.11 or higher - Only Painless scripts are supported - May be slower than indexed fields for large-scale queries ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Examples Demo 1 > This case will read data from indices matching the seatunnel-* pattern based on a query. The query will only return documents containing the id, name, age, tags, and phones fields. In this example, the source field configuration is used to specify which fields should be read, and the array_column is used to indicate that tags and phones should be treated as arrays. ```hocon Elasticsearch { hosts = ["localhost:9200"] index = "seatunnel-*" array_column = {tags = "array",phones = "array"} source = ["_id","name","age","tags","phones"] query = {"range":{"firstPacket":{"gte":1669225429990,"lte":1669225429990}}} } ``` Demo 2 : Multi-table synchronization > This example demonstrates how to read different data from ``read_index1`` and ``read_index2`` and write separately to ``read_index1_copy``,``read_index2_copy``. > in `read_index1`,I used source to specify the fields to be read and specify which fields are array fields using the 'array_column'. 
```hocon source { Elasticsearch { hosts = ["https://elasticsearch:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false tls_verify_hostname = false index_list = [ { index = "read_index1" query = {"range": {"c_int": {"gte": 10, "lte": 20}}} source = [ c_map, c_array, c_string, c_boolean, c_tinyint, c_smallint, c_bigint, c_float, c_double, c_decimal, c_bytes, c_int, c_date, c_timestamp] array_column = { c_array = "array" } } { index = "read_index2" query = {"match_all": {}} source = [ c_int2, c_date2, c_null ] } ] } } transform { } sink { Elasticsearch { hosts = ["https://elasticsearch:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false tls_verify_hostname = false index = "${table_name}_copy" index_type = "st" "schema_save_mode"="CREATE_SCHEMA_WHEN_NOT_EXIST" "data_save_mode"="APPEND_DATA" } } ``` Demo 3 : SSL (Disable certificates validation) ```hocon source { Elasticsearch { hosts = ["https://localhost:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false } } ``` Demo 4 :SSL (Disable hostname validation) ```hocon source { Elasticsearch { hosts = ["https://localhost:9200"] username = "elastic" password = "elasticsearch" tls_verify_hostname = false } } ``` Demo 5 :SSL (Enable certificates validation) ```hocon source { Elasticsearch { hosts = ["https://localhost:9200"] username = "elastic" password = "elasticsearch" tls_keystore_path = "${your elasticsearch home}/config/certs/http.p12" tls_keystore_password = "${your password}" } } ``` Demo 6 : sql query notes: sql does not support map and array types ```hocon source { Elasticsearch { hosts = ["https://elasticsearch:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false tls_verify_hostname = false index = "st_index_sql" sql_query = "select * from st_index_sql where c_int>=10 and c_int<=20" search_type = "sql" } } ``` Demo7: PIT ```hocon source { Elasticsearch { hosts = 
["https://elasticsearch:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false tls_verify_hostname = false index = "st_index" query = {"range": {"c_int": {"gte": 10, "lte": 20}}} # Use DSL query with PIT API search_type = DSL search_api_type = PIT pit_keep_alive = 60000 # 1 minute in milliseconds pit_batch_size = 100 } } ``` Demo 8: Runtime Fields (Elasticsearch 7.11+) > This example demonstrates how to use runtime fields to compute values at query time without reindexing data. ```hocon source { Elasticsearch { hosts = ["https://elasticsearch:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false tls_verify_hostname = false index = "sales_data" # Define runtime fields for dynamic computation runtime_fields = [ { # Calculate total amount name = "total_amount" type = "double" script = "emit(doc['quantity'].value * doc['price'].value)" }, { # Extract day of week from timestamp name = "day_of_week" type = "keyword" script = "emit(doc['order_date'].value.dayOfWeekEnum.getDisplayName(TextStyle.FULL, Locale.ROOT))" }, { # Categorize orders name = "order_category" type = "keyword" script = """ double amount = doc['quantity'].value * doc['price'].value; if (amount > 1000) { emit('high_value'); } else if (amount > 100) { emit('medium_value'); } else { emit('low_value'); } """ }, { # Calculate with parameters name = "price_with_tax" type = "double" script = "emit(doc['price'].value * (1 + params.tax_rate))" script_params = { tax_rate = 0.13 } } ] # Include runtime fields in the output source = [ "product_id", "quantity", "price", "order_date", "total_amount", "day_of_week", "order_category", "price_with_tax" ] schema = { fields { product_id = string quantity = int price = double order_date = timestamp total_amount = double day_of_week = string order_category = string price_with_tax = double } } } } sink { Console { } } ``` ## Changelog ================================================ FILE: 
docs/en/connectors/source/FakeSource.md ================================================ import ChangeLog from '../changelog/connector-fake.md'; # FakeSource > FakeSource connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description The FakeSource is a virtual data source, which randomly generates the number of rows according to the data structure of the user-defined schema, just for some test cases such as type conversion or connector new feature testing ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Source Options | Name | Type | Required | Default | Description | |-------------------------|----------|----------|-------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | tables_configs | list | no | - | Define Multiple FakeSource, each item can contains the whole fake source config description below | | schema | config | yes | - | Define Schema information. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). | | auto.increment.enabled | boolean | no | false | Enable auto increment ID generation | | auto.increment.start | int | no | | Starting value for auto increment ID | | rows | config | no | - | The row list of fake data output per degree of parallelism see title `Options rows Case`. 
| | row.num | int | no | 5 | The total number of data generated per degree of parallelism | | split.num | int | no | 1 | the number of splits generated by the enumerator for each degree of parallelism | | split.read-interval | long | no | 1 | The interval(mills) between two split reads in a reader | | map.size | int | no | 5 | The size of `map` type that connector generated | | array.size | int | no | 5 | The size of `array` type that connector generated | | bytes.length | int | no | 5 | The length of `bytes` type that connector generated | | string.length | int | no | 5 | The length of `string` type that connector generated | | string.fake.mode | string | no | range | The fake mode of generating string data, support `range` and `template`, default `range`,if use configured it to `template`, user should also configured `string.template` option | | string.template | list | no | - | The template list of string type that connector generated, if user configured it, connector will randomly select an item from the template list | | tinyint.fake.mode | string | no | range | The fake mode of generating tinyint data, support `range` and `template`, default `range`,if use configured it to `template`, user should also configured `tinyint.template` option | | tinyint.min | tinyint | no | 0 | The min value of tinyint data that connector generated | | tinyint.max | tinyint | no | 127 | The max value of tinyint data that connector generated | | tinyint.template | list | no | - | The template list of tinyint type that connector generated, if user configured it, connector will randomly select an item from the template list | | smallint.fake.mode | string | no | range | The fake mode of generating smallint data, support `range` and `template`, default `range`,if use configured it to `template`, user should also configured `smallint.template` option | | smallint.min | smallint | no | 0 | The min value of smallint data that connector generated | | smallint.max | smallint | no | 32767 
| The max value of smallint data that connector generated | | smallint.template | list | no | - | The template list of smallint type that connector generated, if user configured it, connector will randomly select an item from the template list | | int.fake.mode | string | no | range | The fake mode of generating int data, support `range` and `template`, default `range`,if use configured it to `template`, user should also configured `int.template` option | | int.min | int | no | 0 | The min value of int data that connector generated | | int.max | int | no | 0x7fffffff | The max value of int data that connector generated | | int.template | list | no | - | The template list of int type that connector generated, if user configured it, connector will randomly select an item from the template list | | bigint.fake.mode | string | no | range | The fake mode of generating bigint data, support `range` and `template`, default `range`,if use configured it to `template`, user should also configured `bigint.template` option | | bigint.min | bigint | no | 0 | The min value of bigint data that connector generated | | bigint.max | bigint | no | 0x7fffffffffffffff | The max value of bigint data that connector generated | | bigint.template | list | no | - | The template list of bigint type that connector generated, if user configured it, connector will randomly select an item from the template list | | float.fake.mode | string | no | range | The fake mode of generating float data, support `range` and `template`, default `range`,if use configured it to `template`, user should also configured `float.template` option | | float.min | float | no | 0 | The min value of float data that connector generated | | float.max | float | no | 0x1.fffffeP+127 | The max value of float data that connector generated | | float.template | list | no | - | The template list of float type that connector generated, if user configured it, connector will randomly select an item from the template list | |
double.fake.mode | string | no | range | The fake mode of generating double data, support `range` and `template`, default `range`,if use configured it to `template`, user should also configured `double.template` option | | double.min | double | no | 0 | The min value of double data that connector generated | | double.max | double | no | 0x1.fffffffffffffP+1023 | The max value of double data that connector generated | | double.template | list | no | - | The template list of double type that connector generated, if user configured it, connector will randomly select an item from the template list | | vector.dimension | int | no | 4 | Dimension of the generated vector, excluding binary vectors | | binary.vector.dimension | int | no | 8 | Dimension of the generated binary vector | | vector.float.min | float | no | 0 | The min value of float data in vector that connector generated | | vector.float.max | float | no | 0x1.fffffeP+127 | The max value of float data in vector that connector generated | | common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ## Task Example ### Simple > This example randomly generates data of a specified type. If you want to learn how to declare field types, click [here](../../introduction/concepts/schema-feature.md#how-to-declare-type-supported).
```hocon schema = { fields { c_map = "map<string, array<int>>" c_map_nest = "map<string, {c_int = int, c_string = string}>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp c_row = { c_map = "map<string, array<int>>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } ``` ### Random Generation > 16 rows matching the declared types are randomly generated ```hocon source { # This is an example input plugin **only for test and demonstrate the feature input plugin** FakeSource { row.num = 16 schema = { fields { c_map = "map<string, string>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } plugin_output = "fake" } } ``` ### Customize the data content Simple > This is a self-defining data source information, defining whether each piece of data is an add or delete modification operation, and defining what each field stores ```hocon source { FakeSource { schema = { fields { c_map = "map<string, string>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } rows = [ { kind = INSERT fields = [{"a": "b"}, [101], "c_string", true, 117, 15987, 56387395, 7084913402530365000, 1.23, 1.23, "2924137191386439303744.39292216", null, "bWlJWmo=", "2023-04-22", "2023-04-22T23:20:58"] } { kind = UPDATE_BEFORE fields = [{"a": "c"}, [102], "c_string", true, 117, 15987,
56387395, 7084913402530365000, 1.23, 1.23, "2924137191386439303744.39292216", null, "bWlJWmo=", "2023-04-22", "2023-04-22T23:20:58"] } { kind = UPDATE_AFTER fields = [{"a": "e"}, [103], "c_string", true, 117, 15987, 56387395, 7084913402530365000, 1.23, 1.23, "2924137191386439303744.39292216", null, "bWlJWmo=", "2023-04-22", "2023-04-22T23:20:58"] } { kind = DELETE fields = [{"a": "f"}, [104], "c_string", true, 117, 15987, 56387395, 7084913402530365000, 1.23, 1.23, "2924137191386439303744.39292216", null, "bWlJWmo=", "2023-04-22", "2023-04-22T23:20:58"] } ] } } ``` > Due to the constraints of the [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md) specification, users cannot directly create byte sequence objects. FakeSource uses strings to assign `bytes` type values. In the example above, the `bytes` type field is assigned `"bWlJWmo="`, which is encoded from "miIZj" with **base64**. Hence, when assigning values to `bytes` type fields, please use strings encoded with **base64**. 
### Specified Data number Simple > This case specifies the number of data generated and the length of the generated value ```hocon FakeSource { row.num = 10 map.size = 10 array.size = 10 bytes.length = 10 string.length = 10 schema = { fields { c_map = "map<string, array<int>>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp c_row = { c_map = "map<string, array<int>>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } ``` ### Template data Simple > Randomly generated according to the specified template Using template ```hocon FakeSource { row.num = 5 string.fake.mode = "template" string.template = ["tyrantlucifer", "hailin", "kris", "fanjia", "zongwen", "gaojun"] tinyint.fake.mode = "template" tinyint.template = [1, 2, 3, 4, 5, 6, 7, 8, 9] smallint.fake.mode = "template" smallint.template = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19] int.fake.mode = "template" int.template = [20, 21, 22, 23, 24, 25, 26, 27, 28, 29] bigint.fake.mode = "template" bigint.template = [30, 31, 32, 33, 34, 35, 36, 37, 38, 39] float.fake.mode = "template" float.template = [40.0, 41.0, 42.0, 43.0] double.fake.mode = "template" double.template = [44.0, 45.0, 46.0, 47.0] schema { fields { c_string = string c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double } } } ``` ### Range data Simple > The specified data generation range is randomly generated ```hocon FakeSource { row.num = 5 string.template = ["tyrantlucifer", "hailin", "kris", "fanjia", "zongwen", "gaojun"] tinyint.min = 1 tinyint.max = 9 smallint.min = 10 smallint.max = 19 int.min = 20 int.max = 29
bigint.min = 30 bigint.max = 39 float.min = 40.0 float.max = 43.0 double.min = 44.0 double.max = 47.0 schema { fields { c_string = string c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double } } } ``` ### Generate Multiple tables > This is a case of generating a multi-data source test.table1 and test.table2 ```hocon FakeSource { tables_configs = [ { row.num = 16 schema { table = "test.table1" fields { c_string = string c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double } } }, { row.num = 17 schema { table = "test.table2" fields { c_string = string c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double } } } ] } ``` ### Options `rows` Case ```hocon rows = [ { kind = INSERT fields = [1, "A", 100] }, { kind = UPDATE_BEFORE fields = [1, "A", 100] }, { kind = UPDATE_AFTER fields = [1, "A_1", 100] }, { kind = DELETE fields = [1, "A_1", 100] } ] ``` ### Options `table-names` Case ```hocon source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { table-names = ["test.table1", "test.table2", "test.table3"] parallelism = 1 schema = { fields { name = "string" age = "int" } } } } ``` ### Options `defaultValue` Case Custom data can be generated by `row` and `columns`. 
For time types, use `CURRENT_TIMESTAMP`, `CURRENT_TIME`, or `CURRENT_DATE` to obtain the current time. ```hocon schema = { fields { pk_id = bigint name = string score = int time1 = timestamp time2 = time time3 = date } } # use rows rows = [ { kind = INSERT fields = [1, "A", 100, CURRENT_TIMESTAMP, CURRENT_TIME, CURRENT_DATE] } ] ``` ```hocon schema = { # use columns columns = [ { name = book_publication_time type = timestamp defaultValue = "2024-09-12 15:45:30" comment = "book publication time" }, { name = book_publication_time2 type = timestamp defaultValue = CURRENT_TIMESTAMP comment = "book publication time2" }, { name = book_publication_time3 type = time defaultValue = "15:45:30" comment = "book publication time3" }, { name = book_publication_time4 type = time defaultValue = CURRENT_TIME comment = "book publication time4" }, { name = book_publication_time5 type = date defaultValue = "2024-09-12" comment = "book publication time5" }, { name = book_publication_time6 type = date defaultValue = CURRENT_DATE comment = "book publication time6" } ] } ``` ### Use Vector Example ```hocon source { FakeSource { row.num = 10 # Low priority vector.dimension= 4 binary.vector.dimension = 8 # Low priority schema = { table = "simple_example" columns = [ { name = book_id type = bigint nullable = false defaultValue = 0 comment = "primary key id" }, { name = book_intro_1 type = binary_vector columnScale =8 comment = "vector" }, { name = book_intro_2 type = float16_vector columnScale =4 comment = "vector" }, { name = book_intro_3 type = bfloat16_vector columnScale =4 comment = "vector" }, { name = book_intro_4 type = sparse_float_vector columnScale =4 comment = "vector" } ] } } } ``` ### Auto-increment primary key Example ```hocon source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { plugin_output = "fake" auto.increment.enabled = true auto.increment.start = 1000 row.num = 50000 schema = { fields { id = "int" name = "string"
age = "int" } primaryKey { name = "pk" columnNames = [id] } } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/FtpFile.md ================================================ import ChangeLog from '../changelog/connector-file-ftp.md'; # FtpFile > Ftp file source connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] file format type - [x] text - [x] csv - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## Description Read data from an FTP file server. :::tip If you use Spark/Flink, you must ensure that your Spark/Flink cluster already has Hadoop integrated in order to use this connector. The tested Hadoop version is 2.x. If you use SeaTunnel Engine, the Hadoop jar is integrated automatically when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this.
::: ## Options | name | type | required | default value | |-----------------------------|---------|----------|-----------------------------| | host | string | yes | - | | port | int | yes | - | | user | string | yes | - | | password | string | yes | - | | path | string | yes | - | | file_format_type | string | yes | - | | connection_mode | string | no | active_local | | remote_verification_enabled | boolean | no | true | | delimiter/field_delimiter | string | no | \001 for text and , for csv | | row_delimiter | string | no | \n | | read_columns | list | no | - | | parse_partition_from_path | boolean | no | true | | date_format | string | no | yyyy-MM-dd | | datetime_format | string | no | yyyy-MM-dd HH:mm:ss | | time_format | string | no | HH:mm:ss | | skip_header_row_number | long | no | 0 | | schema | config | no | - | | sheet_name | string | no | - | | xml_row_tag | string | no | - | | xml_use_attr_format | boolean | no | - | | csv_use_header_line | boolean | no | - | | file_filter_pattern | string | no | - | | filename_extension | string | no | - | | compress_codec | string | no | none | | archive_compress_codec | string | no | none | | encoding | string | no | UTF-8 | | null_format | string | no | - | | binary_chunk_size | int | no | 1024 | | binary_complete_file_mode | boolean | no | false | | sync_mode | string | no | full | | target_path | string | no | - | | target_hadoop_conf | map | no | - | | update_strategy | string | no | distcp | | compare_mode | string | no | len_mtime | | common-options | | no | - | | file_filter_modified_start | string | no | - | | file_filter_modified_end | string | no | - | | quote_char | string | no | " | | escape_char | string | no | - | | metalake_type | string | no | gravitino | ### host [string] The target ftp host is required ### port [int] The target ftp port is required ### user [string] The target ftp user name is required ### password [string] The target ftp password is required ### path [string] The source file path. 
### remote_verification_enabled [boolean] Whether to enable remote host verification for FTP data channels, default is `true`. ### file_filter_pattern [string] Filter pattern, which used for filtering files. If you only want to filter based on file names, simply write the regular file names; If you want to filter based on the file directory at the same time, the expression needs to start with `path`. The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. There are some examples. If the `path` is `/data/seatunnel`, and the file structure example is: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv /data/seatunnel/20241012/logo.png ``` Matching Rules Example: **Example 1**: *Match all .txt files*,Regular Expression: ``` .*.txt ``` The result of this example matching is: ``` /data/seatunnel/20241001/report.txt ``` **Example 2**: *Match all file starting with abc*,Regular Expression: ``` abc.* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **Example 3**: *Match all files starting with abc in folder 20241007,And the fourth character is either h or g*, the Regular Expression: ``` /data/seatunnel/20241007/abc[h,g].* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv ``` **Example 4**: *Match third level folders starting with 202410 and files ending with .csv*, the Regular Expression: ``` /data/seatunnel/202410\d*/.*.csv ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### filename_extension [string] Filter filename extension, which used for filtering files with specific extension. Example: `csv` `.txt` `json` `.xml`. 
### file_format_type [string] File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` If you assign file type to `json` , you should also assign schema option to tell connector how to parse data to the row you want. For example: upstream data is the following: ```json {"code": 200, "data": "get success", "success": true} ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | If you assign file type to `text` `csv`, you can choose to specify the schema information or not. For example, upstream data is the following: ```text tyrantlucifer#26#male ``` If you do not assign a data schema, the connector will treat the upstream data as the following: | content | |-----------------------| | tyrantlucifer#26#male | If you assign a data schema, you should also set the `field_delimiter` option (except for the CSV file type). You should assign the schema and delimiter as the following: ```hocon field_delimiter = "#" schema { fields { name = string age = int gender = string } } ``` connector will generate data as the following: | name | age | gender | |---------------|-----|--------| | tyrantlucifer | 26 | male | If you assign file type to `binary`, SeaTunnel can synchronize files in any format, such as compressed packages, pictures, etc. In short, any files can be synchronized to the target place. Under this requirement, you need to ensure that the source and sink use `binary` format for file synchronization at the same time. You can find the specific usage in the example below. If you assign file type to `markdown`, SeaTunnel can parse markdown files and extract structured data. The markdown parser extracts various elements including headings, paragraphs, lists, code blocks, tables, and more.
Each element is converted to a row with the following schema: - `element_id`: Unique identifier for the element - `element_type`: Type of the element (Heading, Paragraph, ListItem, etc.) - `heading_level`: Level of heading (1-6, null for non-heading elements) - `text`: Text content of the element - `page_number`: Page number (default: 1) - `position_index`: Position index within the document - `parent_id`: ID of the parent element - `child_ids`: Comma-separated list of child element IDs Note: Markdown format only supports reading, not writing. ### connection_mode [string] The target ftp connection mode , default is active mode, supported as the following modes: `active_local` `passive_local` ### control_encoding [string] Character encoding for FTP control connection. Default is `UTF-8`. When file paths contain special characters (such as `$`, spaces, Chinese characters, etc.), this should be set to `UTF-8` to ensure paths can be parsed correctly. For example: `/data/whale_ops/share/$Fund-Product/DA - SANY (三一)/Daily/2025.08.18/file.xlsx` ### delimiter/field_delimiter [string] **delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead. Only need to be configured when file_format is text. Field delimiter, used to tell connector how to slice and dice fields. 
default `\001`, the same as hive's default delimiter ### row_delimiter [string] Only need to be configured when file_format is text Row delimiter, used to tell connector how to slice and dice rows default `\n` ### parse_partition_from_path [boolean] Control whether parse the partition keys and values from file path For example if you read a file from path `ftp://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` Every record data from file will be added these two fields: | name | age | |---------------|-----| | tyrantlucifer | 26 | Tips: **Do not define partition fields in schema option** ### date_format [string] Date type format, used to tell connector how to convert string to date, supported as the following formats: `yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` default `yyyy-MM-dd` ### datetime_format [string] Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats: `yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` default `yyyy-MM-dd HH:mm:ss` ### time_format [string] Time type format, used to tell connector how to convert string to time, supported as the following formats: `HH:mm:ss` `HH:mm:ss.SSS` default `HH:mm:ss` ### skip_header_row_number [long] Skip the first few lines, but only for the txt and csv. For example, set like following: `skip_header_row_number = 2` then SeaTunnel will skip the first 2 lines from source files ### schema [config] Only need to be configured when the file_format_type are text, json, excel, xml or csv ( Or other format we can't read the schema from metadata). The schema information of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). 
#### schema_url [string] The HTTP URL used to fetch the metadata information through the REST API, such as: `http://localhost:8090/api/metalakes/laowang_test/catalogs/221-pgsql/schemas/ykw/tables/all_type` > When using Gravitino as the metadata source, the column types from Gravitino will be automatically converted to SeaTunnel data types. For detailed type mapping information, please refer to [Gravitino Type Mapping](../../introduction/concepts/gravitino-type-mapping.md). ### metalake_type [string] The type of metalake service, currently only supports `gravitino`. When using `schema_url` to obtain metadata from Gravitino, you can specify this parameter (default is `gravitino`). For more information about Metalake, please refer to [Metalake](../../introduction/concepts/metalake.md). ### read_columns [list] The read column list of the data source, user can use it to implement field projection. ### sheet_name [string] The sheet of the workbook to read. Only used when file_format_type is excel. ### xml_row_tag [string] Only need to be configured when file_format is xml. Specifies the tag name of the data rows within the XML file. ### xml_use_attr_format [boolean] Only need to be configured when file_format is xml. Specifies whether to process data using the tag attribute format. ### csv_use_header_line [boolean] Whether to use the header line to parse the file, only used when the file_format is `csv` and the file contains a header line that matches RFC 4180 ### compress_codec [string] The compress codec of files. The supported codecs are as follows: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: automatically recognizes the compression type, no additional settings required.
### archive_compress_codec [string] The compress codec of archive files and the details that supported as the following shown: | archive_compress_codec | file_format | archive_compress_suffix | |------------------------|--------------------|-------------------------| | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | | GZ | txt,json,excel,xml | .gz | | NONE | all | .* | Note: gz compressed excel file needs to compress the original file or specify the file suffix, such as e2e.xls ->e2e_test.xls.gz ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. ### null_format [string] Only used when file_format_type is text. null_format to define which strings can be represented as null. e.g: `\N` ### binary_chunk_size [int] Only used when file_format_type is binary. The chunk size (in bytes) for reading binary files. Default is 1024 bytes. Larger values may improve performance for large files but use more memory. ### binary_complete_file_mode [boolean] Only used when file_format_type is binary. Whether to read the complete file as a single chunk instead of splitting into chunks. When enabled, the entire file content will be read into memory at once. Default is false. ### sync_mode [string] File sync mode. Supported values: `full` (default), `update`. When `update`, the source compares files between source/target and only reads new/changed files (currently only supports `file_format_type=binary`). **Performance considerations** - Update mode triggers an extra `getFileStatus` call on the target for each source file. - For remote file systems (FTP/SFTP), this adds per-file network overhead. It is not recommended for massive small-file scenarios. **Requirements / limitations** - `target_path` should typically align with sink `path` (same filesystem and same relative path layout). 
- When `update_strategy=distcp`, correctness depends on source/target clock synchronization. - When `compare_mode=checksum`, filesystem checksum support is required. If checksum is unavailable, SeaTunnel falls back to content comparison (more expensive) and logs a warning. Example: ```hocon sync_mode = "update" file_format_type = "binary" target_path = "/path/to/your/sink/path" update_strategy = "distcp" compare_mode = "len_mtime" ``` ### target_path [string] Only used when `sync_mode=update`. Target base path used for comparison (it should usually be the same as sink `path`). ### target_hadoop_conf [map] Only used when `sync_mode=update`. Extra Hadoop configuration for target filesystem. You can set `fs.defaultFS` in this map to override target defaultFS. ### update_strategy [string] Only used when `sync_mode=update`. Supported values: `distcp` (default), `strict`. ### compare_mode [string] Only used when `sync_mode=update`. Supported values: `len_mtime` (default), `checksum` (only valid when `update_strategy=strict`). ### file_filter_modified_start [string] File modification time filter. The connector will filter files based on the last modification start time (start time is inclusive). The default date format is `yyyy-MM-dd HH:mm:ss`. ### file_filter_modified_end [string] File modification time filter. The connector will filter files based on the last modification end time (end time is exclusive). The default date format is `yyyy-MM-dd HH:mm:ss`. ### quote_char [string] A single character that encloses CSV fields, allowing fields with commas, line breaks, or quotes to be read correctly. ### escape_char [string] A single character that allows the quote or other special characters to appear inside a CSV field without ending the field. ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details.
## Example ```hocon FtpFile { path = "/tmp/seatunnel/sink/text" host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao file_format_type = "text" schema = { name = string age = int } field_delimiter = "#" } ``` ### Multiple Table ```hocon FtpFile { tables_configs = [ { schema { table = "student" } path = "/tmp/seatunnel/sink/text" host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao file_format_type = "parquet" }, { schema { table = "teacher" } path = "/tmp/seatunnel/sink/text" host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao file_format_type = "parquet" } ] } ``` ```hocon FtpFile { tables_configs = [ { schema { fields { name = string age = int } } path = "/apps/hive/demo/student" file_format_type = "json" }, { schema { fields { name = string age = int } } path = "/apps/hive/demo/teacher" file_format_type = "json" } ] } ``` ### Transfer Binary File ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FtpFile { host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao path = "/seatunnel/read/binary/" file_format_type = "binary" binary_chunk_size = 2048 binary_complete_file_mode = false } } sink { // you can transfer local file to s3/hdfs/oss etc. FtpFile { host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao path = "/seatunnel/read/binary2/" file_format_type = "binary" } } ``` ### Incremental Sync (sync_mode=update, binary) `sync_mode=update` compares files between source and `target_path`, then only reads new/changed files. In most cases, `target_path` should be aligned with sink `path` (same filesystem and same relative paths).
```hocon env { parallelism = 1 job.mode = "BATCH" } source { FtpFile { host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao path = "/seatunnel/read/binary/" file_format_type = "binary" sync_mode = "update" target_path = "/seatunnel/read/binary2/" update_strategy = "distcp" compare_mode = "len_mtime" } } sink { FtpFile { host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao path = "/seatunnel/read/binary2/" tmp_path = "/seatunnel/read/binary2-tmp/" file_format_type = "binary" } } ``` ### Filter File ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FtpFile { host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao path = "/seatunnel/read/binary/" file_format_type = "binary" // file example abcD2024.csv file_filter_pattern = "abc[DX]*.*" } } sink { Console { } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Github.md ================================================ import ChangeLog from '../changelog/connector-http-github.md'; # Github > Github source connector ## Description Used to read data from Github. 
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------------|---------|----------|---------------| | url | String | Yes | - | | access_token | String | No | - | | method | String | No | get | | schema.fields | Config | No | - | | format | String | No | json | | params | Map | No | - | | body | String | No | - | | json_field | Config | No | - | | content_json | String | No | - | | poll_interval_millis | int | No | - | | retry | int | No | - | | retry_backoff_multiplier_ms | int | No | 100 | | retry_backoff_max_ms | int | No | 10000 | | enable_multi_lines | boolean | No | false | | common-options | config | No | - | ### url [String] http request url ### access_token [String] Github personal access token, see: [Creating a personal access token - GitHub Docs](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) ### method [String] http request method, only supports GET, POST method ### params [Map] http params ### body [String] http body ### poll_interval_millis [int] request http api interval(millis) in stream mode ### retry [int] The max retry times if request http return to `IOException` ### retry_backoff_multiplier_ms [int] The retry-backoff times(millis) multiplier if request http failed ### retry_backoff_max_ms [int] The maximum retry-backoff times(millis) if request http failed ### format [String] the format of upstream data, now only support `json` `text`, default `json`. 
When you set format to `json`, you should also set the schema option. For example, if the upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` you should set the schema as follows: ```hocon schema { fields { code = int data = string success = boolean } } ``` The connector will generate data as follows: | code | data | success | |------|-------------|---------| | 200 | get success | true | When you set format to `text`, the connector will do nothing to the upstream data. For example, if the upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` the connector will generate data as follows: | content | |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | ### schema [Config] #### fields [Config] the schema fields of upstream data ### content_json [String] This parameter can extract part of the JSON data. If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. If your return data looks something like this:
```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can configure `content_field = "$.store.book.*"` and the result returned looks like this: ```json [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ] ``` Then you can get the desired result with a simpler schema,like ```hocon Http { url = "http://mockserver:1080/contentjson/mock" method = "GET" format = "json" content_field = "$.store.book.*" schema = { fields { category = string author = string title = string price = string } } } ``` Here is an example: - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). ### json_field [Config] This parameter helps you configure the schema,so this parameter must be used with schema. 
If your data looks something like this: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can get the contents of 'book' by configuring the task as follows: ```hocon source { Http { url = "http://mockserver:1080/jsonpath/mock" method = "GET" format = "json" json_field = { category = "$.store.book[*].category" author = "$.store.book[*].author" title = "$.store.book[*].title" price = "$.store.book[*].price" } schema = { fields { category = string author = string title = string price = string } } } } ``` - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Example ```hocon Github { url = "https://api.github.com/orgs/apache/repos" access_token = "xxxx" method = "GET" format = "json" schema = { fields { id = int name = string description = string html_url = string stargazers_count = int forks = int } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Gitlab.md ================================================ import ChangeLog from '../changelog/connector-http-gitlab.md'; # Gitlab > Gitlab source connector ## Description Used to read data from Gitlab. 
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------------|---------|----------|---------------| | url | String | Yes | - | | access_token | String | Yes | - | | method | String | No | get | | schema.fields | Config | No | - | | format | String | No | json | | params | Map | No | - | | body | String | No | - | | json_field | Config | No | - | | content_json | String | No | - | | poll_interval_millis | int | No | - | | retry | int | No | - | | retry_backoff_multiplier_ms | int | No | 100 | | retry_backoff_max_ms | int | No | 10000 | | enable_multi_lines | boolean | No | false | | common-options | config | No | - | ### url [String] http request url ### access_token [String] personal access token ### method [String] http request method, only supports GET, POST method ### params [Map] http params ### body [String] http body ### poll_interval_millis [int] request http api interval(millis) in stream mode ### retry [int] The max retry times if request http return to `IOException` ### retry_backoff_multiplier_ms [int] The retry-backoff times(millis) multiplier if request http failed ### retry_backoff_max_ms [int] The maximum retry-backoff times(millis) if request http failed ### format [String] the format of upstream data, now only support `json` `text`, default `json`. 
When you set format to `json`, you should also set the schema option. For example, if the upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` you should set the schema as follows: ```hocon schema { fields { code = int data = string success = boolean } } ``` The connector will generate data as follows: | code | data | success | |------|-------------|---------| | 200 | get success | true | When you set format to `text`, the connector will do nothing to the upstream data. For example, if the upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` the connector will generate data as follows: | content | |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | ### schema [Config] #### fields [Config] the schema fields of upstream data ### content_json [String] This parameter can extract part of the JSON data. If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. If your return data looks something like this:
```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can configure `content_field = "$.store.book.*"` and the result returned looks like this: ```json [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ] ``` Then you can get the desired result with a simpler schema,like ```hocon Http { url = "http://mockserver:1080/contentjson/mock" method = "GET" format = "json" content_field = "$.store.book.*" schema = { fields { category = string author = string title = string price = string } } } ``` Here is an example: - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). ### json_field [Config] This parameter helps you configure the schema,so this parameter must be used with schema. 
If your data looks something like this: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can get the contents of 'book' by configuring the task as follows: ```hocon source { Http { url = "http://mockserver:1080/jsonpath/mock" method = "GET" format = "json" json_field = { category = "$.store.book[*].category" author = "$.store.book[*].author" title = "$.store.book[*].title" price = "$.store.book[*].price" } schema = { fields { category = string author = string title = string price = string } } } } ``` - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Example ```hocon Gitlab{ url = "https://gitlab.com/api/v4/projects" access_token = "xxxxx" schema { fields { id = int description = string name = string name_with_namespace = string path = string http_url_to_repo = string } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/GoogleSheets.md ================================================ import ChangeLog from '../changelog/connector-google-sheets.md'; # GoogleSheets > GoogleSheets source connector ## Description Used to read data from GoogleSheets. 
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [ ] file format - [ ] text - [ ] csv - [ ] json ## Options | name | type | required | default value | |---------------------|--------|----------|---------------| | service_account_key | string | yes | - | | sheet_id | string | yes | - | | sheet_name | string | yes | - | | range | string | yes | - | | schema | config | no | - | ### service_account_key [string] google cloud service account, base64 required ### sheet_id [string] sheet id in a Google Sheets URL ### sheet_name [string] the name of the sheet you want to import ### range [string] the range of the sheet you want to import ### schema [config] #### fields [config] The schema fields of upstream data. Please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). ## Example simple: ```hocon GoogleSheets { service_account_key = "seatunnel-test" sheet_id = "1VI0DvyZK-NIdssSdsDSsSSSC-_-rYMi7ppJiI_jhE" sheet_name = "sheets01" range = "A1:C3" schema = { fields { a = int b = string c = string } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/GraphQL.md ================================================ import ChangeLog from '../changelog/connector-graphql.md'; # GraphQL > GraphQL source connector ## Description Used to read data from GraphQL. 
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | | --------------------------- | ------- | -------- | ----------------------- | | url | String | Yes | - | | query | String | Yes | - | | variables | Config | No | - | | enable_subscription | boolean | No | false | | timeout | Long | No | - | | content_field | String | Yes | $.data.{query_object}.* | | schema.fields | Config | Yes | - | | params | Map | Yes | - | | poll_interval_millis | int | No | - | | retry | int | No | - | | retry_backoff_multiplier_ms | int | No | 100 | | retry_backoff_max_ms | int | No | 10000 | | enable_multi_lines | boolean | No | false | | common-options | config | No | - | ### url [String] http request url ### query [String] GraphQL expression query string ### variables [String] GraphQL Variables for example ``` variables = { limit = 2 } ``` ### enable_subscription [boolean] 1. true : Enable streaming subscription mode (WebSocket) 2. 
false : Enable batch query mode (HTTP) ### timeout [Long] Timeout period ### content_field [String] JSONPath wildcard ### params [Map] HTTP request params ### poll_interval_millis [int] The interval (in milliseconds) between HTTP API requests in stream mode ### retry [int] The maximum number of retries if the HTTP request throws an `IOException` ### retry_backoff_multiplier_ms [int] The retry-backoff time multiplier (in milliseconds) if the HTTP request fails ### retry_backoff_max_ms [int] The maximum retry-backoff time (in milliseconds) if the HTTP request fails ### schema [Config] Fill in a fixed value ```hocon schema = { fields { metric = "map" value = double time = long } } ``` #### fields [Config] the schema fields of upstream data ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Example ### Query ```hocon source { GraphQL { url = "http://192.168.1.103:9081/v1/graphql" content_field = "$.data.source" query = """ query MyQuery($limit: Int) { source(limit: $limit) { id val_bool val_double val_float } } """ variables = { limit = 2 } schema = { fields { id = "int" val_bool = "boolean" val_double = "double" val_float = "float" } } } } ``` ### Subscription ```hocon source { GraphQL { url = "http://192.168.1.103:9081/v1/graphql" content_field = "$.data.source" query = """ query MyQuery($limit: Int) { source(limit: $limit) { id val_bool val_double val_float } } """ variables = { limit = 2 } enable_subscription = true schema = { fields { id = "int" val_bool = "boolean" val_double = "double" val_float = "float" } } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Greenplum.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Greenplum > Greenplum source connector ## Description Read Greenplum data through [Jdbc connector](Jdbc.md).
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) supports query SQL and can achieve projection effect. - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) :::tip Optional jdbc drivers: - `org.postgresql.Driver` - `com.pivotal.jdbc.GreenplumDriver` Warning: for license compliance, if you use `GreenplumDriver` you have to provide the Greenplum JDBC driver yourself, e.g. copy greenplum-xxx.jar to $SEATUNNEL_HOME/lib for Standalone. ::: ## Options ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. ## Changelog ================================================ FILE: docs/en/connectors/source/Hbase.md ================================================ import ChangeLog from '../changelog/connector-hbase.md'; # Hbase > Hbase Source Connector ## Description Reads data from Apache Hbase.
## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [schema projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | Name | Type | Required | Default | |----------------------|-----------|-----------|---------| | zookeeper_quorum | string | Yes | - | | table | string | Yes | - | | schema | config | Yes | - | | hbase_extra_config | config | No | - | | caching | int | No | -1 | | batch | int | No | -1 | | cache_blocks | boolean | No | false | | is_binary_rowkey | boolean | No | false | | start_rowkey | string | No | - | | end_rowkey | string | No | - | | start_row_inclusive | boolean | No | true | | end_row_inclusive | boolean | No | false | | start_timestamp | long | No | - | | end_timestamp | long | No | - | | common-options | | No | - | ### zookeeper_quorum [string] The zookeeper quorum for Hbase cluster hosts, e.g., "hadoop001:2181,hadoop002:2181,hadoop003:2181". ### table [string] The name of the table to read from, e.g., "seatunnel". If your table lives in a custom namespace, use the `namespace:table` form (for example, `ns1:seatunnel_test`); when the namespace is omitted SeaTunnel will read from HBase's default namespace (`default`). ### schema [config] Hbase stores data in byte arrays. Therefore, you need to configure the data types for each column in the table. For more information, see: [guide](../../introduction/concepts/schema-feature.md#how-to-declare-type-supported). ### hbase_extra_config [config] Additional configurations for Hbase. ### caching The caching parameter sets the number of rows fetched per server trip during scans.
This reduces round-trips between client and server, improving scan efficiency. Default: -1. ### batch The batch parameter sets the maximum number of columns returned per scan. This is useful for rows with many columns to avoid fetching excessive data at once, thus saving memory and improving performance. ### cache_blocks The cache_blocks parameter determines whether to cache data blocks during scans. By default, HBase caches data blocks during scans. Setting this to false reduces memory usage during scans. Default in SeaTunnel: false. ### is_binary_rowkey The row key in HBase can be either a text string or binary data. In SeaTunnel, the row key is set to a text string by default (i.e., the default value of is_binary_rowkey is false). ### start_rowkey The start row of the scan ### end_rowkey The stop row of the scan ### start_row_inclusive Whether to include the start row in the scan range. When set to true, the start row is included in the scan results. Default: true (inclusive). **Note:** In most cases, you should keep the default value (true). Only modify this parameter if you have specific requirements for excluding the start row from your scan results. ### end_row_inclusive Whether to include the end row in the scan range. When set to false, the end row is excluded from the scan results, following the left-closed-right-open convention [start, end). Default: false (exclusive). **Note:** In most cases, you should keep the default value (false) which follows HBase's standard left-closed-right-open convention. Only modify this parameter if you need to include the end row in your scan results. **Important:** When using parallel reading with multiple splits, the combination of these two parameters is critical for data integrity: - **Default (start_row_inclusive=true, end_row_inclusive=false)**: This is the recommended configuration that ensures no data loss or duplication across splits. Each split follows the [start, end) convention. 
- **Both false (start_row_inclusive=false, end_row_inclusive=false)**: This may cause **data loss** at split boundaries, as the boundary rows will be excluded from all splits. - **Both true (start_row_inclusive=true, end_row_inclusive=true)**: This may cause **duplicate data** at split boundaries, as the boundary rows will be included in multiple adjacent splits. ### start_timestamp Start timestamp (inclusive) for scan time range. Unit: milliseconds since epoch. The time range follows [start, end). If only start_timestamp is set, the end is treated as open-ended. ### end_timestamp End timestamp (exclusive) for scan time range. Unit: milliseconds since epoch. The time range follows [start, end). If only end_timestamp is set, the start is treated as open-ended. **Notes:** - `start_timestamp` / `end_timestamp` must be >= 0. If both are set, `start_timestamp` must be < `end_timestamp` (time range is [start, end), so `start_timestamp == end_timestamp` produces an empty scan). - When `start_rowkey` / `end_rowkey` and `start_timestamp` / `end_timestamp` are configured together, both the rowkey range and the time range constraints are applied (intersection). ### common-options Common parameters for Source plugins, refer to [Common Source Options](../common-options/source-common-options.md). ## Example ```bash source { Hbase { zookeeper_quorum = "hadoop001:2181,hadoop002:2181,hadoop003:2181" table = "seatunnel_test" caching = 1000 batch = 100 cache_blocks = false is_binary_rowkey = false start_rowkey = "B" end_rowkey = "C" start_timestamp = 1700000000000 end_timestamp = 1700003600000 schema = { columns = [ { name = "rowkey" type = string }, { name = "columnFamily1:column1" type = boolean }, { name = "columnFamily1:column2" type = double }, { name = "columnFamily2:column1" type = bigint } ] } } } ``` ## Kerberos Example Note: - `connector-hbase` does not parse `krb5_path`, `kerberos_principal`, or `kerberos_keytab_path`. 
- Prepare Kerberos credentials and `krb5.conf` in the runtime environment (for example, `kinit -kt ...` or JVM `-Djava.security.krb5.conf=...`), and put HBase/Hadoop security settings into `hbase_extra_config`. ```hocon source { Hbase { zookeeper_quorum = "zk1:2181,zk2:2181,zk3:2181" table = "source_table" caching = 1000 batch = 200 cache_blocks = false is_binary_rowkey = false # HBase security config hbase_extra_config = { "hbase.security.authentication" = "kerberos" "hadoop.security.authentication" = "kerberos" "hbase.master.kerberos.principal" = "hbase/_HOST@REALM" "hbase.regionserver.kerberos.principal" = "hbase/_HOST@REALM" "hbase.rpc.protection" = "authentication" "hbase.zookeeper.useSasl" = "false" } schema = { columns = [ { name = "rowkey", type = string }, { name = "info:name", type = string }, { name = "info:score", type = string } ] } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/HdfsFile.md ================================================ import ChangeLog from '../changelog/connector-file-hadoop.md'; # HdfsFile > Hdfs File Source Connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table read](../../introduction/concepts/connector-v2-features.md) - [x] file format file - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## Description Read data from hdfs file system. ## Supported DataSource Info | Datasource | Supported Versions | |------------|--------------------| | HdfsFile | hadoop 2.x and 3.x | ## Source Options | Name | Type | Required | Default | Description | |----------------------------|---------|----------|-----------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | path | string | yes | - | The source file path. 
| | file_format_type | string | yes | - | We supported as the following file types:`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown`.Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. | | fs.defaultFS | string | yes | - | The hadoop cluster address that start with `hdfs://`, for example: `hdfs://hadoopcluster` | | read_columns | list | no | - | The read column list of the data source, user can use it to implement field projection.The file type supported column projection as the following shown:[text,json,csv,orc,parquet,excel,xml].Tips: If the user wants to use this feature when reading `text` `json` `csv` files, the schema option must be configured. | | hdfs_site_path | string | no | - | The path of `hdfs-site.xml`, used to load ha configuration of namenodes | | delimiter/field_delimiter | string | no | \001 for text and , for csv | Field delimiter, used to tell connector how to slice and dice fields when reading text files. default `\001`, the same as hive's default delimiter | | row_delimiter | string | no | \n | Row delimiter, used to tell connector how to slice and dice rows when reading text files. default `\n` | | parse_partition_from_path | boolean | no | true | Control whether parse the partition keys and values from file path. For example if you read a file from path `hdfs://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`. Every record data from file will be added these two fields:[name:tyrantlucifer,age:26].Tips:Do not define partition fields in schema option. 
| | date_format | string | no | yyyy-MM-dd | Date type format, used to tell connector how to convert string to date, supported as the following formats: `yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`. Default `yyyy-MM-dd`. | | datetime_format | string | no | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats: `yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss`. Default `yyyy-MM-dd HH:mm:ss`. | | time_format | string | no | HH:mm:ss | Time type format, used to tell connector how to convert string to time, supported as the following formats: `HH:mm:ss` `HH:mm:ss.SSS`. Default `HH:mm:ss`. | | remote_user | string | no | - | The login user name used to connect to hadoop. It is intended to be used for remote users in RPC, it won't have any credentials. | | krb5_path | string | no | /etc/krb5.conf | The krb5 path of kerberos | | kerberos_principal | string | no | - | The principal of kerberos | | kerberos_keytab_path | string | no | - | The keytab path of kerberos | | skip_header_row_number | long | no | 0 | Skip the first few lines, but only for the txt and csv. For example, if you set `skip_header_row_number = 2`, then SeaTunnel will skip the first 2 lines from source files. | | schema | config | no | - | The schema fields of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). **schema_url**: Get the http url of metadata information through restApi. When using Gravitino as the metadata source, the column types from Gravitino will be automatically converted to SeaTunnel data types. For detailed type mapping information, please refer to [Gravitino Type Mapping](../../introduction/concepts/gravitino-type-mapping.md).
**metalake_type**: The type of metalake service, currently only supports `gravitino`. For more information about Metalake, please refer to [Metalake](../../introduction/concepts/metalake.md). | | sheet_name | string | no | - | The sheet of the workbook to read. Only used when file_format is excel. | | xml_row_tag | string | no | - | Specifies the tag name of the data rows within the XML file, only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Specifies whether to process data using the tag attribute format, only used when file_format is xml. | | csv_use_header_line | boolean | no | false | Whether to use the header line to parse the file, only used when the file_format is `csv` and the file contains a header line that matches RFC 4180 | | file_filter_pattern | string | no | | Filter pattern, which is used for filtering files. | | filename_extension | string | no | - | Filter filename extension, which is used for filtering files with a specific extension. Example: `csv` `.txt` `json` `.xml`. | | compress_codec | string | no | none | The compress codec of files | | archive_compress_codec | string | no | none | The compress codec of archive files | | encoding | string | no | UTF-8 | Only used when file_format_type is json,text,csv,xml. The encoding of the file to read. | | null_format | string | no | - | Only used when file_format_type is text. null_format to define which strings can be represented as null. e.g: `\N` | | binary_chunk_size | int | no | 1024 | Only used when file_format_type is binary. The chunk size (in bytes) for reading binary files. Default is 1024 bytes. Larger values may improve performance for large files but use more memory. | | binary_complete_file_mode | boolean | no | false | Only used when file_format_type is binary. Whether to read the complete file as a single chunk instead of splitting into chunks. When enabled, the entire file content will be read into memory at once. Default is false. | | sync_mode | string | no | full | File sync mode. Supported values: `full`, `update`.
When `update`, the source compares files between source/target and only reads new/changed files (currently only supports `file_format_type=binary`). | | target_path | string | no | - | Only used when `sync_mode=update`. Target base path used for comparison (it should usually be the same as sink `path`). | | target_hadoop_conf | map | no | - | Only used when `sync_mode=update`. Extra Hadoop configuration for target filesystem. You can set `fs.defaultFS` in this map to override target defaultFS. | | update_strategy | string | no | distcp | Only used when `sync_mode=update`. Supported values: `distcp` (default), `strict`. | | compare_mode | string | no | len_mtime | Only used when `sync_mode=update`. Supported values: `len_mtime` (default), `checksum` (only valid when `update_strategy=strict`). | | common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../source-common-options.md) for details. | | file_filter_modified_start | string | no | - | File modification time filter. The connector will filter some files base on the last modification start time (include start time). The default data format is `yyyy-MM-dd HH:mm:ss`. | | file_filter_modified_end | string | no | - | File modification time filter. The connector will filter some files base on the last modification end time (not include end time). The default data format is `yyyy-MM-dd HH:mm:ss`. | | enable_file_split | boolean | no | false | Turn on logical file split to improve parallelism for huge files. Only supported for `text`/`csv`/`json`/`parquet` and non-compressed format. | | file_split_size | long | no | 134217728 | Split size in bytes when `enable_file_split=true`. For `text`/`csv`/`json`, the split end will be aligned to the next `row_delimiter`. For `parquet`, the split unit is RowGroup and will never break a RowGroup. 
| | quote_char | string | no | " | A single character that encloses CSV fields, allowing fields with commas, line breaks, or quotes to be read correctly. | | escape_char | string | no | - | A single character that allows the quote or other special characters to appear inside a CSV field without ending the field. | | metalake_type | string | no | gravitino | The type of metalake service, currently supports `gravitino`. | ### file_format_type [string] File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` If you assign file type to `markdown`, SeaTunnel can parse markdown files and extract structured data. The markdown parser extracts various elements including headings, paragraphs, lists, code blocks, tables, and more. Each element is converted to a row with the following schema: - `element_id`: Unique identifier for the element - `element_type`: Type of the element (Heading, Paragraph, ListItem, etc.) - `heading_level`: Level of heading (1-6, null for non-heading elements) - `text`: Text content of the element - `page_number`: Page number (default: 1) - `position_index`: Position index within the document - `parent_id`: ID of the parent element - `child_ids`: Comma-separated list of child element IDs Note: Markdown format only supports reading, not writing. ### delimiter/field_delimiter [string] **delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead. ### row_delimiter [string] Only need to be configured when file_format is text Row delimiter, used to tell connector how to slice and dice rows default `\n` ### file_filter_pattern [string] Filter pattern, which used for filtering files. If you only want to filter based on file names, simply write the regular file names; If you want to filter based on the file directory at the same time, the expression needs to start with `path`. The pattern follows standard regular expressions. 
For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. There are some examples. If the `path` is `/data/seatunnel`, and the file structure example is: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv /data/seatunnel/20241012/logo.png ``` Matching Rules Example: **Example 1**: *Match all .txt files*,Regular Expression: ``` .*.txt ``` The result of this example matching is: ``` /data/seatunnel/20241001/report.txt ``` **Example 2**: *Match all file starting with abc*,Regular Expression: ``` abc.* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **Example 3**: *Match all files starting with abc in folder 20241007,And the fourth character is either h or g*, the Regular Expression: ``` /data/seatunnel/20241007/abc[h,g].* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv ``` **Example 4**: *Match third level folders starting with 202410 and files ending with .csv*, the Regular Expression: ``` /data/seatunnel/202410\d*/.*.csv ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### compress_codec [string] The compress codec of files and the details that supported as the following shown: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: automatically recognizes the compression type, no additional settings required. 
### archive_compress_codec [string] The compress codec of archive files and the details that supported as the following shown: | archive_compress_codec | file_format | archive_compress_suffix | |------------------------|--------------------|-------------------------| | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | | GZ | txt,json,excel,xml | .gz | | NONE | all | .* | Note: gz compressed excel file needs to compress the original file or specify the file suffix, such as e2e.xls ->e2e_test.xls.gz ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. ### binary_chunk_size [int] Only used when file_format_type is binary. The chunk size (in bytes) for reading binary files. Default is 1024 bytes. Larger values may improve performance for large files but use more memory. ### binary_complete_file_mode [boolean] Only used when file_format_type is binary. Whether to read the complete file as a single chunk instead of splitting into chunks. When enabled, the entire file content will be read into memory at once. Default is false. ### sync_mode [string] File sync mode. Supported values: `full` (default), `update`. When `sync_mode=update`, the source will compare files between source/target and only read new/changed files (currently only supports `file_format_type=binary`). ### target_path [string] Only used when `sync_mode=update`. Target base path used for comparison (it should usually be the same as sink `path`). ### target_hadoop_conf [map] Only used when `sync_mode=update`. Extra Hadoop configuration for target filesystem (optional). If not set, it reuses the source filesystem configuration. You can set `fs.defaultFS` in this map to override target defaultFS, e.g. `"fs.defaultFS" = "hdfs://nn2:9000"`. ### update_strategy [string] Only used when `sync_mode=update`. 
Supported values: `distcp` (default), `strict`. - `distcp`: similar to `distcp -update`: - target file not exists → COPY - length differs → COPY - `mtime(source) > mtime(target)` → COPY - else → SKIP - `strict`: strict consistency, decided by `compare_mode`. ### compare_mode [string] Only used when `sync_mode=update`. Supported values: `len_mtime` (default), `checksum`. - `len_mtime`: SKIP only when both `len` and `mtime` are equal, otherwise COPY. - `checksum`: SKIP only when `len` is equal and Hadoop `getFileChecksum` is equal, otherwise COPY (only valid when `update_strategy=strict`). ### enable_file_split [boolean] Turn on the file splitting function, the default is false. It can be selected when the file type is csv, text, json, parquet and non-compressed format. - `text`/`csv`/`json`: split by `file_split_size` and align to the next `row_delimiter` to avoid breaking records. - `parquet`: split by RowGroup (logical split), never breaks a RowGroup. **Recommendations** - Enable when reading a few large files and you want higher read parallelism. - Disable when reading many small files, or when parallelism is low (splitting adds overhead). **Limitations** - Not supported for compressed files (`compress_codec` != `none`) or archive files (`archive_compress_codec` != `none`) — it will fall back to non-splitting. - For `text`/`csv`/`json`, actual split size may be larger than `file_split_size` because the split end is aligned to the next `row_delimiter`. ### file_split_size [long] File split size, which can be filled in when the enable_file_split parameter is true. The unit is the number of bytes. The default value is the number of bytes of 128MB, which is 134217728. **Tuning** - Start with the default (128MB). Decrease it if parallelism is under-utilized; increase it if the number of splits is too large. - Rough rule: `file_split_size ≈ file_size / desired_parallelism`. 
### quote_char [string] A single character that encloses CSV fields, allowing fields with commas, line breaks, or quotes to be read correctly. ### escape_char [string] A single character that allows the quote or other special characters to appear inside a CSV field without ending the field. ### Tips > If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. ## Task Example ### Simple > This example defines a SeaTunnel synchronization task that read data from Hdfs and sends it to Hdfs. ``` # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { HdfsFile { schema { fields { name = string age = int } } path = "/apps/hive/demo/student" file_format_type = "json" fs.defaultFS = "hdfs://namenode001" } # If you would like to get more information about how to configure seatunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connectors/source } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transforms } sink { HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" file_format_type = "orc" } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connectors/sink } ``` ### Filter File ```hocon env { parallelism = 1 job.mode = "BATCH" } source { HdfsFile { path = "/apps/hive/demo/student" file_format_type = "json" fs.defaultFS = "hdfs://namenode001" // file example abcD2024.csv file_filter_pattern = "abc[DX]*.*" } } sink { Console { } } ``` ### Multiple 
Table ```hocon env { parallelism = 1 job.mode = "BATCH" } source { HdfsFile { tables_configs = [ { schema = { table = "student" } path = "/apps/hive/demo/student" file_format_type = "json" fs.defaultFS = "hdfs://namenode001" }, { schema = { table = "teacher" } path = "/apps/hive/demo/teacher" file_format_type = "json" fs.defaultFS = "hdfs://namenode001" } ] } } sink { HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/${table_name}" file_format_type = "orc" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Hive.md ================================================ import ChangeLog from '../changelog/connector-hive.md'; # Hive > Hive source connector ## Description Read data from Hive. When using markdown format, SeaTunnel can parse markdown files stored in Hive tables and extract structured data with elements like headings, paragraphs, lists, code blocks, and tables. Each element is converted to a row with the following schema: - `element_id`: Unique identifier for the element - `element_type`: Type of the element (Heading, Paragraph, ListItem, etc.) - `heading_level`: Level of heading (1-6, null for non-heading elements) - `text`: Text content of the element - `page_number`: Page number (default: 1) - `position_index`: Position index within the document - `parent_id`: ID of the parent element - `child_ids`: Comma-separated list of child element IDs Note: Markdown format only supports reading, not writing. :::tip In order to use this connector, You must ensure your spark/flink cluster already integrated hive. The tested hive version is 2.3.9 and 3.1.3 . If you use SeaTunnel Engine, You need put seatunnel-hadoop3-3.1.4-uber.jar and hive-exec-3.1.3.jar and libfb303-0.9.3.jar in $SEATUNNEL_HOME/lib/ dir. 
::: ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. - [x] [schema projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] file format - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] markdown ## Options | name | type | required | default value | |-----------------------|--------|----------|----------------| | table_name | string | yes | - | | use_regex | boolean| no | false | | metastore_uri | string | yes | - | | krb5_path | string | no | /etc/krb5.conf | | kerberos_principal | string | no | - | | kerberos_keytab_path | string | no | - | | hdfs_site_path | string | no | - | | hive_site_path | string | no | - | | hive.hadoop.conf | Map | no | - | | hive.hadoop.conf-path | string | no | - | | read_partitions | list | no | - | | read_columns | list | no | - | | compress_codec | string | no | none | | common-options | | no | - | ### table_name [string] Target Hive table name eg: `db1.table1`. When `use_regex = true`, this field uses `databasePattern.tablePattern` (Hive has no schema) to match multiple tables from Hive metastore. ### use_regex [boolean] Whether to treat `table_name` as a regular expression pattern for matching multiple tables (whole database / subset). This also works inside each entry of `table_list` / `tables_configs`. Regex syntax notes: - The dot (`.`) is treated as the separator between database and table patterns (Hive only supports `database.table`). - Only one unescaped dot is allowed (as the database/table separator). If you need to use dot (`.`) in a regular expression (e.g. 
`.*`), you must escape it as `\.` (in a HOCON string, write `\\.`). - Examples: `db0.\.*`, `db1.user_table_[0-9]+`, `db[1-2].(app|web)order_\.*`. - In SeaTunnel job config (HOCON string), backslashes need escaping. For example, the regex `db0.\.*` should be configured as `db0.\\.*`. - `db0.\.*` matches all tables in database `db0` (whole database synchronization). - `\.*.\.*` matches all tables in all databases (whole Hive synchronization). ### metastore_uri [string] Hive metastore uri. Supports comma-separated multiple URIs for HA/failover (whitespace is ignored). SeaTunnel passes this value to Hive `hive.metastore.uris` and uses Hive `RetryingMetaStoreClient` (if available) to retry/failover between URIs. This is client-side endpoint failover; make sure your metastores share/replicate the same backend to keep metadata consistent. ### hdfs_site_path [string] The path of `hdfs-site.xml`, used to load ha configuration of namenodes ### hive.hadoop.conf [map] Properties in hadoop conf('core-site.xml', 'hdfs-site.xml', 'hive-site.xml') ### hive.hadoop.conf-path [string] The specified loading path for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files ### read_partitions [list] The target partitions that user want to read from hive table, if user does not set this parameter, it will read all the data from hive table. **Tips: Every partition in partitions list should have the same directory depth. For example, a hive table has two partitions: par1 and par2, if user sets it like as the following:** **read_partitions = [par1=xxx, par1=yyy/par2=zzz], it is illegal** ### krb5_path [string] The path of `krb5.conf`, used to authentication kerberos ### kerberos_principal [string] The principal of kerberos authentication ### kerberos_keytab_path [string] The keytab file path of kerberos authentication ### read_columns [list] The read column list of the data source, user can use it to implement field projection. 
### compress_codec [string] The compress codec of files and the details that supported as the following shown: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: automatically recognizes the compression type, no additional settings required. ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Example ### Example 1: Single table ```bash Hive { table_name = "default.seatunnel_orc" metastore_uri = "thrift://namenode001:9083" } ``` ### Example 2: Metastore URI failover ```bash Hive { table_name = "default.seatunnel_orc" metastore_uri = "thrift://metastore-1:9083,thrift://metastore-2:9083" } ``` ### Example 3: Multiple tables > Note: Hive is a structured data source and should be use 'table_list', and 'tables_configs' will be removed in the future. > You can also set `use_regex = true` in each table config to match multiple tables. ```bash Hive { table_list = [ { table_name = "default.seatunnel_orc_1" metastore_uri = "thrift://namenode001:9083" }, { table_name = "default.seatunnel_orc_2" metastore_uri = "thrift://namenode001:9083" } ] } ``` ```bash Hive { tables_configs = [ { table_name = "default.seatunnel_orc_1" metastore_uri = "thrift://namenode001:9083" }, { table_name = "default.seatunnel_orc_2" metastore_uri = "thrift://namenode001:9083" } ] } ``` ### Example 3: Regex matching (whole database / subset) ```bash Hive { metastore_uri = "thrift://namenode001:9083" # 1) Whole database: all tables in database `a` table_name = "a.\\.*" use_regex = true } ``` ```bash Hive { metastore_uri = "thrift://namenode001:9083" # 2) Whole Hive: all tables in all databases table_name = "\\.*.\\.*" use_regex = true } ``` ```bash Hive { metastore_uri = "thrift://namenode001:9083" # 3) Subset: tables matching `tmp_.*` in database `a` # Note: escape the dot wildcard as `\.` (in HOCON string, write `\\.`) because unescaped dots are treated as separators table_name = 
"a.tmp_\\.*" use_regex = true } ``` ### Example 4 : Kerberos ```bash source { Hive { table_name = "default.test_hive_sink_on_hdfs_with_kerberos" metastore_uri = "thrift://metastore:9083" hive.hadoop.conf-path = "/tmp/hadoop" plugin_output = hive_source hive_site_path = "/tmp/hive-site.xml" kerberos_principal = "hive/metastore.seatunnel@EXAMPLE.COM" kerberos_keytab_path = "/tmp/hive.keytab" krb5_path = "/tmp/krb5.conf" } } ``` Description: - `hive_site_path`: The path to the `hive-site.xml` file. - `kerberos_principal`: The principal for Kerberos authentication. - `kerberos_keytab_path`: The keytab file path for Kerberos authentication. - `krb5_path`: The path to the `krb5.conf` file used for Kerberos authentication. Run the case: ```bash env { parallelism = 1 job.mode = "BATCH" } source { Hive { table_name = "default.test_hive_sink_on_hdfs_with_kerberos" metastore_uri = "thrift://metastore:9083" hive.hadoop.conf-path = "/tmp/hadoop" plugin_output = hive_source hive_site_path = "/tmp/hive-site.xml" kerberos_principal = "hive/metastore.seatunnel@EXAMPLE.COM" kerberos_keytab_path = "/tmp/hive.keytab" krb5_path = "/tmp/krb5.conf" } } sink { Assert { plugin_input = hive_source rules { row_rules = [ { rule_type = MAX_ROW rule_value = 3 } ], field_rules = [ { field_name = pk_id field_type = bigint field_value = [ { rule_type = NOT_NULL } ] }, { field_name = name field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = score field_type = int field_value = [ { rule_type = NOT_NULL } ] } ] } } } ``` ## Hive on s3 ### Step 1 Create the lib dir for hive of emr. ```shell mkdir -p ${SEATUNNEL_HOME}/plugins/Hive/lib ``` ### Step 2 Get the jars from maven center to the lib. 
```shell cd ${SEATUNNEL_HOME}/plugins/Hive/lib wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.6.5/hadoop-aws-2.6.5.jar wget https://repo1.maven.org/maven2/org/apache/hive/hive-exec/2.3.9/hive-exec-2.3.9.jar ``` ### Step 3 Copy the jars from your environment on emr to the lib dir. ```shell cp /usr/share/aws/emr/emrfs/lib/emrfs-hadoop-assembly-2.60.0.jar ${SEATUNNEL_HOME}/plugins/Hive/lib cp /usr/share/aws/emr/hadoop-state-pusher/lib/hadoop-common-3.3.6-amzn-1.jar ${SEATUNNEL_HOME}/plugins/Hive/lib cp /usr/share/aws/emr/hadoop-state-pusher/lib/javax.inject-1.jar ${SEATUNNEL_HOME}/plugins/Hive/lib cp /usr/share/aws/emr/hadoop-state-pusher/lib/aopalliance-1.0.jar ${SEATUNNEL_HOME}/plugins/Hive/lib ``` ### Step 4 Run the case. ```shell env { parallelism = 1 job.mode = "BATCH" } source { Hive { table_name = "test_hive.test_hive_sink_on_s3" metastore_uri = "thrift://ip-192-168-0-202.cn-north-1.compute.internal:9083" hive.hadoop.conf-path = "/home/ec2-user/hadoop-conf" hive.hadoop.conf = { bucket="s3://ws-package" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" } read_columns = ["pk_id", "name", "score"] } } sink { Hive { table_name = "test_hive.test_hive_sink_on_s3_sink" metastore_uri = "thrift://ip-192-168-0-202.cn-north-1.compute.internal:9083" hive.hadoop.conf-path = "/home/ec2-user/hadoop-conf" hive.hadoop.conf = { bucket="s3://ws-package" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" } } } ``` ## Hive on oss ### Step 1 Create the lib dir for hive of emr. ```shell mkdir -p ${SEATUNNEL_HOME}/plugins/Hive/lib ``` ### Step 2 Get the jars from maven center to the lib. ```shell cd ${SEATUNNEL_HOME}/plugins/Hive/lib wget https://repo1.maven.org/maven2/org/apache/hive/hive-exec/2.3.9/hive-exec-2.3.9.jar ``` ### Step 3 Copy the jars from your environment on emr to the lib dir and delete the conflicting jar. 
```shell cp -r /opt/apps/JINDOSDK/jindosdk-current/lib/jindo-*.jar ${SEATUNNEL_HOME}/plugins/Hive/lib rm -f ${SEATUNNEL_HOME}/lib/hadoop-aliyun-*.jar ``` ### Step 4 Run the case. ```shell env { parallelism = 1 job.mode = "BATCH" } source { Hive { table_name = "test_hive.test_hive_sink_on_oss" metastore_uri = "thrift://master-1-1.c-1009b01725b501f2.cn-wulanchabu.emr.aliyuncs.com:9083" hive.hadoop.conf-path = "/tmp/hadoop" hive.hadoop.conf = { bucket="oss://emr-osshdfs.cn-wulanchabu.oss-dls.aliyuncs.com" } } } sink { Hive { table_name = "test_hive.test_hive_sink_on_oss_sink" metastore_uri = "thrift://master-1-1.c-1009b01725b501f2.cn-wulanchabu.emr.aliyuncs.com:9083" hive.hadoop.conf-path = "/tmp/hadoop" hive.hadoop.conf = { bucket="oss://emr-osshdfs.cn-wulanchabu.oss-dls.aliyuncs.com" } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/HiveJdbc.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # HiveJdbc > JDBC Hive Source Connector ## Support Hive Version - Definitely supports 3.1.3 and 3.1.2, other versions need to be tested. ## Timeout Parameter Support The `socket_timeout_ms` and `connect_timeout_ms` parameters are tested with **Hive 3.2.0+**. For earlier versions (including 3.1.x), these parameters have not been verified yet. The parameters will be passed to the JDBC driver, but their effectiveness depends on the Hive version being used. ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) > supports query SQL and can achieve projection effect. ## Description Read external data source data through JDBC. ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------|----------------------------------------------------------|---------------------------------|--------------------------------------|--------------------------------------------------------------------------| | Hive | Different dependency version has different driver class. | org.apache.hive.jdbc.HiveDriver | jdbc:hive2://localhost:10000/default | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-jdbc) | ## Database Dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' > working directory
> For example Hive datasource: cp hive-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping | Hive Data Type | SeaTunnel Data Type | |-------------------------------------------------------------------------------------------|---------------------| | BOOLEAN | BOOLEAN | | TINYINT
SMALLINT | SHORT | | INT
INTEGER | INT | | BIGINT | LONG | | FLOAT | FLOAT | | DOUBLE
DOUBLE PRECISION | DOUBLE | | DECIMAL(x,y)
NUMERIC(x,y)
(Get the designated column's specified column size.<38) | DECIMAL(x,y) | | DECIMAL(x,y)
NUMERIC(x,y)
(Get the designated column's specified column size.>38) | DECIMAL(38,18) | | CHAR
VARCHAR
STRING | STRING | | DATE | DATE | | DATETIME
TIMESTAMP | TIMESTAMP | | BINARY
ARRAY
INTERVAL
MAP
STRUCT
UNIONTYPE | Not supported yet | ## Source Options | Name | Type | Required | Default | Description | |------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:hive2://localhost:10000/default | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use Hive the value is `org.apache.hive.jdbc.HiveDriver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | Yes | - | Query statement | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | | socket_timeout_ms | Int | No | 86400000 | Socket timeout in milliseconds for reading data from the server. Set to 0 for no timeout. Note: Tested with Hive 3.2.0+. For earlier versions, not yet verified. | | connect_timeout_ms | Int | No | 86400000 | Connection timeout in milliseconds for establishing connection to the server. Set to 0 for no timeout. Note: Tested with Hive 3.2.0+. For earlier versions, not yet verified. | | partition_column | String | No | - | The column name used to partition the data for parallel reads. Only numeric types are supported (the column must be a numeric primary key), and only one column can be configured. | | partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan. If not set, SeaTunnel will query the database to get the min value. | | partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan. If not set, SeaTunnel will query the database to get the max value. | | partition_num | Int | No | job parallelism | The number of partitions; only positive integers are supported. The default value is the job parallelism. | | fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure
the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.
Zero means use jdbc default value. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | | use_kerberos | Boolean | No | no | Whether to enable Kerberos, default is false | | kerberos_principal | String | No | - | When use kerberos, we should set kerberos principal such as 'test_user@xxx'. | | kerberos_keytab_path | String | No | - | When use kerberos, we should set kerberos principal file path such as '/home/test/test_user.keytab' . | | krb5_path | String | No | /etc/krb5.conf | When use kerberos, we should set krb5 path file path such as '/seatunnel/krb5.conf' or use the default path '/etc/krb5.conf '. | ### Tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed > in parallel according to the concurrency of tasks , When your shard read field is a large number type such as bigint( > and above and the data is not evenly distributed, it is recommended to set the parallelism level to 1 to ensure that > the > data skew problem is resolved ## Task Example ### Simple > This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its > fields. You can also specify which fields to query for final output to the console. 
``` # Defining the runtime environment env { parallelism = 2 job.mode = "BATCH" } source{ Jdbc { url = "jdbc:hive2://localhost:10000/default" driver = "org.apache.hive.jdbc.HiveDriver" connection_check_timeout_sec = 100 query = "select * from type_bin limit 16" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} } ``` ### Parallel > Read your query table in parallel with the shard field you configured and the shard data You can do this if you want > to read the whole table ``` source { Jdbc { url = "jdbc:hive2://localhost:10000/default" driver = "org.apache.hive.jdbc.HiveDriver" connection_check_timeout_sec = 100 # Define query logic as required query = "select * from type_bin" # Parallel sharding reads fields partition_column = "id" # Number of fragments partition_num = 10 } } ``` ### Parallel Boundary > It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read > your data source according to the upper and lower boundaries you configured ``` source { Jdbc { url = "jdbc:hive2://localhost:10000/default" driver = "org.apache.hive.jdbc.HiveDriver" connection_check_timeout_sec = 100 # Define query logic as required query = "select * from type_bin" partition_column = "id" # Read start boundary partition_lower_bound = 1 # Read end boundary partition_upper_bound = 500 partition_num = 10 } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Http.md ================================================ import ChangeLog from '../changelog/connector-http.md'; # Http > Http source connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description Used to read data from Http. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) Supported DataSource Info ------------------------- In order to use the Http connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Dependency | |------------|--------------------|------------------------------------------------------------------------------------| | Http | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-http) | ## Source Options | Name | Type | Required | Default | Description | |-------------------------------|---------|----------|-------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | Http request url. | | schema | Config | No | - | Http and seatunnel data structure mapping. 
For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). | | schema.fields | Config | No | - | The schema fields of upstream data | | json_field | Config | No | - | This parameter helps you configure the schema,so this parameter must be used with schema. | | pageing | Config | No | - | This parameter is used for paging queries | | pageing.page_field | String | No | - | This parameter is used to specify the page field name in the request. It can be used in headers, params, or body with placeholders like ${page_field}. | | pageing.use_placeholder_replacement | Boolean | No | false | If true, use placeholder replacement (${field}) for headers, parameters and body values, otherwise use key-based replacement. | | pageing.total_page_size | Int | No | - | This parameter is used to control the total number of pages | | pageing.batch_size | Int | No | - | The batch size returned per request is used to determine whether to continue when the total number of pages is unknown | | pageing.start_page_number | Int | No | 1 | Specify the page number from which synchronization starts | | pageing.page_type | String | No | PageNumber | this parameter is used to specify the page type ,or PageNumber if not set, only support `PageNumber` and `Cursor`. | | pageing.cursor_field | String | No | - | this parameter is used to specify the Cursor field name in the request parameter. | | pageing.cursor_response_field | String | No | - | This parameter specifies the field in the response from which the cursor is retrieved. | | content_field | String | No | - | This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. | | format | String | No | text | The format of upstream data, now only support `json` `text`, default `text`. | | method | String | No | get | Http request method, only supports GET, POST method. | | headers | Map | No | - | Http headers. 
| | params | Map | No | - | Http params. | | body | String | No | - | Http body,the program will automatically add http header application/json,body is jsonbody. | | poll_interval_millis | Int | No | - | Request http api interval(millis) in stream mode. | | retry | Int | No | - | The max retry times if request http return to `IOException`. | | retry_backoff_multiplier_ms | Int | No | 100 | The retry-backoff times(millis) multiplier if request http failed. | | retry_backoff_max_ms | Int | No | 10000 | The maximum retry-backoff times(millis) if request http failed | | enable_multi_lines | Boolean | No | false | | | connect_timeout_ms | Int | No | 12000 | Connection timeout setting, default 12s. | | socket_timeout_ms | Int | No | 60000 | Socket timeout setting, default 60s. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | | keep_params_as_form | Boolean | No | false | Whether the params are submitted according to the form, used for compatibility with legacy behaviors. When true, the value of the params parameter is submitted through the form. | | keep_page_param_as_http_param | Boolean | No | false | Whether to set the paging parameters to params. 
For compatibility with legacy behaviors.| | json_filed_missed_return_null | Boolean | No | false | When the json field is missing, set true return null else error.| ## How to Create a Http Data Synchronization Jobs ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Http { plugin_output = "http" url = "http://mockserver:1080/example/http" method = "GET" format = "json" schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { C_MAP = "map" C_ARRAY = "array" C_STRING = string C_BOOLEAN = boolean C_TINYINT = tinyint C_SMALLINT = smallint C_INT = int C_BIGINT = bigint C_FLOAT = float C_DOUBLE = double C_BYTES = bytes C_DATE = date C_DECIMAL = "decimal(38, 18)" C_TIMESTAMP = timestamp } } } } } # Console printing of the read Http data sink { Console { parallelism = 1 } } ``` ## Parameter Interpretation ### format when you assign format is `json`, you should also assign schema option, for example: upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | when you assign format is `text`, connector will do nothing for upstream data, for example: upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` connector will generate data as the following: | content | |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | ### keep_params_as_form For compatibility with old versions of http. When set to true,`` and `` will be submitted in the form. 
When set to false,`` will be added to the url path,and `` will not be added to the body or form. It will replace placeholders in params and body. ### keep_page_param_as_http_param Whether to set the paging parameters to params. When set to true,`` is set to ``. When set to false,When the page field exists in `` or ``, replace value. When set to false,config example: ```hocon body="""{"id":1,"page":"${page}"}""" ``` ```hocon params={ page: "${page}" } ``` ### params By default, the parameters will be added to the url path. If you need to keep the old version behavior, please check keep_params_as_form. ### body The HTTP body is used to carry the actual data in requests or responses, including JSON, form submissions. The reference format is as follows: ```hocon body="{"id":1,"name":"seatunnel"}" ``` For form submissions,please set the content-type as follows. ```hocon headers { Content-Type = "application/x-www-form-urlencoded" } ``` ### content_field This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. If your return data looks something like this. 
```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can configure `content_field = "$.store.book.*"` and the result returned looks like this: ```json [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ] ``` Then you can get the desired result with a simpler schema,like ```hocon Http { url = "http://mockserver:1080/contentjson/mock" method = "GET" format = "json" content_field = "$.store.book.*" schema = { fields { category = string author = string title = string price = string } } } ``` Here is an example: - Test data can be found at this link [mockserver-config.json](seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_contentjson_to_assert.conf](seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). ### json_field This parameter helps you configure the schema,so this parameter must be used with schema. 
If your data looks something like this: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can get the contents of 'book' by configuring the task as follows: ```hocon source { Http { url = "http://mockserver:1080/jsonpath/mock" method = "GET" format = "json" json_field = { category = "$.store.book[*].category" author = "$.store.book[*].author" title = "$.store.book[*].title" price = "$.store.book[*].price" } schema = { fields { category = string author = string title = string price = string } } } } ``` - Test data can be found at this link [mockserver-config.json](seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_jsonpath_to_assert.conf](seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). ### pageing The current supported pagination type are `PageNumber` and `Cursor`. if you need to use pagination, you need to configure `pageing`. the default pagination type is `PageNumber`. #### 1. PageNumber When using `PageNumber` pagination, you can include page parameters in different parts of your HTTP request: - **In URL parameters**: Add the page parameter to the `params` section - **In request body**: Include the page parameter in the `body` JSON - **In headers**: Add the page parameter to the `headers` section You can use placeholders like `${page}` with `use_placeholder_replacement = true` to dynamically update these values. 
The placeholders can be used in various formats: - As a standalone value: `"${page}"` - With prefix/suffix: `"10${page}"` or `"page-${page}"` - As a number without quotes: `${page}` (in JSON body) - In nested JSON structures: `{"pagination":{"page":${page}}}` ##### Example 1: Using page parameters in body and params ```hocon source { Http { url = "http://localhost:8080/mock/queryData" method = "POST" format = "json" body="""{"id":1,"page":"${page}"}""" content_field = "$.data.*" params={ page: "${page}" } pageing={ #you can not set this parameter ,the default value is PageNumber page_type="PageNumber" total_page_size=20 page_field=page use_placeholder_replacement=true #when don't know the total_page_size use batch_size if read size ================================================ FILE: docs/en/connectors/source/Iceberg.md ================================================ import ChangeLog from '../changelog/connector-iceberg.md'; # Apache Iceberg > Apache Iceberg source connector ## Support Iceberg Version - 1.6.1 ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] data format - [x] parquet - [x] orc - [x] avro - [x] iceberg catalog - [x] hadoop(2.7.1 , 2.7.5 , 3.1.3) - [x] hive(2.3.9 , 3.1.2) ## Description Source connector for Apache Iceberg. It can support batch and stream mode. ## Supported DataSource Info | Datasource | Dependent | Maven | |------------|-----------|---------------------------------------------------------------------------| | Iceberg | hive-exec | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) | | Iceberg | libfb303 | [Download](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) | ## Database Dependency > In order to be compatible with different versions of Hadoop and Hive, the scope of hive-exec in the project pom file are provided, so if you use the Flink engine, first you may need to add the following Jar packages to /lib directory, if you are using the Spark engine and integrated with Hadoop, then you do not need to add the following Jar packages. If you are using the hadoop s3 catalog, you need to add the hadoop-aws,aws-java-sdk jars for your Flink and Spark engine versions. (Additional locations: /lib, /jars) ``` hive-exec-xxx.jar libfb303-xxx.jar ``` > Some versions of the hive-exec package do not have libfb303-xxx.jar, so you also need to manually import the Jar package. 
## Data Type Mapping | Iceberg Data type | SeaTunnel Data type | |-------------------|---------------------| | BOOLEAN | BOOLEAN | | INTEGER | INT | | LONG | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | STRING | STRING | | FIXED
BINARY | BYTES | | DECIMAL | DECIMAL | | STRUCT | ROW | | LIST | ARRAY | | MAP | MAP | ## Source Options | Name | Type | Required | Default | Description | |--------------------------|---------|----------|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | catalog_name | string | yes | - | User-specified catalog name. | | namespace | string | yes | - | The iceberg database name in the backend catalog. | | table | string | no | - | The iceberg table name in the backend catalog. | | table_list | string | no | - | The iceberg table list in the backend catalog. | | iceberg.catalog.config | map | yes | - | Specify the properties for initializing the Iceberg catalog, which can be referenced in this file: [CatalogProperties.java](https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/CatalogProperties.java) | | hadoop.config | map | no | - | Properties passed through to the Hadoop configuration | | iceberg.hadoop-conf-path | string | no | - | The specified loading paths for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files. | | schema | config | no | - | Use projection to select data columns and columns order. | | case_sensitive | boolean | no | false | If data columns where selected via schema [config], controls whether the match to the schema will be done with case sensitivity. 
| | start_snapshot_timestamp | long | no | - | Instructs this scan to look for changes starting from the most recent snapshot for the table as of the timestamp.
timestamp – the timestamp in millis since the Unix epoch | | start_snapshot_id | long | no | - | Instructs this scan to look for changes starting from a particular snapshot (exclusive). | | end_snapshot_id | long | no | - | Instructs this scan to look for changes up to a particular snapshot (inclusive). | | use_snapshot_id | long | no | - | Instructs this scan to use the given snapshot ID. | | use_snapshot_timestamp | long | no | - | Instructs this scan to use the most recent snapshot as of the given time in milliseconds. timestamp – the timestamp in millis since the Unix epoch | | stream_scan_strategy | enum | no | FROM_LATEST_SNAPSHOT | Starting strategy for stream mode execution; defaults to `FROM_LATEST_SNAPSHOT` if no value is specified. The optional values are:
TABLE_SCAN_THEN_INCREMENTAL: Do a regular table scan then switch to the incremental mode.
FROM_LATEST_SNAPSHOT: Start incremental mode from the latest snapshot inclusive.
FROM_EARLIEST_SNAPSHOT: Start incremental mode from the earliest snapshot inclusive.
FROM_SNAPSHOT_ID: Start incremental mode from a snapshot with a specific id inclusive.
FROM_SNAPSHOT_TIMESTAMP: Start incremental mode from a snapshot with a specific timestamp inclusive. | | increment.scan-interval | long | no | 2000 | The interval of increment scan(millis) | | common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. | | query | String | no | - | The select DML to select the iceberg data. It mustn't contain the table name, and doesn't support alias. For example: `select * from table where f1 > 100`, `select fn from table where f1 > 100`. The current support for the LIKE syntax is limited: the LIKE clause shouldn't start with `%`. The supported one is: `select f1 from t where f2 like 'tom%' ` | ## Task Example ### Simple ```hocon env { parallelism = 2 job.mode = "BATCH" } source { Iceberg { catalog_name = "seatunnel" iceberg.catalog.config={ type = "hadoop" warehouse = "file:///tmp/seatunnel/iceberg/hadoop/" } namespace = "database1" table = "source" query = "select fn from table where f1 > 100" plugin_output = "iceberg" } } transform { } sink { Console { plugin_input = "iceberg" } } ``` ### Multi-Table Read ```hocon source { Iceberg { catalog_name = "seatunnel" iceberg.catalog.config = { type = "hadoop" warehouse = "file:///tmp/seatunnel/iceberg/hadoop/" } namespace = "database1" table_list = [ { table = "table_1" }, { table = "table_2" query = "select fn from table where f1 > 100" } ] plugin_output = "iceberg" } } ``` ### Hadoop S3 Catalog ```hocon source { iceberg { catalog_name = "seatunnel" iceberg.catalog.config={ "type"="hadoop" "warehouse"="s3a://your_bucket/spark/warehouse/" } hadoop.config={ "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" "fs.s3a.endpoint" = "s3.cn-north-1.amazonaws.com.cn" "fs.s3a.access.key" = "xxxxxxxxxxxxxxxxx" "fs.s3a.secret.key" = "xxxxxxxxxxxxxxxxx" "fs.defaultFS" = "s3a://your_bucket" } namespace = "your_iceberg_database" table = "your_iceberg_table" 
plugin_output = "iceberg_test" } } ``` ### Hive Catalog ```hocon source { Iceberg { catalog_name = "seatunnel" iceberg.catalog.config={ type = "hive" uri = "thrift://localhost:9083" warehouse = "hdfs://your_cluster//tmp/seatunnel/iceberg/" } catalog_type = "hive" namespace = "your_iceberg_database" table = "your_iceberg_table" } } ``` ### Column Projection ```hocon source { Iceberg { catalog_name = "seatunnel" iceberg.catalog.config={ type = "hadoop" warehouse = "hdfs://your_cluster/tmp/seatunnel/iceberg/" } namespace = "your_iceberg_database" table = "your_iceberg_table" schema { fields { f2 = "boolean" f1 = "bigint" f3 = "int" f4 = "bigint" } } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/InfluxDB.md ================================================ import ChangeLog from '../changelog/connector-influxdb.md'; # InfluxDB > InfluxDB source connector ## Description Read external data source data through InfluxDB. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) supports query SQL and can achieve projection effect. 
- [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |--------------------|--------|----------|---------------| | url | string | yes | - | | sql | string | yes | - | | schema | config | yes | - | | database | string | yes | | | username | string | no | - | | password | string | no | - | | lower_bound | long | no | - | | upper_bound | long | no | - | | partition_num | int | no | - | | split_column | string | no | - | | epoch | string | no | n | | connect_timeout_ms | long | no | 15000 | | query_timeout_sec | int | no | 3 | | common-options | config | no | - | ### url the url to connect to influxDB e.g. ``` http://influxdb-host:8086 ``` ### sql [string] The query sql used to search data ``` select name,age from test ``` ### schema [config] #### fields [Config] The schema information of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). e.g. ``` schema { fields { name = string age = int } } ``` ### database [string] The `influxDB` database ### username [string] the username of the influxDB when you select ### password [string] the password of the influxDB when you select ### split_column [string] the `split_column` of the influxDB when you select > Tips: > - influxDB tags is not supported as a segmented primary key because the type of tags can only be a string > - influxDB time is not supported as a segmented primary key because the time field cannot participate in mathematical calculation > - Currently, `split_column` only supports integer data segmentation, and does not support `float`, `string`, `date` and other types. 
### upper_bound [long] upper bound of the `split_column` column ### lower_bound [long] lower bound of the `split_column` column ``` split the $split_column range into $partition_num parts if partition_num is 1, use the whole `split_column` range if partition_num < (upper_bound - lower_bound), use (upper_bound - lower_bound) partitions eg: lower_bound = 1, upper_bound = 10, partition_num = 2 sql = "select * from test where age > 0 and age < 10" split result split 1: select * from test where ($split_column >= 1 and $split_column < 6) and ( age > 0 and age < 10 ) split 2: select * from test where ($split_column >= 6 and $split_column < 11) and ( age > 0 and age < 10 ) ``` ### partition_num [int] the `partition_num` of the InfluxDB when you select > Tips: Ensure that `upper_bound` minus `lower_bound` is divisible by `partition_num`, otherwise the query results will overlap ### epoch [string] returned time precision - Optional values: H, m, s, MS, u, n - default value: n ### query_timeout_sec [int] the `query_timeout` of the InfluxDB when you select, in seconds ### connect_timeout_ms [long] the timeout for connecting to InfluxDB, in milliseconds ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Examples Example of multi parallelism and multi partition scanning ```hocon source { InfluxDB { url = "http://influxdb-host:8086" sql = "select label, value, rt, time from test" database = "test" upper_bound = 100 lower_bound = 1 partition_num = 4 split_column = "value" schema { fields { label = STRING value = INT rt = STRING time = BIGINT } } } } ``` Example of not using partition scan ```hocon source { InfluxDB { url = "http://influxdb-host:8086" sql = "select label, value, rt, time from test" database = "test" schema { fields { label = STRING value = INT rt = STRING time = BIGINT } } } } ``` ## Changelog ================================================ FILE:
docs/en/connectors/source/IoTDB.md ================================================ import ChangeLog from '../changelog/connector-iotdb.md'; # IoTDB > IoTDB source connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Used to read data from IoTDB. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) > IoTDB allows column projection using SQL query. - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Supported DataSource Info | Datasource | Supported Versions | Url | |------------|------------------------------|----------------| | IoTDB | `0.13.0 <= version <= 1.3.X` | localhost:6667 | ## Data Type Mapping | IotDB Data Type | SeaTunnel Data Type | |-----------------|---------------------| | BOOLEAN | BOOLEAN | | INT32 | TINYINT | | INT32 | SMALLINT | | INT32 | INT | | INT64 | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | TEXT | STRING | ## Source Options | Name | Type | Required | Default Value | Description | |----------------------------|---------|----------|---------------|-------------------------------------------------------------------------------------------------------------------| | node_urls | string | yes | - | IoTDB cluster address, the format is `"host1:port"` or `"host1:port,host2:port"` | | username | string | yes | - | IoTDB user username | | password | string | yes | - | IoTDB user password | | sql | string | yes | - | execute sql statement | | schema | config | yes | - | The data schema. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). 
| | fetch_size | int | no | - | the fetch_size of the IoTDB when you select | | lower_bound | long | no | - | the lower_bound of the IoTDB when you select | | upper_bound | long | no | - | the upper_bound of the IoTDB when you select | | num_partitions | int | no | - | the num_partitions of the IoTDB when you select | | thrift_default_buffer_size | int | no | - | the thrift_default_buffer_size of the IoTDB when you select | | thrift_max_frame_size | int | no | - | the thrift max frame size | | enable_cache_leader | boolean | no | - | enable_cache_leader of the IoTDB when you select | | version | string | no | - | SQL semantic version used by the client, The possible values are: `V_0_12`, `V_0_13` | | common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | We can use time column as a partition key in SQL queries. #### num_partitions [int] the number of partitions ### upper_bound [long] the upper bound of the time range ### lower_bound [long] the lower bound of the time range ``` split the time range into numPartitions parts if numPartitions = 1, the whole time range will be used if numPartitions < (upper_bound - lower_bound), will use (upper_bound - lower_bound) as numPartitions eg: lower_bound = 1, upper_bound = 10, numPartitions = 2 sql = "select * from test where age > 0 and age < 10" split result: split 1: select * from test where (time >= 1 and time < 6) and ( age > 0 and age < 10 ) split 2: select * from test where (time >= 6 and time < 11) and ( age > 0 and age < 10 ) ``` ## Examples ```hocon env { parallelism = 2 job.mode = "BATCH" } source { IoTDB { node_urls = "localhost:6667" username = "root" password = "root" sql = "SELECT temperature, moisture, c_int, c_bigint, c_float, c_double, c_string, c_boolean FROM root.test_group.* WHERE time < 4102329600000 align by device" schema { fields { ts = timestamp device_name = string temperature = float moisture = 
bigint c_int = int c_bigint = bigint c_float = float c_double = double c_string = string c_boolean = boolean } } } } sink { Console { } } ``` The data format from upstream IoTDB is as follows: ```shell IoTDB> SELECT temperature, moisture, c_int, c_bigint, c_float, c_double, c_string, c_boolean FROM root.test_group.* WHERE time < 4102329600000 align by device; +------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+ | Time| Device| temperature| moisture| c_int| c_bigint| c_float| c_double| c_string| c_boolean| +------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+ |2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1| 21474836470| 1.0f| 1.0d| abc| true| |2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 2| 21474836470| 2.0f| 2.0d| abc| true| |2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 3| 21474836470| 3.0f| 3.0d| abc| true| +------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+ ``` The data format loaded to SeaTunnelRow is as follows: | ts | device_name | temperature | moisture | c_int | c_bigint | c_float | c_double | c_string | c_boolean | |---------------|--------------------------|-------------|----------|-------|-------------|---------|----------|----------|-----------| | 1664035200001 | root.test_group.device_a | 36.1 | 100 | 1 | 21474836470 | 1.0f | 1.0d | abc | true | | 1664035200001 | root.test_group.device_b | 36.2 | 101 | 2 | 21474836470 | 2.0f | 2.0d | abc | true | | 1664035200001 | root.test_group.device_c | 36.3 | 102 | 3 | 21474836470 | 3.0f | 3.0d | abc | true | ## Changelog ================================================ FILE: docs/en/connectors/source/IoTDBv2.md ================================================ import ChangeLog from 
'../changelog/connector-iotdb.md'; # IoTDB > IoTDB source connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Description Used to read data from IoTDB. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) > IoTDB allows column projection using SQL query. - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Supported DataSource Info | Datasource | Supported Versions | Url | |------------|--------------------|----------------| | IoTDB | `2.0 <= version` | localhost:6667 | ## Data Type Mapping | IoTDB Data Type | SeaTunnel Data Type | |-----------------|---------------------| | BOOLEAN | BOOLEAN | | INT32 | TINYINT | | INT32 | SMALLINT | | INT32 | INT | | INT64 | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | TEXT | STRING | | STRING | STRING | | TIMESTAMP | BIGINT | | TIMESTAMP | TIMESTAMP | | BLOB | STRING | | DATE | DATE | ## Source Options | Name | Type | Required | Default Value | Description | |----------------------------|---------|----------|---------------|-------------------------------------------------------------------------------------------------------------------| | node_urls | Array | Yes | - | IoTDB cluster address, the format is `["host1:port"]` or `["host1:port","host2:port"]` | | username | String | Yes | - | IoTDB username | | password | String | Yes | - | IoTDB user password | | sql_dialect | String | No | tree | The sql dialect of IoTDB, available options are `"tree"` or `"table"` | | database | String | No | - | The database selected (only valid when `sql_dialect` is `"table"`) | | sql | String | Yes | - | The sql statement to be executed | | schema | Config | Yes | - | The data schema. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md).
| | fetch_size | Integer | No | - | The fetch_size of the IoTDB when you select | | lower_bound | Long | No | - | The lower_bound of the IoTDB when you select | | upper_bound | Long | No | - | The upper_bound of the IoTDB when you select | | num_partitions | Integer | No | - | The num_partitions of the IoTDB when you select | | default_thrift_buffer_size | Integer | No | - | The thrift_default_buffer_size of the IoTDB when you select | | max_thrift_frame_size | Integer | No | - | The thrift max frame size | | enable_cache_leader | Boolean | No | - | Enable_cache_leader of the IoTDB when you select | | common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | We can use time column as a partition key in SQL queries. #### num_partitions [int] the number of partitions ### upper_bound [long] the upper bound of the time range ### lower_bound [long] the lower bound of the time range ``` split the time range into numPartitions parts if numPartitions = 1, the whole time range will be used if numPartitions < (upper_bound - lower_bound), will use (upper_bound - lower_bound) as numPartitions eg: lower_bound = 1, upper_bound = 10, numPartitions = 2 sql = "select * from test where age > 0 and age < 10" split result: split 1: select * from test where (time >= 1 and time < 6) and ( age > 0 and age < 10 ) split 2: select * from test where (time >= 6 and time < 11) and ( age > 0 and age < 10 ) ``` ## Examples ### Example 1: Read data from IoTDB-tree ```hocon env { parallelism = 2 job.mode = "BATCH" } source { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" sql = "SELECT temperature, moisture, c_int, c_bigint, c_float, c_double, c_string, c_boolean FROM root.test_group.* WHERE time < 4102329600000 align by device" schema { fields { ts = timestamp device_name = string temperature = float moisture = bigint c_int = int c_bigint = bigint c_float = float 
c_double = double c_string = string c_boolean = boolean } } } } sink { Console { } } ``` The data format from upstream IoTDB is as follows: ```shell IoTDB> SELECT temperature, moisture, c_int, c_bigint, c_float, c_double, c_string, c_boolean FROM root.test_group.* WHERE time < 4102329600000 align by device; +------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+ | Time| Device| temperature| moisture| c_int| c_bigint| c_float| c_double| c_string| c_boolean| +------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+ |2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1| 21474836470| 1.0f| 1.0d| abc| true| |2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 2| 21474836470| 2.0f| 2.0d| abc| true| |2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 3| 21474836470| 3.0f| 3.0d| abc| true| +------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+ ``` The data format loaded to SeaTunnelRow is as follows: | ts | device_name | temperature | moisture | c_int | c_bigint | c_float | c_double | c_string | c_boolean | |---------------|--------------------------|-------------|----------|-------|-------------|---------|----------|----------|-----------| | 1664035200001 | root.test_group.device_a | 36.1 | 100 | 1 | 21474836470 | 1.0f | 1.0d | abc | true | | 1664035200001 | root.test_group.device_b | 36.2 | 101 | 2 | 21474836470 | 2.0f | 2.0d | abc | true | | 1664035200001 | root.test_group.device_c | 36.3 | 102 | 3 | 21474836470 | 3.0f | 3.0d | abc | true | ### Example 2:Read data from IoTDB-table ```hocon env { parallelism = 2 job.mode = "BATCH" } source { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" sql_dialect = "table" database = "test_database" sql = "SELECT 
time, sn, type, bidprice, bidsize, domain, buyno, askprice FROM test_table" schema { fields { ts = timestamp sn = string type = string bidprice = int bidsize = double domain = boolean buyno = bigint askprice = string } } } } sink { Console { } } ``` > If database is specified in SQL query, the `database` option is not required. The data format from upstream IoTDB is as follows: ```shell IoTDB> SELECT time, sn, type, bidprice, bidsize, domain, buyno, askprice FROM test_table +-----------------------------+------+----+--------+------------------+------+-----+-----------+ | time| sn|type|bidprice| bidsize|domain|buyno| askprice| +-----------------------------+------+----+--------+------------------+------+-----+-----------+ |2025-07-30T17:52:34.851+08:00|0700HK| L1| 9|10.323907796459721| true| 10|-1064754527| |2025-07-30T17:52:34.951+08:00|0700HK| L1| 10| 9.844574317657585| false| 9|-1088662576| |2025-07-30T17:52:35.051+08:00|0700HK| L1| 9| 9.272974132434069| true| 9| 402003616| +-----------------------------+------+----+--------+------------------+------+-----+-----------+ ``` The data format loaded to SeaTunnelRow is as follows: | ts | sn | type | bidprice | bidsize | domain | buyno | askprice | |-------------------------|--------|------|----------|--------------------|--------|-------|-------------| | 2025-07-30T17:52:34.851 | 0700HK | L1 | 9 | 10.323907796459721 | true | 10 | -1064754527 | | 2025-07-30T17:52:34.951 | 0700HK | L1 | 10 | 9.844574317657585 | false | 9 | -1088662576 | | 2025-07-30T17:52:35.051 | 0700HK | L1 | 9 | 9.272974132434069 | true | 9 | 402003616 | ## Changelog ================================================ FILE: docs/en/connectors/source/Jdbc.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # JDBC > JDBC source connector ## Description Read external data source data through JDBC. 
:::tip Warning: for license compliance, you have to provide the database driver yourself and copy it to the `$SEATUNNEL_HOME/lib/` directory for it to work. For example, if you use MySQL, you should download `mysql-connector-java-xxx.jar` and copy it to `$SEATUNNEL_HOME/lib/`. For Spark/Flink, you should also copy it to `$SPARK_HOME/jars/` or `$FLINK_HOME/lib/`. ::: ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in the directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in the directory `${SEATUNNEL_HOME}/lib/`. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) supports SQL queries, which can achieve the projection effect.
- [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table read](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | description | |--------------------------------------------|---------|----------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:postgresql://localhost/test | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source, if you use MySQL the value is `com.mysql.cj.jdbc.Driver`. | | username | String | No | - | userName | | password | String | No | - | password | | query | String | No | - | Query statement | | compatible_mode | String | No | - | The compatible mode of database, required when the database supports multiple compatible modes.
For example, when using OceanBase database, you need to set it to 'mysql' or 'oracle'.
When using StarRocks, you need to set it to `starrocks`. | | dialect | String | No | - | The specified dialect. If it does not exist, the dialect is still determined from the url; otherwise it takes priority over the url.
For example, when using StarRocks, you need to set it to `starrocks`. | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | | partition_column | String | No | - | The column name used to split data. | | partition_upper_bound | Long | No | - | The max value of partition_column for the scan. If not set, SeaTunnel will query the database to get the max value. | | partition_lower_bound | Long | No | - | The min value of partition_column for the scan. If not set, SeaTunnel will query the database to get the min value. | | partition_num | Int | No | job parallelism | Not recommended for use. The correct approach is to control the number of splits through `split.size`.
**Note:** This parameter takes effect only when using the `query` parameter. It does not take effect when using the `table_path` parameter. | | decimal_type_narrowing | Boolean | No | true | Decimal type narrowing. If true, the decimal type will be narrowed to the int or long type when this can be done without loss of precision. Currently only supported for Oracle. Please refer to `decimal_type_narrowing` below | | int_type_narrowing | Boolean | No | true | Int type narrowing. If true, the tinyint(1) type will be narrowed to the boolean type when this can be done without loss of precision. Currently supported for MySQL. Please refer to `int_type_narrowing` below | | handle_blob_as_string | Boolean | No | false | If true, BLOB type will be converted to STRING type. **Only supported for Oracle database**. This is useful for handling large BLOB fields in Oracle that exceed the default size limit. When transmitting Oracle's BLOB fields to systems like Doris, setting this to true can make the data transfer more efficient. | | use_select_count | Boolean | No | false | Use select count for table count rather than other methods in dynamic chunk split stage. This is currently only available for jdbc-oracle. In this scenario, select count is used directly when it is faster than updating statistics with the analyze-table SQL | | skip_analyze | Boolean | No | false | Skip the analysis of table count in dynamic chunk split stage. This is currently only available for jdbc-oracle. Use it when you schedule analyze-table SQL to update the related table statistics periodically, or when your table data does not change frequently | | use_regex | Boolean | No | false | Control regular expression matching for table_path. When set to `true`, the table_path will be treated as a regular expression pattern. When set to `false` or not specified, the table_path will be treated as an exact path (no regex matching).
| | fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure the row fetch size used in the query to improve performance by reducing the number of database hits required to satisfy the selection criteria. Zero means use the JDBC default value. | | properties | Map | No | - | Additional connection configuration parameters. When properties and the URL have the same parameters, the priority is determined by the
specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | table_path | String | No | - | The path to the full path of table, you can use this configuration instead of `query`.
examples:
`- mysql: "testdb.table1" `
`- oracle: "test_schema.table1" `
`- sqlserver: "testdb.test_schema.table1"`
`- postgresql: "testdb.test_schema.table1"`
`- iris: "test_schema.table1"` | | table_list | Array | No | - | The list of tables to be read, you can use this configuration instead of `table_path` | | where_condition | String | No | - | Common row filter conditions for all tables/queries, must start with `where`. For example, `where id > 100` | | split.size | Int | No | 8096 | How many rows are in one split; captured tables are split into multiple splits when reading a table. **Note**: This parameter takes effect only when using the `table_path` parameter. It does not take effect when using the `query` parameter. | | split.even-distribution.factor.lower-bound | Double | No | 0.05 | Not recommended for use.
The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor (i.e., (MAX(id) - MIN(id) + 1) / row count) is calculated to be greater than or equal to this lower bound, the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `split.sample-sharding.threshold`. The default value is 0.05. | | split.even-distribution.factor.upper-bound | Double | No | 100 | Not recommended for use.
The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor (i.e., (MAX(id) - MIN(id) + 1) / row count) is calculated to be less than or equal to this upper bound, the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `split.sample-sharding.threshold`. The default value is 100.0. | | split.sample-sharding.threshold | Int | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `split.even-distribution.factor.upper-bound` and `split.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | | split.inverse-sampling.rate | Int | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. | | split.string_split_mode | String | No | sample | Supports different string splitting algorithms.
By default, `sample` is used to determine the split by sampling the string value. You can switch to `charset_based` to enable charset-based string splitting algorithm. When set to `charset_based`, the algorithm assumes characters of partition_column are within ASCII range 32-126, which covers most character-based splitting scenarios. | | split.string_split_mode_collate | String | No | - | Specifies the collation to use when string_split_mode is set to `charset_based` and the table has a special collation. If not specified, the database's default collation will be used. | ### Table Matching The JDBC Source connector supports two ways to specify tables: #### Notes - Many JDBC drivers treat `DatabaseMetaData.getColumns(..., schemaPattern, tableNamePattern, ...)` as SQL LIKE patterns. If your schema/table names contain `_` or `%`, column discovery may return rows from other tables. SeaTunnel filters the returned metadata rows by exact schema/table identifier to avoid mixing columns. - For case-sensitive databases, make sure the configured schema/table names use the exact identifier case. 1. **Exact Table Path**: Use `table_path` to specify a single table with its full path. ```hocon table_path = "testdb.table1" ``` 2. **Regular Expression**: Use `table_path` with a regex pattern to match multiple tables. ```hocon table_path = "testdb.table\\d+" # Matches table1, table2, table3, etc. use_regex = true ``` #### Regular Expression Support for Table Names The JDBC connector supports using regular expressions to match multiple tables. This feature allows you to process multiple tables with a single source configuration. #### Configuration To use regular expression matching for table paths: 1. Set `use_regex = true` to enable regex matching 2. 
If `use_regex` is not set or set to `false`, the connector will treat the table_path as an exact path (no regex matching) #### Regular Expression Syntax Notes - **Path Separator**: The dot (`.`) is treated as a separator between database, schema, and table names. - **Escaped Dots**: If you need to use a dot (`.`) as a wildcard character in your regular expression to match any character, you must escape it with a backslash (`\.`). - **Path Format**: For paths like `database.table` or `database.schema.table`, the last unescaped dot separates the table pattern from the database/schema pattern. - **Pattern Examples**: - `test.table\\d+` - Matches tables like `table1`, `table2`, etc. in the `test` database - `test.*` - Matches all tables in the `test` database (for whole database synchronization) - `postgres.public.test_db_\.*` - Matches all tables that start with `test_db_` in the `public` schema of the `postgres` database #### Example ```hocon source { Jdbc { url = "jdbc:mysql://localhost:3306/test" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "password" table_list = [ { # Regex matching - match any table in test database table_path = "test.*" use_regex = true }, { # Regex matching - match tables with "user" followed by digits table_path = "test.user\\d+" use_regex = true }, { # Exact matching - simple table name table_path = "test.config" # use_regex not specified, defaults to false }, ] } } ``` #### Multi-table Synchronization When using either regular expressions, the connector will read data from all matching tables. Each table will be processed independently, and the data will be combined in the output. 
Example configuration for multi-table synchronization: ```hocon Jdbc { url = "jdbc:mysql://localhost/test" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "123456" # Using regular expression with explicit configuration table_list = [ { table_path = "testdb.table\\d+" use_regex = true } ] } ``` ### decimal_type_narrowing Decimal type narrowing. If true, the decimal type will be narrowed to the int or long type when this can be done without loss of precision. Currently only supported for Oracle. eg: decimal_type_narrowing = true | Oracle | SeaTunnel | |---------------|-----------| | NUMBER(1, 0) | Boolean | | NUMBER(6, 0) | INT | | NUMBER(10, 0) | BIGINT | decimal_type_narrowing = false | Oracle | SeaTunnel | |---------------|----------------| | NUMBER(1, 0) | Decimal(1, 0) | | NUMBER(6, 0) | Decimal(6, 0) | | NUMBER(10, 0) | Decimal(10, 0) | ### int_type_narrowing Int type narrowing. If true, the tinyint(1) type will be narrowed to the boolean type when this can be done without loss of precision. Currently supported for MySQL. eg: int_type_narrowing = true | MySQL | SeaTunnel | |------------|-----------| | TINYINT(1) | Boolean | int_type_narrowing = false | MySQL | SeaTunnel | |------------|-----------| | TINYINT(1) | TINYINT | ### dialect [string] The specified dialect. If it does not exist, the dialect is still determined from the url; otherwise it takes priority over the url. For example, when using StarRocks, you need to set it to `starrocks`. Similarly, when using MySQL, you need to set its value to `mysql`. If a dialect is not supported by SeaTunnel, the default dialect `GenericDialect` will be used. Just make sure the driver you provide supports the database you want to connect to.
#### dialect list | | Dialect Name | | |-----------|--------------|----------| | Greenplum | DB2 | Dameng | | Gbase8a | HIVE | KingBase | | MySQL | StarRocks | Oracle | | Phoenix | Postgres | Redshift | | SapHana | Snowflake | Sqlite | | SqlServer | Tablestore | Teradata | | Vertica | OceanBase | XUGU | | IRIS | Inceptor | Highgo | ## Parallel Reader The JDBC Source connector supports parallel reading of data from tables. SeaTunnel will use certain rules to split the data in the table, which will be handed over to readers for reading. The number of readers is determined by the `parallelism` option. **Split Key Rules:** 1. If `partition_column` is not null, it will be used to calculate splits. The column must be of a **Supported split data type**. 2. If `partition_column` is null, SeaTunnel will read the schema from the table and get the Primary Key and Unique Index. If there is more than one column in the Primary Key and Unique Index, the first column whose type is a **supported split data type** will be used to split data. For example, if the table has Primary Key(nn guid, name varchar), because `guid` is not a **supported split data type**, the column `name` will be used to split data. **Supported split data type:** * String * Number(int, bigint, decimal, ...) * Date ## tips > If the table cannot be split (for example, the table has no Primary Key or Unique Index, and `partition_column` is not set), it will run in single concurrency. > > Use `table_path` to replace `query` for single table reading. If you need to read multiple tables, use `table_list`. > > When inferring a primary key based on a `query`, the key is inherited from the underlying table where the first column in the result set is located, and its strictness for the overall join result set is not guaranteed (for example, when the query contains joins or reads from multiple tables). ## appendix Here are some reference values for the params above.
| datasource | driver | url | maven | |-------------------|-----------------------------------------------------|------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------| | mysql | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java | | postgresql | org.postgresql.Driver | jdbc:postgresql://localhost:5432/postgres | https://mvnrepository.com/artifact/org.postgresql/postgresql | | dm | dm.jdbc.driver.DmDriver | jdbc:dm://localhost:5236 | https://mvnrepository.com/artifact/com.dameng/DmJdbcDriver18 | | phoenix | org.apache.phoenix.queryserver.client.Driver | jdbc:phoenix:thin:url=http://localhost:8765;serialization=PROTOBUF | https://mvnrepository.com/artifact/com.aliyun.phoenix/ali-phoenix-shaded-thin-client | | sqlserver | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433 | https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc | | oracle | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@localhost:1521/xepdb1 | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | | sqlite | org.sqlite.JDBC | jdbc:sqlite:test.db | https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc | | gbase8a | com.gbase.jdbc.Driver | jdbc:gbase://e2e_gbase8aDb:5258/test | https://cdn.gbase.cn/products/30/p5CiVwXBKQYIUGN8ecHvk/gbase-connector-java-9.5.0.7-build1-bin.jar | | starrocks | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java | | db2 | com.ibm.db2.jcc.DB2Driver | jdbc:db2://localhost:50000/testdb | https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc/db2jcc4 | | tablestore | com.alicloud.openservices.tablestore.jdbc.OTSDriver | "jdbc:ots:http s://myinstance.cn-hangzhou.ots.aliyuncs.com/myinstance" | 
https://mvnrepository.com/artifact/com.aliyun.openservices/tablestore-jdbc | | saphana | com.sap.db.jdbc.Driver | jdbc:sap://localhost:39015 | https://mvnrepository.com/artifact/com.sap.cloud.db.jdbc/ngdbc | | doris | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java | | teradata | com.teradata.jdbc.TeraDriver | jdbc:teradata://localhost/DBS_PORT=1025,DATABASE=test | https://mvnrepository.com/artifact/com.teradata.jdbc/terajdbc | | Snowflake | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc | | Redshift | com.amazon.redshift.jdbc42.Driver | jdbc:redshift://localhost:5439/testdb?defaultRowFetchSize=1000 | https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42 | | Vertica | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433 | https://repo1.maven.org/maven2/com/vertica/jdbc/vertica-jdbc/12.0.3-0/vertica-jdbc-12.0.3-0.jar | | Kingbase | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar | | OceanBase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.12/oceanbase-client-2.4.12.jar | | Hive | org.apache.hive.jdbc.HiveDriver | jdbc:hive2://localhost:10000 | https://repo1.maven.org/maven2/org/apache/hive/hive-jdbc/3.1.3/hive-jdbc-3.1.3-standalone.jar | | xugu | com.xugu.cloudjdbc.Driver | jdbc:xugu://localhost:5138 | https://repo1.maven.org/maven2/com/xugudb/xugu-jdbc/12.2.0/xugu-jdbc-12.2.0.jar | | InterSystems IRIS | com.intersystems.jdbc.IRISDriver | jdbc:IRIS://localhost:1972/%SYS | https://raw.githubusercontent.com/intersystems-community/iris-driver-distribution/main/JDBC/JDK18/intersystems-jdbc-3.8.4.jar | | opengauss | org.opengauss.Driver | jdbc:opengauss://localhost:5432/postgres | 
https://repo1.maven.org/maven2/org/opengauss/opengauss-jdbc/5.1.0-og/opengauss-jdbc-5.1.0-og.jar | | Highgo | com.highgo.jdbc.Driver | jdbc:highgo://localhost:5866/highgo | https://repo1.maven.org/maven2/com/highgo/HgdbJdbc/6.2.3/HgdbJdbc-6.2.3.jar | | Presto | com.facebook.presto.jdbc.PrestoDriver | jdbc:presto://localhost:8080/presto | https://repo1.maven.org/maven2/com/facebook/presto/presto-jdbc/0.279/presto-jdbc-0.279.jar | | Trino | io.trino.jdbc.TrinoDriver | jdbc:trino://localhost:8080/trino | https://repo1.maven.org/maven2/io/trino/trino-jdbc/460/trino-jdbc-460.jar | ## Example ### simple #### Case 1 ``` Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 user = "root" password = "123456" query = "select * from type_bin" } ``` #### Case 2 Use the select count(*) instead of analysis table for count table rows in dynamic chunk split stage ``` Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 user = "root" password = "123456" use_select_count = true query = "select * from type_bin" } ``` #### Case 3 Use the select NUM_ROWS from all_tables for the table rows but skip the analyze table. ``` Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 user = "root" password = "123456" skip_analyze = true query = "select * from type_bin" } ``` #### Case 4 Oracle Source with BLOB as string to Doris Sink This example demonstrates how to handle Oracle's BLOB data as strings when transferring to Doris. This is useful for large BLOB fields. 
``` env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = oracle.jdbc.driver.OracleDriver url = "jdbc:oracle:thin:@oracle_host:1521/SERVICE_NAME" user = "username" password = "password" query = "SELECT ID, NAME, CONTENT_BLOB FROM MY_TABLE" handle_blob_as_string = true # Enable BLOB to String conversion for Oracle } } ``` ### parallel by partition_column ``` env { parallelism = 10 job.mode = "BATCH" } source { Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 user = "root" password = "123456" query = "select * from type_bin" partition_column = "id" partition_num = 10 # Replace split.size with partition_num # Read start boundary #partition_lower_bound = ... # Read end boundary #partition_upper_bound = ... } } sink { Console {} } ``` ### Parallel Boundary > It is more efficient to specify the data within the upper and lower bounds of the query. It is more efficient to read your data source according to the upper and lower boundaries you configured. 
``` source { Jdbc { url = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 user = "root" password = "123456" # Define query logic as required query = "select * from type_bin" partition_column = "id" # Read start boundary partition_lower_bound = 1 # Read end boundary partition_upper_bound = 500 partition_num = 10 properties { useSSL=false } } } ``` ### parallel by Primary Key or Unique Index > Configuring `table_path` will turn on auto split, you can configure `split.*` to adjust the split strategy ``` env { parallelism = 10 job.mode = "BATCH" } source { Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 user = "root" password = "123456" table_path = "testdb.table1" query = "select * from testdb.table1" split.size = 10000 } } sink { Console {} } ``` ### multiple table read ***Configuring `table_list` will turn on auto split, you can configure `split.*` to adjust the split strategy*** ```hocon Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 user = "root" password = "123456" table_list = [ { # e.g. 
table_path = "testdb.table1"、table_path = "test_schema.table1"、table_path = "testdb.test_schema.table1" table_path = "testdb.table1" }, { table_path = "testdb.table2" # Use query filter rows & columns query = "select id, name from testdb.table2 where id > 100" }, { # Using regex to match multiple tables table_path = "testdb.user_table\\d+" use_regex = true } ] #where_condition= "where id > 100" #split.size = 10000 #split.even-distribution.factor.upper-bound = 100 #split.even-distribution.factor.lower-bound = 0.05 #split.sample-sharding.threshold = 1000 #split.inverse-sampling.rate = 1000 } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Jira.md ================================================ import ChangeLog from '../changelog/connector-http-jira.md'; # Jira > Jira source connector ## Description Used to read data from Jira. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------------|---------|----------|---------------| | url | String | Yes | - | | email | String | Yes | - | | api_token | String | Yes | - | | method | String | No | get | | schema.fields | Config | No | - | | format | String | No | json | | params | Map | No | - | | body | String | No | - | | json_field | Config | No | - | | content_json | String | No | - | | poll_interval_millis | int | No | - | | retry | int | No | - | | retry_backoff_multiplier_ms | int | No | 100 | | retry_backoff_max_ms | int | No | 10000 | | enable_multi_lines | boolean | No | false | | 
common-options | config | No | - | ### url [String] http request url ### email [String] Jira Email ### api_token [String] Jira API Token https://id.atlassian.com/manage-profile/security/api-tokens ### method [String] http request method, only supports GET, POST method ### params [Map] http params ### body [String] http body ### poll_interval_millis [int] request http api interval(millis) in stream mode ### retry [int] The max retry times if request http return to `IOException` ### retry_backoff_multiplier_ms [int] The retry-backoff times(millis) multiplier if request http failed ### retry_backoff_max_ms [int] The maximum retry-backoff times(millis) if request http failed ### format [String] the format of upstream data, now only support `json` `text`, default `json`. when you assign format is `json`, you should also assign schema option, for example: upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | when you assign format is `text`, connector will do nothing for upstream data, for example: upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` connector will generate data as the following: | content | |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | ### schema [Config] #### fields [Config] The schema fields of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). ### content_json [String] This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. If your return data looks something like this. 
```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can configure `content_field = "$.store.book.*"` and the result returned looks like this: ```json [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ] ``` Then you can get the desired result with a simpler schema,like ```hocon Http { url = "http://mockserver:1080/contentjson/mock" method = "GET" format = "json" content_field = "$.store.book.*" schema = { fields { category = string author = string title = string price = string } } } ``` Here is an example: - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). ### json_field [Config] This parameter helps you configure the schema,so this parameter must be used with schema. 
If your data looks something like this: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can get the contents of 'book' by configuring the task as follows: ```hocon source { Http { url = "http://mockserver:1080/jsonpath/mock" method = "GET" format = "json" json_field = { category = "$.store.book[*].category" author = "$.store.book[*].author" title = "$.store.book[*].title" price = "$.store.book[*].price" } schema = { fields { category = string author = string title = string price = string } } } } ``` - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Example ```hocon Jira { url = "https://liugddx.atlassian.net/rest/api/3/search" email = "test@test.com" api_token = "xxx" schema { fields { expand = string startAt = bigint maxResults = int total = int } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Kafka.md ================================================ import ChangeLog from '../changelog/connector-kafka.md'; # Kafka > Kafka source connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description Source connector for Apache Kafka. ## Supported DataSource Info In order to use the Kafka connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Maven | |------------|--------------------|-------------------------------------------------------------------------------------| | Kafka | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-kafka) | ## Source Options | Name | Type | Required | Default | Description | |-------------------------------------|----------------------------------------------------------------------------|----------|--------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | topic | String | Yes | - | Topic name(s) to read data from when the table is used as source. It also supports topic list for source by separating topic by comma like 'topic-1,topic-2'. 
| | table_list | Map | No | - | Topic list config You can configure only one `table_list` and one `topic` at the same time | | bootstrap.servers | String | Yes | - | Comma separated list of Kafka brokers. | | pattern | Boolean | No | false | If `pattern` is set to `true`,the regular expression for a pattern of topic names to read from. All topics in clients with names that match the specified regular expression will be subscribed by the consumer. | | consumer.group | String | No | SeaTunnel-Consumer-Group | `Kafka consumer group id`, used to distinguish different consumer groups. | | commit_on_checkpoint | Boolean | No | true | If true the consumer's offset will be periodically committed in the background. | | poll.timeout | Long | No | 10000 | The interval(millis) for poll messages. | | kafka.config | Map | No | - | In addition to the above necessary parameters that must be specified by the `Kafka consumer` client, users can also specify multiple `consumer` client non-mandatory parameters, covering [all consumer parameters specified in the official Kafka document](https://kafka.apache.org/documentation.html#consumerconfigs). | | schema | Config | No | - | The structure of the data, including field names and field types. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). | | format | String | No | json | Data format. The default format is json. Optional text format, canal_json, debezium_json, maxwell_json, ogg_json, avro , protobuf and native. If you use json or text format. The default field separator is ", ". If you customize the delimiter, add the "field_delimiter" option.If you use canal format, please refer to [canal-json](../formats/canal-json.md) for details.If you use debezium format, please refer to [debezium-json](../formats/debezium-json.md) for details. Some format details please refer [formats](../formats) | | format_error_handle_way | String | No | fail | The processing method of data format error. 
The default value is fail, and the optional value is (fail, skip). When fail is selected, data format error will block and an exception will be thrown. When skip is selected, data format error will skip this line data. | | debezium_record_table_filter | Config | No | - | Used for filtering data in debezium format, only when the format is set to `debezium_json`. Please refer `debezium_record_table_filter` below | | field_delimiter | String | No | , | Customize the field delimiter for data format. | | start_mode | StartMode[earliest],[group_offsets],[latest],[specific_offsets],[timestamp] | No | group_offsets | The initial consumption pattern of consumers. | | start_mode.offsets | Config | No | - | The offset required for consumption mode to be specific_offsets. | | start_mode.timestamp | Long | No | - | The time required for consumption mode to be "timestamp". | | start_mode.end_timestamp | Long | No | - | The end time required for consumption mode to be "timestamp" in batch mode | partition-discovery.interval-millis | Long | No | -1 | The interval for dynamically discovering topics and partitions. | | ignore_no_leader_partition | Boolean | No | false | Whether to ignore partitions that have no leader. If set to true, partitions without a leader will be skipped during partition discovery. If set to false (default), the connector will include all partitions regardless of leader status. This is useful when dealing with Kafka clusters that may have temporary leadership issues. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | | protobuf_message_name | String | No | - | Effective when the format is set to protobuf, specifies the Message name | | protobuf_schema | String | No | - | Effective when the format is set to protobuf, specifies the Schema definition | | strip_schema_registry_header | Boolean | No | false | Effective when the format is set to protobuf. 
Whether to strip the Confluent Schema Registry wire format header (magic byte, schema id and message indexes) before protobuf deserialization. This option is useful when consuming Protobuf messages that were encoded using Confluent Schema Registry. When enabled, the connector will try to detect and remove the Schema Registry header before parsing the Protobuf message. If the header is not detected, it will fall back to standard Protobuf deserialization. | | reader_cache_queue_size | Integer | No | 1024 | The reader shard cache queue is used to cache the data corresponding to the shards. The size of the shard cache depends on the number of shards obtained by each reader, rather than the amount of data in each shard. | | is_native | Boolean | No | false | Supports retaining the source information of the record. ### debezium_record_table_filter We can use `debezium_record_table_filter` to filter the data in the debezium format. The configuration is as follows: ```hocon debezium_record_table_filter { database_name = "test" // null if not exists schema_name = "public" // null if not exists table_name = "products" } ``` Only the data of the `test.public.products` table will be consumed. ## Metadata Support The Kafka source automatically injects `ConsumerRecord.timestamp` into the SeaTunnel `EventTime` metadata when the value is non-negative. You can expose it as a normal field through the [Metadata transform](../../transforms/metadata.md) for downstream SQL or partitioning. 
```hocon source { Kafka { plugin_output = "kafka_raw" topic = "seatunnel_topic" bootstrap.servers = "localhost:9092" format = json } } transform { Metadata { plugin_input = "kafka_raw" plugin_output = "kafka_with_meta" metadata_fields { EventTime = kafka_ts # kafka_ts will contain ConsumerRecord.timestamp (ms) } } Sql { plugin_input = "kafka_with_meta" plugin_output = "kafka_enriched" query = "select *, FROM_UNIXTIME(kafka_ts/1000, 'yyyy-MM-dd', 'Asia/Shanghai') as pt from kafka_with_meta where kafka_ts >= 0" } } ``` ## Task Example ### Simple > This example reads the data of Kafka's topic_1, topic_2, topic_3 and prints it to the client. If you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../getting-started/locally/deployment.md) to install and deploy SeaTunnel. Then follow the instructions in [Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) to run this job. > In batch mode, during the enumerator sharding process, it will fetch the latest offset for each partition and use it as the stopping point. ```hocon # Defining the runtime environment env { parallelism = 2 job.mode = "BATCH" } source { Kafka { schema = { fields { name = "string" age = "int" } } format = text field_delimiter = "#" topic = "topic_1,topic_2,topic_3" bootstrap.servers = "localhost:9092" kafka.config = { client.id = client_1 max.poll.records = 500 auto.offset.reset = "earliest" enable.auto.commit = "false" } } } sink { Console {} } ``` ### Regex Topic ```hocon source { Kafka { topic = ".*seatunnel*." pattern = "true" bootstrap.servers = "localhost:9092" consumer.group = "seatunnel_group" } } ``` ### AWS MSK SASL/SCRAM Replace the following `${username}` and `${password}` with the configuration values in AWS MSK. 
```hocon source { Kafka { topic = "seatunnel" bootstrap.servers = "xx.amazonaws.com.cn:9096,xxx.amazonaws.com.cn:9096,xxxx.amazonaws.com.cn:9096" consumer.group = "seatunnel_group" kafka.config = { security.protocol=SASL_SSL sasl.mechanism=SCRAM-SHA-512 sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required username=\"username\" password=\"password\";" #security.protocol=SASL_SSL #sasl.mechanism=AWS_MSK_IAM #sasl.jaas.config="software.amazon.msk.auth.iam.IAMLoginModule required;" #sasl.client.callback.handler.class="software.amazon.msk.auth.iam.IAMClientCallbackHandler" } } } ``` ### AWS MSK IAM Download `aws-msk-iam-auth-1.1.5.jar` from https://github.com/aws/aws-msk-iam-auth/releases and put it in `$SEATUNNEL_HOME/plugin/kafka/lib` dir. Please ensure the IAM policy have `"kafka-cluster:Connect",`. Like this: ```hocon "Effect": "Allow", "Action": [ "kafka-cluster:Connect", "kafka-cluster:AlterCluster", "kafka-cluster:DescribeCluster" ], ``` Source Config ```hocon source { Kafka { topic = "seatunnel" bootstrap.servers = "xx.amazonaws.com.cn:9098,xxx.amazonaws.com.cn:9098,xxxx.amazonaws.com.cn:9098" consumer.group = "seatunnel_group" kafka.config = { #security.protocol=SASL_SSL #sasl.mechanism=SCRAM-SHA-512 #sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required username=\"username\" password=\"password\";" security.protocol=SASL_SSL sasl.mechanism=AWS_MSK_IAM sasl.jaas.config="software.amazon.msk.auth.iam.IAMLoginModule required;" sasl.client.callback.handler.class="software.amazon.msk.auth.iam.IAMClientCallbackHandler" } } } ``` ### Kerberos Authentication Example Please set JVM parameters `java.security.krb5.conf` before starting the SeaTunnel or update default `krb5.conf` in `/etc/krb5.conf`. 
Source Config ``` source { Kafka { topic = "seatunnel" bootstrap.servers = "127.0.0.1:9092" consumer.group = "seatunnel_group" kafka.config = { security.protocol=SASL_PLAINTEXT sasl.kerberos.service.name=kafka sasl.mechanism=GSSAPI sasl.jaas.config="com.sun.security.auth.module.Krb5LoginModule required \n useKeyTab=true \n storeKey=true \n keyTab=\"/path/to/xxx.keytab\" \n principal=\"user@xxx.com\";" } } } ``` ### Multiple Kafka Source > This is written to the same pg table according to different formats and topics of parsing kafka Perform upsert operations based on the id > Note: Kafka is an unstructured data source and should be use 'tables_configs', and 'table_list' will be removed in the future. ```hocon env { execution.parallelism = 1 job.mode = "BATCH" } source { Kafka { bootstrap.servers = "kafka_e2e:9092" tables_configs = [ { topic = "^test-ogg-sou.*" pattern = "true" consumer.group = "ogg_multi_group" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } }, format = ogg_json }, { topic = "test-cdc_mds" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } }, format = canal_json } ] } } sink { Jdbc { driver = org.postgresql.Driver url = "jdbc:postgresql://postgresql:5432/test?loggerLevel=OFF" user = test password = test generate_sink_sql = true database = test table = public.sink primary_keys = ["id"] } } ``` ```hocon env { execution.parallelism = 1 job.mode = "BATCH" } source { Kafka { bootstrap.servers = "kafka_e2e:9092" table_list = [ { topic = "^test-ogg-sou.*" pattern = "true" consumer.group = "ogg_multi_group" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } }, format = ogg_json }, { topic = "test-cdc_mds" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } }, format = canal_json } ] } } sink { Jdbc { driver = 
org.postgresql.Driver url = "jdbc:postgresql://postgresql:5432/test?loggerLevel=OFF" user = test password = test generate_sink_sql = true database = test table = public.sink primary_keys = ["id"] } } ``` ### Protobuf configuration Set `format` to `protobuf`, configure `protobuf` data structure, `protobuf_message_name` and `protobuf_schema` parameters Example: ```hocon source { Kafka { topic = "test_protobuf_topic_fake_source" format = protobuf protobuf_message_name = Person protobuf_schema = """ syntax = "proto3"; package org.apache.seatunnel.format.protobuf; option java_outer_classname = "ProtobufE2E"; message Person { int32 c_int32 = 1; int64 c_int64 = 2; float c_float = 3; double c_double = 4; bool c_bool = 5; string c_string = 6; bytes c_bytes = 7; message Address { string street = 1; string city = 2; string state = 3; string zip = 4; } Address address = 8; map attributes = 9; repeated string phone_numbers = 10; } """ bootstrap.servers = "kafkaCluster:9092" start_mode = "earliest" plugin_output = "kafka_table" } } ``` ### Protobuf with Schema Registry wire format When consuming Protobuf messages that were encoded using Confluent Schema Registry, you need to set `strip_schema_registry_header` to `true`. The connector will automatically detect and remove the Schema Registry wire format header (magic byte, schema id, and message indexes) before deserializing the Protobuf message. 
Example: ```hocon source { Kafka { topic = "test_protobuf_schema_registry_topic" format = protobuf strip_schema_registry_header = true protobuf_message_name = Person protobuf_schema = """ syntax = "proto3"; package org.apache.seatunnel.format.protobuf; option java_outer_classname = "ProtobufE2E"; message Person { int32 c_int32 = 1; int64 c_int64 = 2; float c_float = 3; double c_double = 4; bool c_bool = 5; string c_string = 6; bytes c_bytes = 7; message Address { string street = 1; string city = 2; string state = 3; string zip = 4; } Address address = 8; map attributes = 9; repeated string phone_numbers = 10; } """ bootstrap.servers = "kafkaCluster:9092" start_mode = "earliest" plugin_output = "kafka_table" } } ``` **Note**: When `strip_schema_registry_header` is enabled, the connector can safely handle both Schema Registry encoded messages and plain Protobuf messages. If the Schema Registry header is not detected, it will automatically fall back to standard Protobuf deserialization. ### Ignore No Leader Partition When dealing with Kafka clusters that may have temporary leadership issues, you can configure the connector to ignore partitions without a leader: ```hocon source { Kafka { topic = "test_topic" bootstrap.servers = "localhost:9092" consumer.group = "test_group" ignore_no_leader_partition = true start_mode = "earliest" } } ``` With `ignore_no_leader_partition = true`, the connector will skip any partitions that don't have a leader during partition discovery, allowing the job to continue processing other healthy partitions. ### format If you need to retain Kafka's native information, you can refer to the following configuration. 
Config Example: ```hocon source { Kafka { topic = "test_topic_native_source" bootstrap.servers = "kafkaCluster:9092" start_mode = "earliest" format_error_handle_way = skip format = "NATIVE" value_converter_schema_enabled = false consumer.group = "native_group" } } ``` The returned data is as follows: ```json { "headers": { "header1": "header1", "header2": "header2" }, "key": "dGVzdF9ieXRlc19kYXRh", "partition": 3, "timestamp": 1672531200000, "timestampType": "CREATE_TIME", "value": "dGVzdF9ieXRlc19kYXRh" } ``` Note:key/value is of type byte[]. ## Changelog ================================================ FILE: docs/en/connectors/source/Kingbase.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Kingbase > JDBC Kingbase Source Connector ## Support Connector Version - 8.6 ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description Read external data source data through JDBC. ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------|--------------------|----------------------|------------------------------------------|------------------------------------------------------------------------------------------------| | Kingbase | 8.6 | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | [Download](https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar) | ## Database Dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
> For example: cp kingbase8-8.6.0.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping | Kingbase Data type | SeaTunnel Data type | |-------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------| | BOOL | BOOLEAN | | INT2 | SHORT | | SMALLSERIAL
SERIAL
INT4 | INT | | INT8
BIGSERIAL | BIGINT | | FLOAT4 | FLOAT | | FLOAT8 | DOUBLE | | NUMERIC | DECIMAL((Get the designated column's specified column size),
(Gets the designated column's number of digits to the right of the decimal point.)) | | BPCHAR
CHARACTER
VARCHAR
TEXT | STRING | | TIMESTAMP | LOCALDATETIME | | TIME | LOCALTIME | | DATE | LOCALDATE | | Other data type | Not supported yet | ## Source Options | Name | Type | Required | Default | Description | |------------------------------|------------|----------|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:kingbase8://localhost:54321/test | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source, should be `com.kingbase8.Driver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | Yes | - | Query statement | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | | partition_column | String | No | - | The column name for parallelism's partition, only support numeric type column and string type column. | | partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | | partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | | partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. Default value is job parallelism. | | fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure
the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.
Zero means use jdbc default value. | | use_regex | Boolean | No | false | Control regular expression matching for table_path. When set to `true`, the table_path will be treated as a regular expression pattern. When set to `false` or not specified, the table_path will be treated as an exact path (no regex matching). | | table_path | String | No | - | The path to the full path of table, you can use this configuration instead of `query`.
example:
"test_schema.table1" | | table_list | Array | No | - | The list of tables to be read, you can use this configuration instead of `table_path` example: ```[{ table_path = "testdb.table1"}, {table_path = "testdb.table2", query = "select * id, name from testdb.table2"}]``` | | where_condition | String | No | - | Common row filter conditions for all tables/queries, must start with `where`. for example `where id > 100` | | split.size | Int | No | 8096 | The split size (number of rows) of table, captured tables are split into multiple splits when read of table. | | split.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | | split.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. 
| | split.sample-sharding.threshold | Int | No | 10000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `split.even-distribution.factor.upper-bound` and `split.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / split size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | | split.inverse-sampling.rate | Int | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ### Tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. ## Task Example ### Simple ``` env { parallelism = 2 job.mode = "BATCH" } source { Jdbc { driver = "com.kingbase8.Driver" url = "jdbc:kingbase8://localhost:54321/db_test" username = "root" password = "" query = "select * from source" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform/sql } sink { Console {} } ``` ### Parallel > Read your query table in parallel with the shard field you configured and the shard data.
You can do this if you want to read the whole table ``` source { Jdbc { driver = "com.kingbase8.Driver" url = "jdbc:kingbase8://localhost:54321/db_test" username = "root" password = "" query = "select * from source" # Parallel sharding reads fields partition_column = "id" # Number of fragments partition_num = 10 } } ``` ### Parallel Boundary > It is more efficient to read your data source according to the upper and lower boundaries you configured ``` source { Jdbc { driver = "com.kingbase8.Driver" url = "jdbc:kingbase8://localhost:54321/db_test" username = "root" password = "" query = "select * from source" partition_column = "id" partition_num = 10 # Read start boundary partition_lower_bound = 1 # Read end boundary partition_upper_bound = 500 } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Klaviyo.md ================================================ import ChangeLog from '../changelog/connector-http-klaviyo.md'; # Klaviyo > Klaviyo source connector ## Description Used to read data from Klaviyo. 
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------------|---------|----------|---------------| | url | String | Yes | - | | private_key | String | Yes | - | | revision | String | Yes | - | | method | String | No | get | | schema | Config | No | - | | schema.fields | Config | No | - | | format | String | No | json | | params | Map | No | - | | body | String | No | - | | json_field | Config | No | - | | content_json | String | No | - | | poll_interval_millis | int | No | - | | retry | int | No | - | | retry_backoff_multiplier_ms | int | No | 100 | | retry_backoff_max_ms | int | No | 10000 | | enable_multi_lines | boolean | No | false | | common-options | config | No | - | ### url [String] http request url ### private_key [String] API private key for login, you can get more detail at this link: https://developers.klaviyo.com/en/docs/authenticate_#private-key-authentication ### revision [String] API endpoint revision (format: YYYY-MM-DD) ### method [String] http request method, only supports GET, POST method ### schema [Config] The structure of the data, including field names and field types. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). 
### params [Map] http params ### body [String] http body ### poll_interval_millis [int] The interval (in milliseconds) between HTTP API requests in stream mode ### retry [int] The maximum number of retries if the HTTP request throws an `IOException` ### retry_backoff_multiplier_ms [int] The retry backoff multiplier (in milliseconds) if the HTTP request fails ### retry_backoff_max_ms [int] The maximum retry backoff time (in milliseconds) if the HTTP request fails ### format [String] The format of the upstream data. Currently only `json` and `text` are supported; the default is `json`. When the format is `json`, you should also set the schema option. For example, if the upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | When the format is `text`, the connector passes the upstream data through unchanged. For example, if the upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` connector will generate data as the following: | content | |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | ### schema [Config] #### fields [Config] the schema fields of upstream data ### content_json [String] This parameter extracts part of the JSON data. If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. Suppose your returned data looks something like this:
```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can configure `content_field = "$.store.book.*"` and the result returned looks like this: ```json [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ] ``` Then you can get the desired result with a simpler schema,like ```hocon Http { url = "http://mockserver:1080/contentjson/mock" method = "GET" format = "json" content_field = "$.store.book.*" schema = { fields { category = string author = string title = string price = string } } } ``` Here is an example: - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). ### json_field [Config] This parameter helps you configure the schema,so this parameter must be used with schema. 
If your data looks something like this: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can get the contents of 'book' by configuring the task as follows: ```hocon source { Http { url = "http://mockserver:1080/jsonpath/mock" method = "GET" format = "json" json_field = { category = "$.store.book[*].category" author = "$.store.book[*].author" title = "$.store.book[*].title" price = "$.store.book[*].price" } schema = { fields { category = string author = string title = string price = string } } } } ``` - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Example ```hocon Klaviyo { url = "https://a.klaviyo.com/api/lists/" private_key = "SeaTunnel-test" revision = "2020-10-17" method = "GET" format = "json" schema = { fields { type = string id = string attributes = { name = string created = string updated = string } links = { self = string } } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Kudu.md ================================================ import ChangeLog from '../changelog/connector-kudu.md'; # Kudu > Kudu source connector ## Support Kudu Version - 1.11.1/1.12.0/1.13.0/1.14.0/1.15.0 ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description Used to read data from Kudu. The tested kudu version is 1.11.1. ## Data Type Mapping | kudu Data Type | SeaTunnel Data Type | |--------------------------|---------------------| | BOOL | BOOLEAN | | INT8
INT16
INT32 | INT | | INT64 | BIGINT | | DECIMAL | DECIMAL | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | STRING | STRING | | UNIXTIME_MICROS | TIMESTAMP | | BINARY | BYTES | ## Source Options | Name | Type | Required | Default | Description | |-------------------------------------------|--------|----------|------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | kudu_masters | String | Yes | - | Kudu master address. Separated by ',',such as '192.168.88.110:7051'. | | table_name | String | Yes | - | The name of kudu table. | | client_worker_count | Int | No | 2 * Runtime.getRuntime().availableProcessors() | Kudu worker count. Default value is twice the current number of cpu cores. | | client_default_operation_timeout_ms | Long | No | 30000 | Kudu normal operation time out. | | client_default_admin_operation_timeout_ms | Long | No | 30000 | Kudu admin operation time out. | | enable_kerberos | Bool | No | false | Kerberos principal enable. | | kerberos_principal | String | No | - | Kerberos principal. Note that all zeta nodes require have this file. | | kerberos_keytab | String | No | - | Kerberos keytab. Note that all zeta nodes require have this file. | | kerberos_krb5conf | String | No | - | Kerberos krb5 conf. Note that all zeta nodes require have this file. | | scan_token_query_timeout | Long | No | 30000 | The timeout for connecting scan token. If not set, it will be the same as operationTimeout. | | scan_token_batch_size_bytes | Int | No | 1024 * 1024 | Kudu scan bytes. The maximum number of bytes read at a time, the default is 1MB. | | use_regex | Bool | No | false | Control regular expression matching for `table_name`. When set to `true`, the `table_name` will be treated as a regular expression pattern and can match multiple tables. 
When set to `false` or not specified, the `table_name` will be treated as an exact table name (no regex matching). | | filter | String | No | - | Kudu scan filter expressions, for example `id > 100 AND id < 200`. | | schema | Map | No | - | SeaTunnel Schema. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). | | table_list | Array | No | - | The list of tables to be read. You can use this configuration instead of `table_name`, for example: ```table_list = [{ table_name = "kudu_source_table_1"},{ table_name = "kudu_source_table_2"}] ```. You can also configure `use_regex = true` inside each entry to enable regex matching for `table_name`. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. | ## Task Example ### Simple > The following example is for a Kudu table named "kudu_source_table". The goal is to print the data from this table on the console and write it to the Kudu table "kudu_sink_table" ```hocon # Defining the runtime environment env { parallelism = 2 job.mode = "BATCH" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** kudu { kudu_masters = "kudu-master:7051" table_name = "kudu_source_table" plugin_output = "kudu" enable_kerberos = true kerberos_principal = "xx@xx.COM" kerberos_keytab = "xx.keytab" } } transform { } sink { console { plugin_input = "kudu" } kudu { plugin_input = "kudu" kudu_masters = "kudu-master:7051" table_name = "kudu_sink_table" enable_kerberos = true kerberos_principal = "xx@xx.COM" kerberos_keytab = "xx.keytab" } } ``` ### Multiple Table ```hocon env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** kudu{ kudu_masters = "kudu-master:7051" table_list = [ { table_name =
"kudu_source_table_1" },{ table_name = "kudu_source_table_2" } ] plugin_output = "kudu" } } transform { } sink { Assert { rules { table-names = ["kudu_source_table_1", "kudu_source_table_2"] } } } ``` ### Table Matching With Regex The Kudu Source supports using regular expressions on `table_name` to match multiple tables (including whole-database style synchronization, since Kudu tables are in a single logical database). #### Exact Table Name Use `table_name` to specify a single Kudu table with an exact name: ```hocon source { kudu { kudu_masters = "kudu-master:7051" table_name = "kudu_source_table_1" } } ``` #### Regex Matching Use `table_name` as a regex pattern and enable `use_regex` to read multiple tables with one configuration: ```hocon source { kudu { kudu_masters = "kudu-master:7051" # Match tables like kudu_source_table_1, kudu_source_table_2, etc. table_name = "kudu_source_table_\\d+" use_regex = true } } ``` You can also combine regex entries in `table_list`: ```hocon source { kudu { kudu_masters = "kudu-master:7051" table_list = [ { table_name = "kudu_source_table_1" }, { table_name = "kudu_source_table_2" }, { # Regex matching - any table whose name starts with prefix_ and ends with digits table_name = "prefix_\\d+" use_regex = true } ] } } ``` #### Whole-Database Matching You can also synchronize all tables in the current Kudu cluster (or all business tables in the current instance, if there are no system tables) by using a catch-all regex: ```hocon source { kudu { kudu_masters = "kudu-master:7051" # Match all tables in the current Kudu cluster table_name = ".*" use_regex = true } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Lemlist.md ================================================ import ChangeLog from '../changelog/connector-http-lemlist.md'; # Lemlist > Lemlist source connector ## Description Used to read data from Lemlist. 
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------------|---------|----------|---------------| | url | String | Yes | - | | password | String | Yes | - | | method | String | No | get | | schema.fields | Config | No | - | | format | String | No | json | | params | Map | No | - | | body | String | No | - | | json_field | Config | No | - | | content_json | String | No | - | | poll_interval_millis | int | No | - | | retry | int | No | - | | retry_backoff_multiplier_ms | int | No | 100 | | retry_backoff_max_ms | int | No | 10000 | | enable_multi_lines | boolean | No | false | | common-options | config | No | - | ### url [String] http request url ### password [String] API key for login, you can get more detail at this link: https://app.lemlist.com/settings/integrations ### method [String] http request method, only supports GET, POST method ### params [Map] http params ### body [String] http body ### poll_interval_millis [int] request http api interval(millis) in stream mode ### retry [int] The max retry times if request http return to `IOException` ### retry_backoff_multiplier_ms [int] The retry-backoff times(millis) multiplier if request http failed ### retry_backoff_max_ms [int] The maximum retry-backoff times(millis) if request http failed ### format [String] the format of upstream data, now only support `json` `text`, default `json`. 
When the format is `json`, you should also set the schema option. For example, if the upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | When the format is `text`, the connector passes the upstream data through unchanged. For example, if the upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` connector will generate data as the following: | content | |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | ### schema [Config] #### fields [Config] The schema fields of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). ### content_json [String] This parameter extracts part of the JSON data. If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. Suppose your returned data looks something like this:
```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can configure `content_field = "$.store.book.*"` and the result returned looks like this: ```json [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ] ``` Then you can get the desired result with a simpler schema,like ```hocon Http { url = "http://mockserver:1080/contentjson/mock" method = "GET" format = "json" content_field = "$.store.book.*" schema = { fields { category = string author = string title = string price = string } } } ``` Here is an example: - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). ### json_field [Config] This parameter helps you configure the schema,so this parameter must be used with schema. 
If your data looks something like this: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can get the contents of 'book' by configuring the task as follows: ```hocon source { Http { url = "http://mockserver:1080/jsonpath/mock" method = "GET" format = "json" json_field = { category = "$.store.book[*].category" author = "$.store.book[*].author" title = "$.store.book[*].title" price = "$.store.book[*].price" } schema = { fields { category = string author = string title = string price = string } } } } ``` - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Example ```hocon Lemlist { url = "https://api.lemlist.com/api/campaigns" password = "SeaTunnel-test" schema { fields { _id = string name = string } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/LocalFile.md ================================================ import ChangeLog from '../changelog/connector-file-local.md'; # LocalFile > Local file source connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## Description Read data from local file system. :::tip If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. 
::: ## Options | name | type | required | default value | |----------------------------|---------|----------|--------------------------------------| | path | string | yes | - | | file_format_type | string | yes | - | | read_columns | list | no | - | | delimiter/field_delimiter | string | no | \001 for text and , for csv | | row_delimiter | string | no | \n | | parse_partition_from_path | boolean | no | true | | date_format | string | no | yyyy-MM-dd | | datetime_format | string | no | yyyy-MM-dd HH:mm:ss | | time_format | string | no | HH:mm:ss | | skip_header_row_number | long | no | 0 | | schema | config | no | - | | sheet_name | string | no | - | | excel_engine | string | no | POI | | xml_row_tag | string | no | - | | xml_use_attr_format | boolean | no | - | | csv_use_header_line | boolean | no | false | | file_filter_pattern | string | no | - | | filename_extension | string | no | - | | compress_codec | string | no | none | | archive_compress_codec | string | no | none | | encoding | string | no | UTF-8 | | null_format | string | no | - | | binary_chunk_size | int | no | 1024 | | binary_complete_file_mode | boolean | no | false | | sync_mode | string | no | full | | target_path | string | no | - | | target_hadoop_conf | map | no | - | | update_strategy | string | no | distcp | | compare_mode | string | no | len_mtime | | common-options | | no | - | | tables_configs | list | no | used to define a multiple table task | | file_filter_modified_start | string | no | - | | file_filter_modified_end | string | no | - | | enable_file_split | boolean | no | false | | file_split_size | long | no | 134217728 | | quote_char | string | no | " | | escape_char | string | no | - | | metalake_type | string | no | gravitino | ### path [string] The source file path. 
### file_format_type [string] File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. For example: upstream data is the following: ```json {"code": 200, "data": "get success", "success": true} ``` You can also save multiple pieces of data in one file and split them by newline: ```json lines {"code": 200, "data": "get success", "success": true} {"code": 300, "data": "get failed", "success": false} ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. If you assign file type to `text` `csv`, you can choose to specify the schema information or not. For example, upstream data is the following: ```text tyrantlucifer#26#male ``` If you do not assign data schema connector will treat the upstream data as the following: | content | |-----------------------| | tyrantlucifer#26#male | If you assign data schema, you should also assign the option `field_delimiter` too except CSV file type you should assign schema and delimiter as the following: ```hocon field_delimiter = "#" schema { fields { name = string age = int gender = string } } ``` connector will generate data as the following: | name | age | gender | |---------------|-----|--------| | tyrantlucifer | 26 | male | If you assign file type to `binary`, SeaTunnel can synchronize files in any format, such as compressed packages, pictures, etc. In short, any files can be synchronized to the target place. 
In this case, you need to ensure that both the source and the sink use the `binary` format for file synchronization. You can find the specific usage in the example below. If you assign file type to `markdown`, SeaTunnel can parse markdown files and extract structured data. The markdown parser extracts various elements including headings, paragraphs, lists, code blocks, tables, and more. Each element is converted to a row with the following schema: - `element_id`: Unique identifier for the element - `element_type`: Type of the element (Heading, Paragraph, ListItem, etc.) - `heading_level`: Level of heading (1-6, null for non-heading elements) - `text`: Text content of the element - `page_number`: Page number (default: 1) - `position_index`: Position index within the document - `parent_id`: ID of the parent element - `child_ids`: Comma-separated list of child element IDs Note: Markdown format only supports reading, not writing. ### read_columns [list] The list of columns to read from the data source. You can use it to implement field projection. ### delimiter/field_delimiter [string] The **delimiter** parameter will be deprecated after version 2.3.5; please use **field_delimiter** instead. Only needs to be configured when `file_format_type` is `text`. The field delimiter tells the connector how to split a row into fields.
The default is `\001`, the same as Hive's default delimiter. ### row_delimiter [string] Only needs to be configured when `file_format_type` is `text`. The row delimiter tells the connector how to split the data into rows. The default is `\n`. ### parse_partition_from_path [boolean] Controls whether to parse the partition keys and values from the file path. For example, if you read a file from the path `file://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`, every record read from the file will have these two fields added: | name | age | |---------------|-----| | tyrantlucifer | 26 | Tips: **Do not define partition fields in schema option** ### date_format [string] The date format tells the connector how to convert a string to a date. The following formats are supported: `yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`. The default is `yyyy-MM-dd`. ### datetime_format [string] The datetime format tells the connector how to convert a string to a datetime. The following formats are supported: `yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss`. The default is `yyyy-MM-dd HH:mm:ss`. ### time_format [string] The time format tells the connector how to convert a string to a time. The following formats are supported: `HH:mm:ss` `HH:mm:ss.SSS`. The default is `HH:mm:ss`. ### skip_header_row_number [long] Skips the first few lines; only applies to text and csv files. For example, if you set `skip_header_row_number = 2`, SeaTunnel will skip the first 2 lines of the source files. ### schema [config] Only needs to be configured when `file_format_type` is text, json, excel, xml or csv (or any other format whose schema cannot be read from metadata). #### fields [Config] The schema information of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md).
#### schema_url [string] The HTTP URL used to fetch the metadata information through the REST API, such as: `http://localhost:8090/api/metalakes/laowang_test/catalogs/221-pgsql/schemas/ykw/tables/all_type` > When using Gravitino as the metadata source, the column types from Gravitino will be automatically converted to SeaTunnel data types. For detailed type mapping information, please refer to [Gravitino Type Mapping](../../introduction/concepts/gravitino-type-mapping.md). ### metalake_type [string] The type of metalake service, currently only supports `gravitino`. When using `schema_url` to obtain metadata from Gravitino, you can specify this parameter (default is `gravitino`). For more information about Metalake, please refer to [Metalake](../../introduction/concepts/metalake.md). ### sheet_name [string] Only needs to be configured when `file_format_type` is `excel`. The name of the workbook sheet to read. ### excel_engine [string] Only needs to be configured when `file_format_type` is `excel`. The following engines are supported: `POI` `EasyExcel`. The default excel reading engine is POI, but POI can easily cause memory overflow when reading Excel with more than 65,000 rows, so you can switch to EasyExcel as the reading engine. ### xml_row_tag [string] Only needs to be configured when `file_format_type` is `xml`. Specifies the tag name of the data rows within the XML file. ### xml_use_attr_format [boolean] Only needs to be configured when `file_format_type` is `xml`. Specifies whether to process data using the tag attribute format. ### csv_use_header_line [boolean] Whether to use the header line to parse the file. Only used when `file_format_type` is `csv` and the file contains a header line that conforms to RFC 4180. ### file_filter_pattern [string] Filter pattern used for filtering files. If you only want to filter based on file names, simply write a regular expression for the file names; if you also want to filter based on the file directory, the expression needs to start with the value of `path`.
The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. There are some examples. If the `path` is `/data/seatunnel`, and the file structure example is: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv /data/seatunnel/20241012/logo.png ``` Matching Rules Example: **Example 1**: *Match all .txt files*,Regular Expression: ``` .*.txt ``` The result of this example matching is: ``` /data/seatunnel/20241001/report.txt ``` **Example 2**: *Match all file starting with abc*,Regular Expression: ``` abc.* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **Example 3**: *Match all files starting with abc in folder 20241007,And the fourth character is either h or g*, the Regular Expression: ``` /data/seatunnel/20241007/abc[h,g].* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv ``` **Example 4**: *Match third level folders starting with 202410 and files ending with .csv*, the Regular Expression: ``` /data/seatunnel/202410\d*/.*.csv ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### filename_extension [string] Filter filename extension, which used for filtering files with specific extension. Example: `csv` `.txt` `json` `.xml`. ### compress_codec [string] The compress codec of files and the details that supported as the following shown: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: automatically recognizes the compression type, no additional settings required. 
### archive_compress_codec [string] The compress codec of archive files and the details that supported as the following shown: | archive_compress_codec | file_format | archive_compress_suffix | |------------------------|--------------------|-------------------------| | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | | GZ | txt,json,excel,xml | .gz | | NONE | all | .* | Note: gz compressed excel file needs to compress the original file or specify the file suffix, such as e2e.xls ->e2e_test.xls.gz ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. ### null_format [string] Only used when file_format_type is text. null_format to define which strings can be represented as null. e.g: `\N` ### binary_chunk_size [int] Only used when file_format_type is binary. The chunk size (in bytes) for reading binary files. Default is 1024 bytes. Larger values may improve performance for large files but use more memory. ### binary_complete_file_mode [boolean] Only used when file_format_type is binary. Whether to read the complete file as a single chunk instead of splitting into chunks. When enabled, the entire file content will be read into memory at once. Default is false. ### sync_mode [string] File sync mode. Supported values: `full` (default), `update`. When `update`, the source compares files between source/target and only reads new/changed files (currently only supports `file_format_type=binary`). **Performance considerations** - Update mode triggers an extra `getFileStatus` call on the target for each source file. - It is not recommended for massive small-file scenarios. **Requirements / limitations** - `target_path` should typically align with sink `path` (same filesystem and same relative path layout). - When `update_strategy=distcp`, correctness depends on source/target clock synchronization. 
- When `compare_mode=checksum`, filesystem checksum support is required. If checksum is unavailable, SeaTunnel falls back to content comparison (more expensive) and logs a warning. Example: ```hocon sync_mode = "update" file_format_type = "binary" target_path = "/path/to/your/sink/path" update_strategy = "distcp" compare_mode = "len_mtime" ``` ### target_path [string] Only used when `sync_mode=update`. Target base path used for comparison (it should usually be the same as sink `path`). ### target_hadoop_conf [map] Only used when `sync_mode=update`. Extra Hadoop configuration for target filesystem. You can set `fs.defaultFS` in this map to override target defaultFS. ### update_strategy [string] Only used when `sync_mode=update`. Supported values: `distcp` (default), `strict`. ### compare_mode [string] Only used when `sync_mode=update`. Supported values: `len_mtime` (default), `checksum` (only valid when `update_strategy=strict`). ### file_filter_modified_start [string] File modification time filter. The connector will filter some files base on the last modification start time (include start time). The default data format is `yyyy-MM-dd HH:mm:ss`. ### file_filter_modified_end [string] File modification time filter. The connector will filter some files base on the last modification end time (not include end time). The default data format is `yyyy-MM-dd HH:mm:ss`. ### enable_file_split [boolean] Turn on the file splitting function, the default is false.It can be selected when the file type is csv, text, json, parquet and non-compressed format. **Recommendations** - Enable when reading a few large files and you want higher read parallelism. - Disable when reading many small files, or when parallelism is low (splitting adds overhead). **Limitations** - Not supported for compressed files (`compress_codec` != `none`) or archive files (`archive_compress_codec` != `none`) — it will fall back to non-splitting. 
- For `text`/`csv`/`json`, actual split size may be larger than `file_split_size` because the split end is aligned to the next `row_delimiter`. - LocalFile uses Hadoop LocalFileSystem internally; no extra Hadoop configuration is required. ### file_split_size [long] File split size, which can be filled in when the enable_file_split parameter is true. The unit is the number of bytes. The default value is the number of bytes of 128MB, which is 134217728. **Tuning** - Start with the default (128MB). Decrease it if parallelism is under-utilized; increase it if the number of splits is too large. - Rough rule: `file_split_size ≈ file_size / desired_parallelism`. ### quote_char [string] A single character that encloses CSV fields, allowing fields with commas, line breaks, or quotes to be read correctly. ### escape_char [string] A single character that allows the quote or other special characters to appear inside a CSV field without ending the field. ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ### tables_configs Used to define a multiple table task, when you have multiple tables to read, you can use this option to define multiple tables. 
## Example ### One Table ```hocon LocalFile { path = "/apps/hive/demo/student" file_format_type = "parquet" } ``` ```hocon LocalFile { schema { fields { name = string age = int } } path = "/apps/hive/demo/student" file_format_type = "json" } ``` For json, text or csv file format with `encoding` ```hocon LocalFile { path = "/tmp/hive/warehouse/test2" file_format_type = "text" encoding = "gbk" } ``` ### Multiple Table ```hocon LocalFile { tables_configs = [ { schema { table = "student" } path = "/apps/hive/demo/student" file_format_type = "parquet" }, { schema { table = "teacher" } path = "/apps/hive/demo/teacher" file_format_type = "parquet" } ] } ``` ```hocon LocalFile { tables_configs = [ { schema { fields { name = string age = int } } path = "/apps/hive/demo/student" file_format_type = "json" }, { schema { fields { name = string age = int } } path = "/apps/hive/demo/teacher" file_format_type = "json" } ] } ``` ### Transfer Binary File ```hocon env { parallelism = 1 job.mode = "BATCH" } source { LocalFile { path = "/seatunnel/read/binary/" file_format_type = "binary" binary_chunk_size = 2048 binary_complete_file_mode = false } } sink { // you can transfer local file to s3/hdfs/oss etc. LocalFile { path = "/seatunnel/read/binary2/" file_format_type = "binary" } } ``` ### Incremental Sync (sync_mode=update, binary) `sync_mode=update` compares files between source and `target_path`, then only reads new/changed files. In most cases, `target_path` should be aligned with sink `path` (same filesystem and same relative paths). 
```hocon env { parallelism = 1 job.mode = "BATCH" } source { LocalFile { path = "/seatunnel/read/binary/" file_format_type = "binary" sync_mode = "update" target_path = "/seatunnel/read/binary2/" update_strategy = "distcp" compare_mode = "len_mtime" } } sink { LocalFile { path = "/seatunnel/read/binary2/" tmp_path = "/seatunnel/read/binary2-tmp/" file_format_type = "binary" } } ``` ### Filter File ```hocon env { parallelism = 1 job.mode = "BATCH" } source { LocalFile { path = "/data/seatunnel/" file_format_type = "csv" skip_header_row_number = 1 // file example abcD2024.csv file_filter_pattern = "abc[DX]*.*" } } sink { Console { } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Maxcompute.md ================================================ import ChangeLog from '../changelog/connector-maxcompute.md'; # Maxcompute > Maxcompute source connector ## Description Used to read data from Maxcompute. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |----------------|--------|----------|---------------| | accessId | string | yes | - | | accesskey | string | yes | - | | endpoint | string | yes | - | | project | string | yes | - | | table_name | string | yes | - | | partition_spec | string | no | - | | split_row | int | no | 10000 | | read_columns | Array | no | - | | table_list | Array | No | - | | common-options | string | no | | | schema | config | no | | ### accessId [string] `accessId` Your Maxcompute accessId which cloud be access from Alibaba Cloud. 
### accesskey [string] `accesskey` Your Maxcompute accessKey, which can be obtained from Alibaba Cloud. ### endpoint [string] `endpoint` Your Maxcompute endpoint, starting with http. ### project [string] `project` Your Maxcompute project which is created in Alibaba Cloud. ### table_name [string] `table_name` Target Maxcompute table name, e.g. fake. ### partition_spec [string] `partition_spec` The partition spec of the Maxcompute partitioned table, e.g. ds='20220101'. ### split_row [int] `split_row` Number of rows per split, default: 10000. ### read_columns [Array] `read_columns` The columns to be read, if not set, all columns will be read. e.g. ["col1", "col2"] ### table_list [Array] The list of tables to be read, you can use this configuration instead of `table_name`. ### tunnel_endpoint [String] Specifies the custom endpoint URL for the MaxCompute Tunnel service. By default, the endpoint is automatically inferred from the configured region. This option allows you to override the default behavior and use a custom Tunnel endpoint. If not specified, the connector will use the region-based default Tunnel endpoint. In general, you do **not** need to set tunnel_endpoint. It is only needed for custom networking, debugging, or local development. Example values: - `https://dt.cn-hangzhou.maxcompute.aliyun.com` - `https://dt.ap-southeast-1.maxcompute.aliyun.com` - `http://maxcompute:8080` Default: Not set (auto-inferred from region) ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. 
## Examples ### Read with table ```hocon source { Maxcompute { accessId="" accesskey="" endpoint="" project="" table_name="" #partition_spec="" #split_row = 10000 #read_columns = ["col1", "col2"] } } ``` ### Read with table list ```hocon source { Maxcompute { accessId="" accesskey="" endpoint="" project="" # default project table_list = [ { table_name = "test_table" #partition_spec="" #split_row = 10000 #read_columns = ["col1", "col2"] }, { project = "test_project" table_name = "test_table2" #partition_spec="" #split_row = 10000 #read_columns = ["col1", "col2"] } ] } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Milvus.md ================================================ import ChangeLog from '../changelog/connector-milvus.md'; # Milvus > Milvus source connector ## Description This Milvus source connector reads data from Milvus or Zilliz Cloud, it has the following features: - support read and write data by partition - support read dynamic schema data into Metadata Column - json data will be converted to json string and sink as json as well - retry automatically to bypass ratelimit and grpc limit ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) ## Data Type Mapping | Milvus Data Type | SeaTunnel Data Type | |---------------------|---------------------| | INT8 | TINYINT | | INT16 | SMALLINT | | INT32 | INT | | INT64 | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | BOOL | BOOLEAN | | JSON | STRING | | ARRAY | ARRAY | | VARCHAR | STRING | | FLOAT_VECTOR | FLOAT_VECTOR | | BINARY_VECTOR | BINARY_VECTOR | | FLOAT16_VECTOR | FLOAT16_VECTOR | | BFLOAT16_VECTOR | BFLOAT16_VECTOR | | SPARSE_FLOAT_VECTOR | SPARSE_FLOAT_VECTOR | ## Source Options | Name | Type | Required | Default | Description | 
|------------|--------|----------|---------|--------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL to connect to Milvus or Zilliz Cloud. | | token | String | Yes | - | User:password | | database | String | Yes | default | Read data from which database. | | collection | String | No | - | If set, will only read one collection, otherwise will read all collections under database. | ## Task Example ```bash source { Milvus { url = "http://127.0.0.1:19530" token = "username:password" database = "default" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/MongoDB-CDC.md ================================================ import ChangeLog from '../changelog/connector-cdc-mongodb.md'; # MongoDB CDC > MongoDB CDC source connector ## Support Those Engines > SeaTunnel Zeta
> Flink
## Key Features - [ ] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description The MongoDB CDC connector allows for reading snapshot data and incremental data from MongoDB database. ## Supported DataSource Info In order to use the Mongodb CDC connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Dependency | |------------|--------------------|-------------------------------------------------------------------------------------------| | MongoDB | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-cdc-mongodb) | ## Availability Settings 1.MongoDB version: MongoDB version >= 4.0. 2.Cluster deployment: replica sets or sharded clusters. 3.Storage Engine: WiredTiger Storage Engine. 4.Permissions:changeStream and read ``` // 1) Switch to the target database use // 2) Create role (common permissions for CDC scenarios) db.createRole({ role: "", privileges: [ { resource: { db: "", collection: "" }, actions: [ "collStats", "splitVector", "listDatabases", "find", "listCollections", "changeStream" ] } ], roles: [] }) // 3) Create user and bind read + custom role db.createUser({ user: "", pwd: "", roles: [ { role: "read", db: "" }, { role: "", db: "" } ] }) // 4) Grant additional role to user (use when user exists or additional authorization is needed) db.grantRolesToUser("", [""]) ``` ## Data Type Mapping The following table lists the field data type mapping from MongoDB BSON type to Seatunnel data type. 
| MongoDB BSON Type | SeaTunnel Data Type | |-------------------|---------------------| | ObjectId | STRING | | String | STRING | | Boolean | BOOLEAN | | Binary | BINARY | | Int32 | INTEGER | | Int64 | BIGINT | | Double | DOUBLE | | Decimal128 | DECIMAL | | Date | DATE | | Timestamp | TIMESTAMP | | Object | ROW | | Array | ARRAY | For specific types in MongoDB, we use Extended JSON format to map them to Seatunnel STRING type. | MongoDB BSON type | SeaTunnel STRING | |-------------------|----------------------------------------------------------------------------------------------| | Symbol | {"_value": {"$symbol": "12"}} | | RegularExpression | {"_value": {"$regularExpression": {"pattern": "^9$", "options": "i"}}} | | JavaScript | {"_value": {"$code": "function() { return 10; }"}} | | DbPointer | {"_value": {"$dbPointer": {"$ref": "db.coll", "$id": {"$oid": "63932a00da01604af329e33c"}}}} | **Tips** > 1.When using the DECIMAL type in SeaTunnel, be aware that the maximum range cannot exceed 34 digits, which means you should use decimal(34, 18).
## Source Options | Name | Type | Required | Default | Description | |------------------------------------|--------|----------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | hosts | String | Yes | - | The comma-separated list of hostname and port pairs of the MongoDB servers. eg. `localhost:27017,localhost:27018` | | username | String | No | - | Name of the database user to be used when connecting to MongoDB. | | password | String | No | - | Password to be used when connecting to MongoDB. | | database | List | Yes | - | Name of the database to watch for changes. If not set then all databases will be captured. The database also supports regular expressions to monitor multiple databases matching the regular expression. eg. `db1,db2`. | | collection | List | Yes | - | Name of the collection in the database to watch for changes. If not set then all collections will be captured. The collection also supports regular expressions to monitor multiple collections matching fully-qualified collection identifiers. eg. `db1.coll1,db2.coll2`. | | schema | | no | - | The structure of the data, including field names and field types, use single table cdc. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). | | tables_configs | | no | - | The structure of the data, including field names and field types, use muliti table cdc. | | connection.options | String | No | - | The ampersand-separated connection options of MongoDB. eg. `replicaSet=test&connectTimeoutMS=300000`. | | batch.size | Long | No | 1024 | The cursor batch size. | | poll.max.batch.size | Enum | No | 1024 | Maximum number of change stream documents to include in a single batch when polling for new data. 
| | poll.await.time.ms | Long | No | 1000 | The amount of time to wait before checking for new results on the change stream. | | heartbeat.interval.ms | String | No | 0 | The length of time in milliseconds between sending heartbeat messages. Use 0 to disable. | | incremental.snapshot.chunk.size.mb | Long | No | 64 | The chunk size mb of incremental snapshot. | | exactly_once | Boolean| No | false | Enable exactly once semantic. Enabling this may cause an out-of-memory risk during the large table snapshot stage in recovery. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. | ### Tips > 1.If the collection changes at a slow pace, it is strongly recommended to set an appropriate value greater than 0 for the heartbeat.interval.ms parameter. When we recover a Seatunnel job from a checkpoint or savepoint, the heartbeat events can push the resumeToken forward to avoid its expiration.
> 2.MongoDB has a limit of 16MB for a single document. Change documents include additional information, so even if the original document is not larger than 15MB, the change document may exceed the 16MB limit, resulting in the termination of the Change Stream operation.
> 3.It is recommended to use immutable shard keys. In MongoDB, shard keys allow modifications after transactions are enabled, but changing the shard key can cause frequent shard migrations, resulting in additional performance overhead. Additionally, modifying the shard key can also cause the Update Lookup feature to become ineffective, leading to inconsistent results in CDC (Change Data Capture) scenarios.
> 4.`schema` and `tables_configs` are mutually exclusive; only one of them can be configured at a time. ## Change Streams [**Change Stream**](https://www.mongodb.com/docs/v5.0/changeStreams/) is a feature introduced in MongoDB 3.6 for replica sets and sharded clusters that allows applications to access real-time data changes without the complexity and risk of tailing the oplog. Applications can use change streams to subscribe to all data changes on a single collection, a database, or an entire deployment, and immediately react to them. **Lookup Full Document for Update Operations** is a feature provided by **Change Stream** which can configure the change stream to return the most current majority-committed version of the updated document. Because of this feature, we can easily collect the latest full document and convert the change log to Changelog Stream. The format of the data captured by delete events in change streams: [delete event](https://www.mongodb.com/docs/v5.0/reference/change-events/delete/) ``` { "_id": { <Resume Token> }, "operationType": "delete", "clusterTime": <Timestamp>, "ns": { "db": "engineering", "coll": "users" }, "documentKey": { "_id": ObjectId("599af247bb69cd89961c986d") } } ``` The fullDocument document is omitted as the document no longer exists at the time the change stream cursor sends the delete event to the client. 
## How to Create a MongoDB CDC Data Synchronization Jobs ### CDC Data Print to Client The following example demonstrates how to create a data synchronization job that reads cdc data from MongoDB and prints it on the local client: ```hocon env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MongoDB-CDC { hosts = "mongo0:27017" database = ["inventory"] collection = ["inventory.products"] username = stuser password = stpw schema = { table = "inventory.products" fields { "_id" : string, "name" : string, "description" : string, "weight" : string } } } } # Console printing of the read Mongodb data sink { Console { parallelism = 1 } } ``` ## CDC Data Write to MysqlDB The following example demonstrates how to create a data synchronization job that reads cdc data from MongoDB and write to mysql database: ```hocon env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MongoDB-CDC { hosts = "mongo0:27017" database = ["inventory"] collection = ["inventory.products"] username = stuser password = stpw schema = { table = "inventory.products" fields { "_id" : string, "name" : string, "description" : string, "weight" : string } } } } sink { jdbc { url = "jdbc:mysql://mysql_cdc_e2e:3306" driver = "com.mysql.cj.jdbc.Driver" user = "st_user" password = "seatunnel" generate_sink_sql = true # You need to configure both database and table database = mongodb_cdc table = products primary_keys = ["_id"] } } ``` ## Multi-table Synchronization The following example demonstrates how to create a data synchronization job that read the cdc data of multiple library tables mongodb and prints it on the local client: ```hocon env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MongoDB-CDC { hosts = "mongo0:27017" database = ["inventory"] collection = ["inventory.products", "inventory.orders"] 
username = superuser password = superpw tables_configs = [ { schema { table = "inventory.products" fields { "_id" : string, "name" : string, "description" : string, "weight" : string } } }, { schema { table = "inventory.orders" fields { "_id" : string, "order_number" : int, "order_date" : string, "quantity" : int, "product_id" : string } } } ] } } # Console printing of the read Mongodb data sink { Console { } } ``` ## Format of real-time streaming data ```shell { _id : { }, // Identifier of the open change stream, can be assigned to the 'resumeAfter' parameter for subsequent resumption of this change stream "operationType" : "", // The type of change operation that occurred, such as: insert, delete, update, etc. "fullDocument" : { }, // The full document data involved in the change operation. This field does not exist in delete operations "ns" : { "db" : "", // The database where the change operation occurred "coll" : "" // The collection where the change operation occurred }, "to" : { // These fields are displayed only when the operation type is 'rename' "db" : "", // The new database name after the change "coll" : "" // The new collection name after the change }, "source":{ "ts_ms":"", // The timestamp when the change operation occurred "table":"" // The collection where the change operation occurred "db":"", // The database where the change operation occurred "snapshot":"false" // Identify the current stage of data synchronization }, "documentKey" : { "_id" : }, // The _id field value of the document involved in the change operation "updateDescription" : { // Description of the update operation "updatedFields" : { }, // The fields and values that the update operation modified "removedFields" : [ "", ... 
] // The fields and values that the update operation removed } "clusterTime" : , // The timestamp of the Oplog log entry corresponding to the change operation "txnNumber" : , // If the change operation is executed in a multi-document transaction, this field and value are displayed, representing the transaction number "lsid" : { // Represents information related to the Session in which the transaction is located "id" : , "uid" : } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/MongoDB.md ================================================ import ChangeLog from '../changelog/connector-mongodb.md'; # MongoDB > MongoDB Source Connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description The MongoDB Connector provides the ability to read and write data from and to MongoDB. This document describes how to set up the MongoDB connector to run data reads against MongoDB. ## Supported DataSource Info In order to use the Mongodb connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Dependency | |------------|--------------------|---------------------------------------------------------------------------------------| | MongoDB | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-mongodb) | ## Data Type Mapping The following table lists the field data type mapping from MongoDB BSON type to SeaTunnel data type. | MongoDB BSON type | SeaTunnel Data type | |-------------------|---------------------| | ObjectId | STRING | | String | STRING | | Boolean | BOOLEAN | | Binary | BINARY | | Int32 | INTEGER | | Int64 | BIGINT | | Double | DOUBLE | | Decimal128 | DECIMAL | | Date | Date | | Timestamp | Timestamp | | Object | ROW | | Array | ARRAY | For specific types in MongoDB, we use Extended JSON format to map them to SeaTunnel STRING type. 
| MongoDB BSON type | SeaTunnel STRING | |-------------------|----------------------------------------------------------------------------------------------| | Symbol | {"_value": {"$symbol": "12"}} | | RegularExpression | {"_value": {"$regularExpression": {"pattern": "^9$", "options": "i"}}} | | JavaScript | {"_value": {"$code": "function() { return 10; }"}} | | DbPointer | {"_value": {"$dbPointer": {"$ref": "db.coll", "$id": {"$oid": "63932a00da01604af329e33c"}}}} | **Tips** > 1.When using the DECIMAL type in SeaTunnel, be aware that the maximum range cannot exceed 34 digits, which means you should use decimal(34, 18).
## Source Options | Name | Type | Required | Default | Description | |----------------------|---------|----------|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | uri | String | Yes | - | The MongoDB standard connection uri. eg. mongodb://user:password@hosts:27017/database?readPreference=secondary&slaveOk=true. | | database | String | Yes | - | The name of MongoDB database to read or write. | | collection | String | Yes | - | The name of MongoDB collection to read or write. | | schema | String | Yes | - | MongoDB's BSON and seatunnel data structure mapping. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). | | match.query | String | No | - | In MongoDB, filters are used to filter documents for query operations. | | match.projection | String | No | - | In MongoDB, Projection is used to control the fields contained in the query results. | | partition.split-key | String | No | _id | The key of Mongodb fragmentation. | | partition.split-size | Long | No | 64 * 1024 * 1024 | The size of Mongodb fragment. | | cursor.no-timeout | Boolean | No | true | MongoDB server normally times out idle cursors after an inactivity period (10 minutes) to prevent excess memory use. Set this option to true to prevent that. However, if the application takes longer than 30 minutes to process the current batch of documents, the session is marked as expired and closed. | | fetch.size | Int | No | 2048 | Set the number of documents obtained from the server for each batch. Setting the appropriate batch size can improve query performance and avoid the memory pressure caused by obtaining a large amount of data at one time. 
| | max.time-min | Long | No | 10 | This parameter is a MongoDB query option that limits the maximum execution time for query operations. The value of maxTimeMin is in minutes. If the execution time of the query exceeds the specified time limit, MongoDB will terminate the operation and return an error. | | flat.sync-string | Boolean | No | true | By utilizing flatSyncString, only one field attribute value can be set, and the field type must be a String. This operation will perform a string mapping on a single MongoDB data entry. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ### Tips > 1.The parameter `match.query` is compatible with the historical old version parameter `matchQuery`, and they are equivalent replacements.
## How to Create a MongoDB Data Synchronization Jobs The following example demonstrates how to create a data synchronization job that reads data from MongoDB and prints it on the local client: ```bash # Set the basic configuration of the task to be performed env { parallelism = 1 job.mode = "BATCH" } # Create a source to connect to Mongodb source { MongoDB { uri = "mongodb://user:password@127.0.0.1:27017" database = "test_db" collection = "source_table" schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_int = int c_bigint = bigint c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_int = int c_bigint = bigint c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp } } } } } # Console printing of the read Mongodb data sink { Console { parallelism = 1 } } ``` ## Parameter Interpretation ### MongoDB Database Connection URI Examples Unauthenticated single node connection: ```bash mongodb://192.168.0.100:27017/mydb ``` Replica set connection: ```bash mongodb://192.168.0.100:27017/mydb?replicaSet=xxx ``` Authenticated replica set connection: ```bash mongodb://admin:password@192.168.0.100:27017/mydb?replicaSet=xxx&authSource=admin ``` Multi-node replica set connection: ```bash mongodb://192.168.0.1:27017,192.168.0.2:27017,192.168.0.3:27017/mydb?replicaSet=xxx ``` Sharded cluster connection: ```bash mongodb://192.168.0.100:27017/mydb ``` Multiple mongos connections: ```bash mongodb://192.168.0.1:27017,192.168.0.2:27017,192.168.0.3:27017/mydb ``` Note: The username and password in the URI must be URL-encoded before being concatenated into the connection string. 
### MatchQuery Scan In data synchronization scenarios, the matchQuery approach needs to be used early to reduce the number of documents that need to be processed by subsequent operators, thus improving performance. Here is a simple example of a seatunnel using `match.query` ```bash source { MongoDB { uri = "mongodb://user:password@127.0.0.1:27017" database = "test_db" collection = "orders" match.query = "{status: \"A\"}" schema = { fields { id = bigint status = string } } } } ``` The following are examples of MatchQuery query statements of various data types: ```bash # Query Boolean type "{c_boolean:true}" # Query string type "{c_string:\"OCzCj\"}" # Query the integer "{c_int:2}" # Type of query time "{c_date:ISODate(\"2023-06-26T16:00:00.000Z\")}" # Query floating point type {c_double:{$gte:1.71763202185342e+308}} ``` Please refer to how to write the syntax of `match.query`:https://www.mongodb.com/docs/manual/tutorial/query-documents ### Projection Scan In MongoDB, Projection is used to control which fields are included in the query results. This can be accomplished by specifying which fields need to be returned and which fields do not. In the find() method, a projection object can be passed as a second argument. The key of the projection object indicates the fields to include or exclude, and a value of 1 indicates inclusion and 0 indicates exclusion. Here is a simple example, assuming we have a collection named users: ```bash # Returns only the name and email fields db.users.find({}, { name: 1, email: 0 }); ``` In data synchronization scenarios, projection needs to be used early to reduce the number of documents that need to be processed by subsequent operators, thus improving performance. 
Here is a simple example of a seatunnel using projection: ```bash source { MongoDB { uri = "mongodb://user:password@127.0.0.1:27017" database = "test_db" collection = "users" match.projection = "{ name: 1, email: 0 }" schema = { fields { name = string } } } } ``` ### Partitioned Scan To speed up reading data in parallel source task instances, seatunnel provides a partitioned scan feature for MongoDB collections. The following partitioning strategies are provided. Users can control data sharding by setting the partition.split-key for sharding keys and partition.split-size for sharding size. ```bash source { MongoDB { uri = "mongodb://user:password@127.0.0.1:27017" database = "test_db" collection = "users" partition.split-key = "id" partition.split-size = 1024 schema = { fields { id = bigint status = string } } } } ``` ### Flat Sync String By utilizing `flat.sync-string`, only one field attribute value can be set, and the field type must be a String. This operation will perform a string mapping on a single MongoDB data entry. 
```bash env { parallelism = 10 job.mode = "BATCH" } source { MongoDB { uri = "mongodb://user:password@127.0.0.1:27017" database = "test_db" collection = "users" flat.sync-string = true schema = { fields { data = string } } } } sink { Console {} } ``` Use the data samples synchronized with modified parameters, such as the following: ```json { "_id":{ "$oid":"643d41f5fdc6a52e90e59cbf" }, "c_map":{ "OQBqH":"jllt", "rkvlO":"pbfdf", "pCMEX":"hczrdtve", "DAgdj":"t", "dsJag":"voo" }, "c_array":[ { "$numberInt":"-865590937" }, { "$numberInt":"833905600" }, { "$numberInt":"-1104586446" }, { "$numberInt":"2076336780" }, { "$numberInt":"-1028688944" } ], "c_string":"bddkzxr", "c_boolean":false, "c_tinyint":{ "$numberInt":"39" }, "c_smallint":{ "$numberInt":"23672" }, "c_int":{ "$numberInt":"-495763561" }, "c_bigint":{ "$numberLong":"3768307617923954543" }, "c_float":{ "$numberDouble":"5.284220288280258E37" }, "c_double":{ "$numberDouble":"1.1706091642478246E308" }, "c_bytes":{ "$binary":{ "base64":"ZWJ4", "subType":"00" } }, "c_date":{ "$date":{ "$numberLong":"1686614400000" } }, "c_decimal":{ "$numberDecimal":"683265300" }, "c_timestamp":{ "$date":{ "$numberLong":"1684283772000" } }, "c_row":{ "c_map":{ "OQBqH":"cbrzhsktmm", "rkvlO":"qtaov", "pCMEX":"tuq", "DAgdj":"jzop", "dsJag":"vwqyxtt" }, "c_array":[ { "$numberInt":"1733526799" }, { "$numberInt":"-971483501" }, { "$numberInt":"-1716160960" }, { "$numberInt":"-919976360" }, { "$numberInt":"727499700" } ], "c_string":"oboislr", "c_boolean":true, "c_tinyint":{ "$numberInt":"-66" }, "c_smallint":{ "$numberInt":"1308" }, "c_int":{ "$numberInt":"-1573886733" }, "c_bigint":{ "$numberLong":"4877994302999518682" }, "c_float":{ "$numberDouble":"1.5353209063652051E38" }, "c_double":{ "$numberDouble":"1.1952441956458565E308" }, "c_bytes":{ "$binary":{ "base64":"cWx5Ymp0Yw==", "subType":"00" } }, "c_date":{ "$date":{ "$numberLong":"1686614400000" } }, "c_decimal":{ "$numberDecimal":"656406177" }, "c_timestamp":{ "$date":{ 
"$numberLong":"1684283772000" } } }, "id":{ "$numberInt":"2" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/MyHours.md ================================================ import ChangeLog from '../changelog/connector-http-myhours.md'; # My Hours > My Hours source connector ## Support Those Engines > Spark
> Flink
> SeaTunnel Zeta
## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description Used to read data from My Hours. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Supported DataSource Info In order to use the My Hours connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Dependency | |------------|--------------------|---------------------------------------------------------------------------------------------| | My Hours | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel) | ## Source Options | Name | Type | Required | Default | Description | |-----------------------------|---------|----------|---------|--------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | Http request url. | | email | String | Yes | - | My hours login email address. | | password | String | Yes | - | My hours login password. | | schema | Config | No | - | Http and seatunnel data structure mapping. 
For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). | | schema.fields | Config | No | - | The schema fields of upstream data | | json_field | Config | No | - | This parameter helps you configure the schema,so this parameter must be used with schema. | | content_json | String | No | - | This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. | | format | String | No | json | The format of upstream data, now only support `json` `text`, default `json`. | | method | String | No | get | Http request method, only supports GET, POST method. | | headers | Map | No | - | Http headers. | | params | Map | No | - | Http params. | | body | String | No | - | Http body. | | poll_interval_millis | Int | No | - | Request http api interval(millis) in stream mode. | | retry | Int | No | - | The max retry times if request http return to `IOException`. | | retry_backoff_multiplier_ms | Int | No | 100 | The retry-backoff times(millis) multiplier if request http failed. 
| | retry_backoff_max_ms | Int | No | 10000 | The maximum retry-backoff times(millis) if request http failed | | enable_multi_lines | Boolean | No | false | | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ## How to Create a My Hours Data Synchronization Jobs ```hocon env { parallelism = 1 job.mode = "BATCH" } MyHours{ url = "https://api2.myhours.com/api/Projects/getAll" email = "seatunnel@test.com" password = "seatunnel" schema { fields { name = string archived = boolean dateArchived = string dateCreated = string clientName = string budgetAlertPercent = string budgetType = int totalTimeLogged = double budgetValue = double totalAmount = double totalExpense = double laborCost = double totalCost = double billableTimeLogged = double totalBillableAmount = double billable = boolean roundType = int roundInterval = int budgetSpentPercentage = double budgetTarget = int budgetPeriodType = string budgetSpent = string id = string } } } # Console printing of the read data sink { Console { parallelism = 1 } } ``` ## Parameter Interpretation ### format when you assign format is `json`, you should also assign schema option, for example: upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | when you assign format is `text`, connector will do nothing for upstream data, for example: upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` connector will generate data as the following: | content | |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | ### content_json 
This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. If your return data looks something like this. ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can configure `content_field = "$.store.book.*"` and the result returned looks like this: ```json [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ] ``` Then you can get the desired result with a simpler schema,like ```hocon Http { url = "http://mockserver:1080/contentjson/mock" method = "GET" format = "json" content_field = "$.store.book.*" schema = { fields { category = string author = string title = string price = string } } } ``` Here is an example: - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). ### json_field This parameter helps you configure the schema,so this parameter must be used with schema. 
If your data looks something like this: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can get the contents of 'book' by configuring the task as follows: ```hocon source { Http { url = "http://mockserver:1080/jsonpath/mock" method = "GET" format = "json" json_field = { category = "$.store.book[*].category" author = "$.store.book[*].author" title = "$.store.book[*].title" price = "$.store.book[*].price" } schema = { fields { category = string author = string title = string price = string } } } } ``` - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). ## Changelog ================================================ FILE: docs/en/connectors/source/MySQL-CDC.md ================================================ import ChangeLog from '../changelog/connector-cdc-mysql.md'; # MySQL CDC > MySQL CDC source connector ## Support Those Engines > SeaTunnel Zeta
> Flink
## Description The MySQL CDC connector allows for reading snapshot data and incremental data from MySQL database. This document describes how to set up the MySQL CDC connector to run SQL queries against MySQL databases. ## Key features - [ ] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------|------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------|----------------------------------|----------------------------------------------------------------------| | MySQL |
  • [MySQL](https://dev.mysql.com/doc): 5.5, 5.6, 5.7, 8.0.x
  • [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x
  • | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java/8.0.28 | ## Using Dependency ### Install Jdbc Driver #### For Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. #### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ### Creating MySQL user You have to define a MySQL user with appropriate permissions on all databases that the Debezium MySQL connector monitors. 1. Create the MySQL user: ```sql mysql> CREATE USER 'user'@'localhost' IDENTIFIED BY 'password'; ``` 2. Grant the required permissions to the user: ```sql mysql> GRANT SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'user' IDENTIFIED BY 'password'; ``` 3. Finalize the user’s permissions: ```sql mysql> FLUSH PRIVILEGES; ``` ### Enabling the MySQL Binlog You must enable binary logging for MySQL replication. The binary logs record transaction updates for replication tools to propagate changes. 1. Check whether the `log-bin` option is already on: ```sql mysql> show variables where variable_name in ('log_bin', 'binlog_format', 'binlog_row_image', 'gtid_mode', 'enforce_gtid_consistency'); +--------------------------+----------------+ | Variable_name | Value | +--------------------------+----------------+ | binlog_format | ROW | | binlog_row_image | FULL | | enforce_gtid_consistency | ON | | gtid_mode | ON | | log_bin | ON | +--------------------------+----------------+ ``` 2. 
If the value of `log_bin` is not `on`, configure your MySQL server configuration file (`$MYSQL_HOME/mysql.cnf`) with the following properties, which are described in the table below: ``` # Enable binary replication log and set the prefix, expiration, and log format. # The prefix is arbitrary, expiration can be short for integration tests but would # be longer on a production system. Row-level info is required for ingest to work. # Server ID is required, but this will vary on production systems server-id = 223344 log_bin = mysql-bin expire_logs_days = 10 binlog_format = row # mysql 5.6+ requires binlog_row_image to be set to FULL binlog_row_image = FULL # optional enable gtid mode # mysql 5.6+ requires gtid_mode to be set to ON, but not required by mysql 8.0+ gtid_mode = on enforce_gtid_consistency = on ``` 3. Restart MySQL Server ```shell /etc/init.d/mysqld restart ``` 4. Confirm your changes by checking the binlog status once more: MySQL 5.5: ```sql mysql> show variables where variable_name in ('log_bin', 'binlog_format', 'binlog_row_image', 'gtid_mode', 'enforce_gtid_consistency'); +--------------------------+----------------+ | Variable_name | Value | +--------------------------+----------------+ | binlog_format | ROW | | log_bin | ON | +--------------------------+----------------+ ``` MySQL 5.6+: ```sql mysql> show variables where variable_name in ('log_bin', 'binlog_format', 'binlog_row_image', 'gtid_mode', 'enforce_gtid_consistency'); +--------------------------+----------------+ | Variable_name | Value | +--------------------------+----------------+ | binlog_format | ROW | | binlog_row_image | FULL | | enforce_gtid_consistency | ON | | gtid_mode | ON | | log_bin | ON | +--------------------------+----------------+ ``` MySQL 8.0+: ```sql show variables where variable_name in ('log_bin', 'binlog_format', 'binlog_row_image', 'gtid_mode', 'enforce_gtid_consistency') +--------------------------+----------------+ | Variable_name | Value | 
+--------------------------+----------------+ | binlog_format | ROW | | binlog_row_image | FULL | | enforce_gtid_consistency | OFF | | gtid_mode | OFF | | log_bin | ON | +--------------------------+----------------+ ``` ### Notes #### Setting up MySQL session timeouts When an initial consistent snapshot is made for large databases, your established connection could timeout while the tables are being read. You can prevent this behavior by configuring interactive_timeout and wait_timeout in your MySQL configuration file. - `interactive_timeout`: The number of seconds the server waits for activity on an interactive connection before closing it. See [MySQL’s documentation](https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_interactive_timeout) for more details. - `wait_timeout`: The number of seconds the server waits for activity on a non-interactive connection before closing it. See [MySQL’s documentation](https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_wait_timeout) for more details. *For more database settings see [Debezium MySQL Connector](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/mysql.adoc#setting-up-mysql)* ## Data Type Mapping | Mysql Data Type | SeaTunnel Data Type | |------------------------------------------------------------------------------------------------|---------------------| | BIT(1)
    TINYINT(1) | BOOLEAN | | TINYINT | TINYINT | | TINYINT UNSIGNED
    SMALLINT | SMALLINT | | SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | | INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | BIGINT | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(p, s)
    DECIMAL(p, s) UNSIGNED
    NUMERIC(p, s)
    NUMERIC(p, s) UNSIGNED | DECIMAL(p,s) | | FLOAT
    FLOAT UNSIGNED | FLOAT | | DOUBLE
    DOUBLE UNSIGNED
    REAL
    REAL UNSIGNED | DOUBLE | | CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    ENUM
    JSON
    ENUM | STRING | | DATE | DATE | | TIME(s) | TIME(s) | | DATETIME
    TIMESTAMP(s) | TIMESTAMP(s) | | BINARY
    VARBINARY
    BIT(p)
    TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    GEOMETRY | BYTES | ## Source Options | Name | Type | Required | Default | Description | |-------------------------------------------|----------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: `jdbc:mysql://localhost:3306/test`. | | username | String | Yes | - | Name of the database to use when connecting to the database server. | | password | String | Yes | - | Password to use when connecting to the database server. | | database-names | List | No | - | Database name of the database to monitor. | | database-pattern | String | No | .* | The database names RegEx of the database to capture, for example: `database_prefix.*`. | | table-names | List | Yes | - | Table name of the database to monitor. The table name needs to include the database name, for example: `database_name.table_name` | | table-pattern | String | Yes | - | The table names RegEx of the database to capture. The table name needs to include the database name, for example: `database.*\\.table_.*` | | table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys": ["key1"],"snapshotSplitColumn": "key2"}] | | startup.mode | Enum | No | INITIAL | Optional startup mode for MySQL CDC consumer, valid enumerations are `initial`, `earliest`, `latest` , `specific` and `timestamp`.
    `initial`: Synchronize historical data at startup, and then synchronize incremental data.
    `earliest`: Startup from the earliest offset possible.
    `latest`: Startup from the latest offset.
    `specific`: Startup from user-supplied specific offsets.
    `timestamp`: Startup from user-supplied timestamp. | | startup.specific-offset.file | String | No | - | Start from the specified binlog file name. **Note, This option is required when the `startup.mode` option used `specific`.** | | startup.specific-offset.pos | Long | No | - | Start from the specified binlog file position. **Note, This option is required when the `startup.mode` option used `specific`.** | | startup.timestamp | Long | No | - | Start from the specified timestamp. **Note, This option is required when the `startup.mode` option used `timestamp`.** | | stop.mode | Enum | No | NEVER | Optional stop mode for MySQL CDC consumer, valid enumerations are `never`, `latest` or `specific`.
    `never`: The real-time job never stops the source.
    `latest`: Stop from the latest offset.
    `specific`: Stop from user-supplied specific offset. | | stop.specific-offset.file | String | No | - | Stop from the specified binlog file name. **Note, This option is required when the `stop.mode` option used `specific`.** | | stop.specific-offset.pos | Long | No | - | Stop from the specified binlog file position. **Note, This option is required when the `stop.mode` option used `specific`.** | | snapshot.split.size | Integer | No | 8096 | The split size (number of rows) of table snapshot, captured tables are split into multiple splits when read the snapshot of table. | | snapshot.fetch.size | Integer | No | 1024 | The maximum fetch size for per poll when read table snapshot. | | server-id | String | No | - | A numeric ID or a numeric ID range of this database client, The numeric ID syntax is like `5400`, the numeric ID range syntax is like '5400-5408'.
    Every ID must be unique across all currently-running database processes in the MySQL cluster. This connector joins the
    MySQL cluster as another server (with this unique ID) so it can read the binlog.
    By default, a random number is generated between 6500 and 2,148,492,146, though we recommend setting an explicit value. | | server-time-zone | String | No | UTC | The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone. | | connect.timeout.ms | Duration | No | 30000 | The maximum time that the connector should wait after trying to connect to the database server before timing out. | | connect.max-retries | Integer | No | 3 | The max retry times that the connector should retry to build database server connection. | | connection.pool.size | Integer | No | 20 | The jdbc connection pool size. | | chunk-key.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. | | chunk-key.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. 
The default value is 0.05. | | sample-sharding.threshold | Integer | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | | inverse-sampling.rate | Integer | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | | exactly_once | Boolean | No | false | Enable exactly once semantic. | | format | Enum | No | DEFAULT | Optional output format for MySQL CDC, valid enumerations are `DEFAULT`、`COMPATIBLE_DEBEZIUM_JSON`. | | schema-changes.enabled | Boolean | No | false | Schema evolution is disabled by default. Now we only support `add column`、`drop column`、`rename column` and `modify column`. | | debezium | Config | No | - | Pass-through [Debezium's properties](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/mysql.adoc#connector-properties) to Debezium Embedded Engine which is used to capture data changes from MySQL server. | | int_type_narrowing | Boolean | No | true | Int type narrowing, if true, the tinyint(1) type will be narrowed to the boolean type if without loss of precision. Support for MySQL at now. 
Please refer to `int_type_narrowing` below | | common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ### int_type_narrowing Int type narrowing, if true, the tinyint(1) type will be narrowed to the boolean type if without loss of precision. Support for MySQL at now. eg: int_type_narrowing = true | MySQL | SeaTunnel | |------------|-----------| | TINYINT(1) | Boolean | int_type_narrowing = false | MySQL | SeaTunnel | |------------|-----------| | TINYINT(1) | TINYINT | ## Task Example ### Simple > Support multi-table reading ``` env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 10000 } source { MySQL-CDC { url = "jdbc:mysql://localhost:3306/testdb" username = "root" password = "root@123" table-names = ["testdb.table1", "testdb.table2"] startup.mode = "initial" } } sink { Console { } } ``` ### Support debezium-compatible format send to kafka > Must be used with kafka connector sink, see [compatible debezium format](../formats/cdc-compatible-debezium-json.md) for details ### Support custom primary key for table ``` env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 10000 } source { MySQL-CDC { url = "jdbc:mysql://localhost:3306/testdb" username = "root" password = "root@123" table-names = ["testdb.table1", "testdb.table2"] table-names-config = [ { table = "testdb.table2" primaryKeys = ["id"] } ] } } sink { Console { } } ``` ### Support schema evolution ``` env { # You can set engine configuration here parallelism = 5 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { jdbc { url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" driver = "com.mysql.cj.jdbc.Driver" user = 
"st_user_sink" password = "mysqlpw" generate_sink_sql = true database = shop table = mysql_cdc_e2e_sink_table_with_schema_change_exactly_once primary_keys = ["id"] is_exactly_once = true xa_data_source_class_name = "com.mysql.cj.jdbc.MysqlXADataSource" } } ``` ### Support table-pattern for multi-table reading > `table-pattern` and `table-names` are mutually exclusive ```hocon env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652 username = "st_user_source" password = "mysqlpw" database-pattern = "source.*" table-pattern = "source.*\\..*" url = "jdbc:mysql://mysql_cdc_e2e:3306" } } sink { Console { } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Mysql.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # MySQL > JDBC Mysql Source Connector ## Description Read external data source data through JDBC. ## Support Mysql Version - 5.5/5.6/5.7/8.0/8.1/8.2/8.3/8.4 ## Support Those Engines > Spark
    > Flink
    > SeaTunnel Zeta
    ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table reading](../../introduction/concepts/connector-v2-features.md) > supports query SQL and can achieve projection effect. ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------|----------------------------------------------------------|--------------------------|---------------------------------------|---------------------------------------------------------------------------| | Mysql | Different dependency version has different driver class. | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | [Download](https://mvnrepository.com/artifact/mysql/mysql-connector-java) | ## Data Type Mapping | Mysql Data Type | SeaTunnel Data Type | |-----------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------| | BIT(1)
    TINYINT(1) | BOOLEAN | | TINYINT | BYTE | | TINYINT UNSIGNED
    SMALLINT | SMALLINT | | SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | | INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | BIGINT | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | | DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | | DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
    (Gets the designated column's number of digits to right of the decimal point.)) | | FLOAT
    FLOAT UNSIGNED | FLOAT | | DOUBLE
    DOUBLE UNSIGNED | DOUBLE | | CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    JSON
    ENUM | STRING | | DATE | DATE | | TIME(s) | TIME(s) | | DATETIME
    TIMESTAMP(s) | TIMESTAMP(s) | | TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    BINARY
    VARBINARY
    BIT(n)
    GEOMETRY | BYTES | ## Source Options | Name | Type | Required | Default | Description | |--------------------------------------------|------------|----------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:mysql://localhost:3306:3306/test | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
    if you use MySQL the value is `com.mysql.cj.jdbc.Driver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | Yes | - | Query statement | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | | partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. | | partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | | partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | | partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | | fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
    the row fetch size used in the query to improve performance by
    reducing the number of database hits required to satisfy the selection criteria.
    Zero means use jdbc default value. | | properties | Map | No | - | Additional connection configuration parameters,when properties and URL have the same parameters, the priority is determined by the
    specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | use_regex | Boolean | No | false | Control regular expression matching for table_path. When set to `true`, the table_path will be treated as a regular expression pattern. When set to `false` or not specified, the table_path will be treated as an exact path (no regex matching). | | table_path | String | No | - | The path to the full path of table, you can use this configuration instead of `query`.
    example:
    "testdb.table1" | | table_list | Array | No | - | The list of tables to be read, you can use this configuration instead of `table_path` example: ```[{ table_path = "testdb.table1"}, {table_path = "testdb.table2", query = "select id, name from testdb.table2"}]``` | | where_condition | String | No | - | Common row filter conditions for all tables/queries, must start with `where`. for example `where id > 100` | | split.size | Int | No | 8096 | The split size (number of rows) of table, captured tables are split into multiple splits when read of table. | | split.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | | split.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0.
| | split.sample-sharding.threshold | Int | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `split.even-distribution.factor.upper-bound` and `split.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | | split.inverse-sampling.rate | Int | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | | int_type_narrowing | Boolean | No | true | Int type narrowing, if true, the tinyint(1) type will be narrowed to the boolean type if without loss of precision. Support for MySQL at now. Please refer to `int_type_narrowing` below | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ### int_type_narrowing Int type narrowing, if true, the tinyint(1) type will be narrowed to the boolean type if without loss of precision. Support for MySQL at now. eg: int_type_narrowing = true | MySQL | SeaTunnel | |------------|-----------| | TINYINT(1) | Boolean | int_type_narrowing = false | MySQL | SeaTunnel | |------------|-----------| | TINYINT(1) | TINYINT | ## Parallel Reader The JDBC Source connector supports parallel reading of data from tables.
SeaTunnel will use certain rules to split the data in the table, which will be handed over to readers for reading. The number of readers is determined by the `parallelism` option. **Split Key Rules:** 1. If `partition_column` is not null, it will be used to calculate the splits. The column must be in **Supported split data type**. 2. If `partition_column` is null, SeaTunnel will read the schema from the table and get the Primary Key and Unique Index. If there is more than one column in the Primary Key and Unique Index, the first column which is in the **supported split data type** will be used to split data. For example, if the table has Primary Key(nn guid, name varchar), because `guid` is not in **supported split data type**, the column `name` will be used to split data. **Supported split data type:** * String * Number(int, bigint, decimal, ...) * Date ### Options Related To Split #### split.size How many rows in one split, captured tables are split into multiple splits when reading the table. #### split.even-distribution.factor.lower-bound > Not recommended for use The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. #### split.even-distribution.factor.upper-bound > Not recommended for use The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed.
If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. #### split.sample-sharding.threshold This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `split.even-distribution.factor.upper-bound` and `split.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. #### split.inverse-sampling.rate The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. #### partition_column [string] The column name for split data. #### partition_upper_bound [BigDecimal] The partition_column max value for scan, if not set SeaTunnel will query database get max value. #### partition_lower_bound [BigDecimal] The partition_column min value for scan, if not set SeaTunnel will query database get min value.
#### partition_num [int] > Not recommended for use, The correct approach is to control the number of split through `split.size` How many splits do we need to split into, only support positive integer. default value is job parallelism. ## tips > If the table can not be split(for example, table have no Primary Key or Unique Index, and `partition_column` is not set), it will run in single concurrency. > > Use `table_path` to replace `query` for single table reading. If you need to read multiple tables, use `table_list`. > > When inferring a primary key based on a `query`, the key is inherited from the underlying table where the first column in the result set is located, and its strictness for the overall join result set is not guaranteed (for example, when the query contains joins or reads from multiple tables). ## Task Example ### Simple > This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. 
``` # Defining the runtime environment env { parallelism = 4 job.mode = "BATCH" } source{ Jdbc { url = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" query = "select * from type_bin limit 16" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} } ``` ### parallel by partition_column ``` env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" query = "select * from type_bin" partition_column = "id" split.size = 10000 # Read start boundary #partition_lower_bound = ... # Read end boundary #partition_upper_bound = ... 
} } sink { Console {} } ``` ### parallel by Primary Key or Unique Index > Configuring `table_path` will turn on auto split, you can configure `split.*` to adjust the split strategy ``` env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" table_path = "testdb.table1" query = "select * from testdb.table1" split.size = 10000 } } sink { Console {} } ``` ### Parallel Boundary > It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured ``` source { Jdbc { url = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" # Define query logic as required query = "select * from type_bin" partition_column = "id" # Read start boundary partition_lower_bound = 1 # Read end boundary partition_upper_bound = 500 partition_num = 10 properties { useSSL=false } } } ``` ### Multiple table read ***Configuring `table_list` will turn on auto split, you can configure `split.*` to adjust the split strategy*** ```hocon env { job.mode = "BATCH" parallelism = 4 } source { Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" table_list = [ { table_path = "testdb.table1" }, { table_path = "testdb.table2" # Use query filter rows & columns query = "select id, name from testdb.table2 where id > 100" } ] #where_condition= "where id > 100" #split.size = 8096 #split.even-distribution.factor.upper-bound = 100 #split.even-distribution.factor.lower-bound = 0.05 #split.sample-sharding.threshold = 1000
#split.inverse-sampling.rate = 1000 } } sink { Console {} } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Neo4j.md ================================================ import ChangeLog from '../changelog/connector-neo4j.md'; # Neo4j > Neo4j source connector ## Description Read data from Neo4j. `neo4j-java-driver` version 4.4.9 ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |----------------------------|--------|----------|---------------| | uri | String | Yes | - | | username | String | No | - | | password | String | No | - | | bearer_token | String | No | - | | kerberos_ticket | String | No | - | | database | String | Yes | - | | query | String | Yes | - | | schema | Object | Yes | - | | max_transaction_retry_time | Long | No | 30 | | max_connection_timeout | Long | No | 30 | ### uri [string] The URI of the Neo4j database. Refer to a case: `neo4j://localhost:7687` ### username [string] username of the Neo4j ### password [string] password of the Neo4j. required if `username` is provided ### bearer_token [string] base64 encoded bearer token of the Neo4j. for Auth. ### kerberos_ticket [string] base64 encoded kerberos ticket of the Neo4j. for Auth. ### database [string] database name. ### query [string] Query statement. ### schema.fields [string] returned fields of `query` see [column projection](../../introduction/concepts/connector-v2-features.md) ### max_transaction_retry_time [long] maximum transaction retry time(seconds). 
transaction fail if exceeded ### max_connection_timeout [long] The maximum amount of time to wait for a TCP connection to be established (seconds) ## Example ``` source { Neo4j { uri = "neo4j://localhost:7687" username = "neo4j" password = "1234" database = "neo4j" max_transaction_retry_time = 1 max_connection_timeout = 1 query = "MATCH (a:Person) RETURN a.name, a.age" schema { fields { a.age=INT a.name=STRING } } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Notion.md ================================================ import ChangeLog from '../changelog/connector-http-notion.md'; # Notion > Notion source connector ## Description Used to read data from Notion. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------------|---------|----------|---------------| | url | String | Yes | - | | password | String | Yes | - | | version | String | Yes | - | | method | String | No | get | | schema.fields | Config | No | - | | format | String | No | json | | params | Map | No | - | | body | String | No | - | | json_field | Config | No | - | | content_json | String | No | - | | poll_interval_millis | int | No | - | | retry | int | No | - | | retry_backoff_multiplier_ms | int | No | 100 | | retry_backoff_max_ms | int | No | 10000 | | enable_multi_lines | boolean | No | false | | common-options | config | No | - | ### url [String] http request url ### password [String] API key for login, you can get more detail at this link: 
https://developers.notion.com/docs/authorization ### version [String] The Notion API is versioned. API versions are named for the date the version is released ### method [String] http request method, only supports GET, POST method ### params [Map] http params ### body [String] http body ### poll_interval_millis [int] request http api interval(millis) in stream mode ### retry [int] The max retry times if request http return to `IOException` ### retry_backoff_multiplier_ms [int] The retry-backoff times(millis) multiplier if request http failed ### retry_backoff_max_ms [int] The maximum retry-backoff times(millis) if request http failed ### format [String] the format of upstream data, now only support `json` `text`, default `json`. when you assign format is `json`, you should also assign schema option, for example: upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | when you assign format is `text`, connector will do nothing for upstream data, for example: upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` connector will generate data as the following: | content | |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | ### schema [Config] #### fields [Config] The schema fields of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). ### content_json [String] This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. If your return data looks something like this. 
```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can configure `content_field = "$.store.book.*"` and the result returned looks like this: ```json [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ] ``` Then you can get the desired result with a simpler schema,like ```hocon Http { url = "http://mockserver:1080/contentjson/mock" method = "GET" format = "json" content_field = "$.store.book.*" schema = { fields { category = string author = string title = string price = string } } } ``` Here is an example: - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). ### json_field [Config] This parameter helps you configure the schema,so this parameter must be used with schema. 
If your data looks something like this: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can get the contents of 'book' by configuring the task as follows: ```hocon source { Http { url = "http://mockserver:1080/jsonpath/mock" method = "GET" format = "json" json_field = { category = "$.store.book[*].category" author = "$.store.book[*].author" title = "$.store.book[*].title" price = "$.store.book[*].price" } schema = { fields { category = string author = string title = string price = string } } } } ``` - Test data can be found at this link [mockserver-config.json](https://github.com/apache/seatunnel/blob/dev/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_jsonpath_to_assert.conf](https://github.com/apache/seatunnel/blob/dev/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). 
### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Example ```hocon Notion { url = "https://api.notion.com/v1/users" password = "SeaTunnel-test" version = "2022-06-28" content_field = "$.results.*" schema = { fields { object = string id = string type = string person = { email = string } avatar_url = string } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/ObsFile.md ================================================ import ChangeLog from '../changelog/connector-file-obs.md'; # ObsFile > Obs file source connector ## Support those engines > Spark > > Flink > > Seatunnel Zeta ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] markdown ## Description Read data from huawei cloud obs file system. If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. 
You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. We made some trade-offs in order to support more file types, so we used the HDFS protocol for internal access to OBS and this connector need some hadoop dependencies. It only supports hadoop version **2.9.X+**. ## Required Jar List | jar | supported versions | maven | |--------------------|-----------------------------|--------------------------------------------------------------------------------------------------------| | hadoop-huaweicloud | support version >= 3.1.1.29 | [Download](https://repo.huaweicloud.com/artifactory/sdk_public/org/apache/hadoop/hadoop-huaweicloud/) | | esdk-obs-java | support version >= 3.19.7.3 | [Download](https://repo.huaweicloud.com/artifactory/sdk_public/com/huawei/storage/esdk-obs-java/) | | okhttp | support version >= 3.11.0 | [Download](https://repo1.maven.org/maven2/com/squareup/okhttp3/okhttp/) | | okio | support version >= 1.14.0 | [Download](https://repo1.maven.org/maven2/com/squareup/okio/okio/) | > Please download the support list corresponding to 'Maven' and copy them to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory. 
> > And copy all jars to $SEATUNNEL_HOME/lib/ ## Options | name | type | required | default | description | |----------------------------|---------|----------|---------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | path | string | yes | - | The target dir path | | file_format_type | string | yes | - | File type.[Tips](#file_format_type) | | bucket | string | yes | - | The bucket address of obs file system, for example: `obs://obs-bucket-name` | | access_key | string | yes | - | The access key of obs file system | | access_secret | string | yes | - | The access secret of obs file system | | endpoint | string | yes | - | The endpoint of obs file system | | read_columns | list | yes | - | The read column list of the data source, user can use it to implement field projection.[Tips](#read_columns) | | delimiter | string | no | \001 | Field delimiter, used to tell connector how to slice and dice fields when reading text files | | row_delimiter | string | no | \n | Row delimiter, used to tell connector how to slice and dice rows when reading text files. Default is `\n` for text files. | | parse_partition_from_path | boolean | no | true | Control whether parse the partition keys and values from file path. [Tips](#parse_partition_from_path) | | skip_header_row_number | long | no | 0 | Skip the first few lines, but only for the txt and csv. 
| | date_format | string | no | yyyy-MM-dd | Date type format, used to tell the connector how to convert string to date.[Tips](#date_format) | | datetime_format | string | no | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell the connector how to convert string to datetime.[Tips](#datetime_format) | | time_format | string | no | HH:mm:ss | Time type format, used to tell the connector how to convert string to time.[Tips](#time_format) | | filename_extension | string | no | - | Filter filename extension, which used for filtering files with specific extension. Example: `csv` `.txt` `json` `.xml`. | | schema | config | no | - | [Tips](#schema) | | common-options | | no | - | [Tips](#common_options) | | sheet_name | string | no | - | Reader the sheet of the workbook,Only used when file_format is excel. | | file_filter_modified_start | string | no | - | File modification time filter. The connector will filter some files base on the last modification start time (include start time). The default data format is `yyyy-MM-dd HH:mm:ss`. | | file_filter_modified_end | string | no | - | File modification time filter. The connector will filter some files base on the last modification end time (not include end time). The default data format is `yyyy-MM-dd HH:mm:ss`. | | quote_char | string | no | " | A single character that encloses CSV fields, allowing fields with commas, line breaks, or quotes to be read correctly. | | escape_char | string | no | - | A single character that allows the quote or other special characters to appear inside a CSV field without ending the field. 
| ### Tips #### parse_partition_from_path > Control whether parse the partition keys and values from file path > > For example if you read a file from path `obs://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` > > Every record data from the file will be added these two fields: | name | age | |---------------|-----| | tyrantlucifer | 26 | > Do not define partition fields in schema option #### date_format > Date type format, used to tell the connector how to convert string to date, supported as the following formats: > > `yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` > > default `yyyy-MM-dd` ### datetime_format > Datetime type format, used to tell the connector how to convert string to datetime, supported as the following formats: > > `yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` > > default `yyyy-MM-dd HH:mm:ss` ### time_format > Time type format, used to tell the connector how to convert string to time, supported as the following formats: > > `HH:mm:ss` `HH:mm:ss.SSS` > > default `HH:mm:ss` ### skip_header_row_number > Skip the first few lines, but only for the txt and csv. > > For example, set like following: > > `skip_header_row_number = 2` > > Then Seatunnel will skip the first 2 lines from source files ### file_format_type > File type, supported as the following file types: > > `text` `csv` `parquet` `orc` `json` `excel` `markdown` > > If you assign file type to `json`, you should also assign schema option to tell the connector how to parse data to the row you want. 
> > For example,upstream data is the following: > > ```json > > ``` {"code": 200, "data": "get success", "success": true} ``` > You can also save multiple pieces of data in one file and split them by one newline: ```json lines {"code": 200, "data": "get success", "success": true} {"code": 300, "data": "get failed", "success": false} ``` > you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` > connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | > If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. > > If you assign file type to `text` `csv`, you can choose to specify the schema information or not. > > For example, upstream data is the following: ```text tyrantlucifer#26#male ``` > If you do not assign data schema connector will treat the upstream data as the following: | content | |-----------------------| | tyrantlucifer#26#male | > If you assign data schema, you should also assign the option `delimiter` too except CSV file type > > you should assign schema and delimiter as the following: ```hocon delimiter = "#" schema { fields { name = string age = int gender = string } } ``` > connector will generate data as the following: | name | age | gender | |---------------|-----|--------| | tyrantlucifer | 26 | male | > If you assign file type to `markdown`, SeaTunnel can parse markdown files and extract structured data. > The markdown parser extracts various elements including headings, paragraphs, lists, code blocks, tables, and more. > Each element is converted to a row with the following schema: > - `element_id`: Unique identifier for the element > - `element_type`: Type of the element (Heading, Paragraph, ListItem, etc.) 
> - `heading_level`: Level of heading (1-6, null for non-heading elements) > - `text`: Text content of the element > - `page_number`: Page number (default: 1) > - `position_index`: Position index within the document > - `parent_id`: ID of the parent element > - `child_ids`: Comma-separated list of child element IDs > > Note: Markdown format only supports reading, not writing. #### schema ##### fields > The schema of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). #### read_columns > The read column list of the data source, user can use it to implement field projection. > > The file type supported column projection as the following shown: - text - json - csv - orc - parquet - excel > If the user wants to use this feature when reading `text` `json` `csv` files, the schema option must be configured #### common options > Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. 
## Task Example ### text file > For text file format simple config ```hocon ObsFile { path = "/seatunnel/text" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "obs.xxxxxx.myhuaweicloud.com" file_format_type = "text" } ``` ### parquet file > For parquet file format simple config ```hocon ObsFile { path = "/seatunnel/parquet" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "obs.xxxxxx.myhuaweicloud.com" file_format_type = "parquet" } ``` ### orc file > For orc file format simple config ```hocon ObsFile { path = "/seatunnel/orc" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "obs.xxxxxx.myhuaweicloud.com" file_format_type = "orc" } ``` ### json file > For json file format simple config ```hocon ObsFile { path = "/seatunnel/json" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "obs.xxxxxx.myhuaweicloud.com" file_format_type = "json" } ``` ### excel file > For excel file format simple config ```hocon ObsFile { path = "/seatunnel/excel" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "obs.xxxxxx.myhuaweicloud.com" file_format_type = "excel" } ``` ### csv file > For csv file format simple config ```hocon ObsFile { path = "/seatunnel/csv" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "obs.xxxxxx.myhuaweicloud.com" file_format_type = "csv" delimiter = "," } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/OceanBase.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # OceanBase > JDBC OceanBase Source Connector ## Support Those Engines > Spark
    > Flink
    > SeaTunnel Zeta
    ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description Read external data source data through JDBC. ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------|--------------------------------|---------------------------|--------------------------------------|-------------------------------------------------------------------------------| | OceanBase | All OceanBase server versions. | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2883/test | [Download](https://mvnrepository.com/artifact/com.oceanbase/oceanbase-client) | ## Database Dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
    > For example: cp oceanbase-client-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping ### Mysql Mode | Mysql Data type | SeaTunnel Data type | |-----------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| | BIT(1)
    TINYINT(1) | BOOLEAN | | TINYINT | BYTE | | TINYINT UNSIGNED
    SMALLINT | SMALLINT | | SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | | INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | BIGINT | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | | DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | | DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
    (Gets the designated column's number of digits to the right of the decimal point.)) | | FLOAT
    FLOAT UNSIGNED | FLOAT | | DOUBLE
    DOUBLE UNSIGNED | DOUBLE | | CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    JSON
    ENUM | STRING | | DATE | DATE | | TIME | TIME | | DATETIME
    TIMESTAMP | TIMESTAMP | | TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    BINARY
    VARBINARY
    BIT(n)
    GEOMETRY | BYTES | ### Oracle Mode | Oracle Data type | SeaTunnel Data type | |-----------------------------------------------------------------------------------------------------|---------------------| | Integer | DECIMAL(38,0) | | Number(p), p <= 9 | INT | | Number(p), p <= 18 | BIGINT | | Number(p), p > 18 | DECIMAL(38,18) | | Number(p,s) | DECIMAL(p,s) | | Float | DECIMAL(38,18) | | REAL
    BINARY_FLOAT | FLOAT | | BINARY_DOUBLE | DOUBLE | | CHAR
    NCHAR
    VARCHAR
    VARCHAR2
    NVARCHAR2
    NCLOB
    CLOB
    LONG
    XML
    ROWID | STRING | | DATE | TIMESTAMP | | TIMESTAMP
    TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | | BLOB
    RAW
    LONG RAW
    BFILE | BYTES | | UNKNOWN | Not supported yet | ## Source Options | Name | Type | Required | Default | Description | |------------------------------|------------|----------|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oceanbase://localhost:2883/test | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source, should be `com.oceanbase.jdbc.Driver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | compatible_mode | String | Yes | - | The compatible mode of OceanBase, can be 'mysql' or 'oracle'. | | query | String | Yes | - | Query statement | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | | partition_column | String | No | - | The column name for parallelism's partition, only support numeric type column and string type column. | | partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | | partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | | partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. Default value is job parallelism. | | fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure
    the row fetch size used in the query to improve performance by
    reducing the number of database hits required to satisfy the selection criteria.
    Zero means use the JDBC default value. | | properties | Map | No | - | Additional connection configuration parameters. When properties and URL have the same parameters, the priority is determined by the
    specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ### Tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. ## Task Example ### Simple ``` env { parallelism = 2 job.mode = "BATCH" } source { Jdbc { driver = "com.oceanbase.jdbc.Driver" url = "jdbc:oceanbase://localhost:2883/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" username = "root" password = "" compatible_mode = "mysql" query = "select * from source" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform/sql } sink { Console {} } ``` ### Parallel > Read your query table in parallel with the shard field you configured and the shard data. 
You can do this if you want to read the whole table ``` env { parallelism = 10 job.mode = "BATCH" } source { Jdbc { driver = "com.oceanbase.jdbc.Driver" url = "jdbc:oceanbase://localhost:2883/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" username = "root" password = "" compatible_mode = "mysql" query = "select * from source" # Parallel sharding reads fields partition_column = "id" # Number of fragments partition_num = 10 } } sink { Console {} } ``` ### Parallel Boundary > It is more efficient to read your data source according to the upper and lower boundaries you configured ``` source { Jdbc { driver = "com.oceanbase.jdbc.Driver" url = "jdbc:oceanbase://localhost:2883/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" username = "root" password = "" compatible_mode = "mysql" query = "select * from source" partition_column = "id" partition_num = 10 # Read start boundary partition_lower_bound = 1 # Read end boundary partition_upper_bound = 500 } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/OneSignal.md ================================================ import ChangeLog from '../changelog/connector-http-onesignal.md'; # OneSignal > OneSignal source connector ## Description Used to read data from OneSignal. 
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------------|---------|----------|---------------| | url | String | Yes | - | | password | String | Yes | - | | method | String | No | get | | schema | Config | No | - | | schema.fields | Config | No | - | | format | String | No | json | | params | Map | No | - | | body | String | No | - | | json_field | Config | No | - | | content_json | String | No | - | | poll_interval_millis | int | No | - | | retry | int | No | - | | retry_backoff_multiplier_ms | int | No | 100 | | retry_backoff_max_ms | int | No | 10000 | | enable_multi_lines | boolean | No | false | | common-options | config | No | - | ### url [String] http request url ### password [String] Auth key for login, you can get more detail at this link: https://documentation.onesignal.com/docs/accounts-and-keys#user-auth-key ### method [String] http request method, only supports GET, POST method ### params [Map] http params ### body [String] http body ### poll_interval_millis [int] request http api interval(millis) in stream mode ### retry [int] The max retry times if request http return to `IOException` ### retry_backoff_multiplier_ms [int] The retry-backoff times(millis) multiplier if request http failed ### retry_backoff_max_ms [int] The maximum retry-backoff times(millis) if request http failed ### format [String] the format of upstream data, now only support `json` `text`, default `json`. 
when you assign format is `json`, you should also assign schema option, for example: upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | when you assign format is `text`, connector will do nothing for upstream data, for example: upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` connector will generate data as the following: | content | |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | ### schema [Config] #### fields [Config] The schema fields of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). ### content_json [String] This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. If your return data looks something like this. 
```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can configure `content_field = "$.store.book.*"` and the result returned looks like this: ```json [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ] ``` Then you can get the desired result with a simpler schema,like ```hocon Http { url = "http://mockserver:1080/contentjson/mock" method = "GET" format = "json" content_field = "$.store.book.*" schema = { fields { category = string author = string title = string price = string } } } ``` Here is an example: - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). ### json_field [Config] This parameter helps you configure the schema,so this parameter must be used with schema. 
If your data looks something like this: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can get the contents of 'book' by configuring the task as follows: ```hocon source { Http { url = "http://mockserver:1080/jsonpath/mock" method = "GET" format = "json" json_field = { category = "$.store.book[*].category" author = "$.store.book[*].author" title = "$.store.book[*].title" price = "$.store.book[*].price" } schema = { fields { category = string author = string title = string price = string } } } } ``` - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). 
### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Example ```hocon OneSignal { url = "https://onesignal.com/api/v1/apps" password = "SeaTunnel-test" schema = { fields { id = string name = string gcm_key = string chrome_key = string chrome_web_key = string chrome_web_origin = string chrome_web_gcm_sender_id = string chrome_web_default_notification_icon = string chrome_web_sub_domain = string apns_env = string apns_certificates = string apns_p8 = string apns_team_id = string apns_key_id = string apns_bundle_id = string safari_apns_certificate = string safari_site_origin = string safari_push_id = string safari_icon_16_16 = string safari_icon_32_32 = string safari_icon_64_64 = string safari_icon_128_128 = string safari_icon_256_256 = string site_name = string created_at = string updated_at = string players = int messageable_players = int basic_auth_key = string additional_data_is_root_payload = string } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/OpenMldb.md ================================================ import ChangeLog from '../changelog/connector-openmldb.md'; # OpenMldb > OpenMldb source connector ## Description Used to read data from OpenMldb. 
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------|---------|----------|---------------| | cluster_mode | boolean | yes | - | | sql | string | yes | - | | database | string | yes | - | | host | string | no | - | | port | int | no | - | | zk_path | string | no | - | | zk_host | string | no | - | | session_timeout | int | no | 10000 | | request_timeout | int | no | 60000 | | common-options | | no | - | ### cluster_mode [boolean] Whether OpenMldb runs in cluster mode ### sql [string] Sql statement ### database [string] Database name ### host [string] OpenMldb host, only supported on OpenMldb single mode ### port [int] OpenMldb port, only supported on OpenMldb single mode ### zk_host [string] Zookeeper host, only supported on OpenMldb cluster mode ### zk_path [string] Zookeeper path, only supported on OpenMldb cluster mode ### session_timeout [int] OpenMldb session timeout(ms), default 10000 ### request_timeout [int] OpenMldb request timeout(ms), default 60000 ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Example ```hocon OpenMldb { host = "172.17.0.2" port = 6527 sql = "select * from demo_table1" database = "demo_db" cluster_mode = false } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Opengauss-CDC.md ================================================ import ChangeLog from '../changelog/connector-cdc-opengauss.md'; # Opengauss CDC > 
Opengauss CDC source connector ## Support Those Engines > SeaTunnel Zeta
    > Flink
    ## Key features - [ ] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description The Opengauss CDC connector allows for reading snapshot data and incremental data from Opengauss database. This document describes how to set up the Opengauss CDC connector to run SQL queries against Opengauss databases. ## Using steps > Here are the steps to enable CDC (Change Data Capture) in Opengauss: 1. Ensure the wal_level is set to logical, you can use SQL commands to modify the configuration directly: ```sql ALTER SYSTEM SET wal_level TO 'logical'; SELECT pg_reload_conf(); ``` 2. Change the REPLICA policy of the specified table to FULL ```sql ALTER TABLE your_table_name REPLICA IDENTITY FULL; ``` If you have multi tables,you can use the result of this sql to change the REPLICA policy of all tables to FULL ```sql select 'ALTER TABLE ' || schemaname || '.' || tablename || ' REPLICA IDENTITY FULL;' from pg_tables where schemaname = 'YourTableSchema' ``` ## Data Type Mapping | Opengauss Data type | SeaTunnel Data type | |-----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| | BOOL
    | BOOLEAN | | BYTEA
    | BYTES | | INT2
    SMALLSERIAL
    INT4
    SERIAL
    | INT | | INT8
    BIGSERIAL
    | BIGINT | | FLOAT4
    | FLOAT | | FLOAT8
    | DOUBLE | | NUMERIC(Get the designated column's specified column size>0) | DECIMAL(Get the designated column's specified column size,Gets the number of digits in the specified column to the right of the decimal point) | | NUMERIC(Get the designated column's specified column size<0) | DECIMAL(38, 18) | | BPCHAR
    CHARACTER
    VARCHAR
    TEXT
    GEOMETRY
    GEOGRAPHY
    JSON
    JSONB | STRING | | TIMESTAMP
    | TIMESTAMP | | TIME
    | TIME | | DATE
    | DATE | | OTHER DATA TYPES | NOT SUPPORTED YET | ## Source Options | Name | Type | Required | Default | Description | |-------------------------------------------|----------|----------|----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: `jdbc:postgresql://localhost:5432/postgres_cdc?loggerLevel=OFF`. | | username | String | Yes | - | Username of the database to use when connecting to the database server. | | password | String | Yes | - | Password to use when connecting to the database server. | | database-names | List | No | - | Database name of the database to monitor. | | table-names | List | Yes | - | Table name of the database to monitor. The table name needs to include the database name, for example: `database_name.table_name` | | table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys":["key1"]}] | | startup.mode | Enum | No | INITIAL | Optional startup mode for Opengauss CDC consumer, valid enumerations are `initial`, `earliest`, `latest`.
    `initial`: Synchronize historical data at startup, and then synchronize incremental data.
    `earliest`: Startup from the earliest offset possible.
    `latest`: Startup from the latest offset. | | snapshot.split.size | Integer | No | 8096 | The split size (number of rows) of table snapshot, captured tables are split into multiple splits when read the snapshot of table. | | snapshot.fetch.size | Integer | No | 1024 | The maximum fetch size for per poll when read table snapshot. | | slot.name | String | No | - | The name of the Opengauss logical decoding slot that was created for streaming changes from a particular plug-in for a particular database/schema. The server uses this slot to stream events to the connector that you are configuring. Default is seatunnel. | | decoding.plugin.name | String | No | pgoutput | The name of the Postgres logical decoding plug-in installed on the server,Supported values are decoderbufs, wal2json, wal2json_rds, wal2json_streaming,wal2json_rds_streaming and pgoutput. | | server-time-zone | String | No | UTC | The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone. | | connect.timeout.ms | Duration | No | 30000 | The maximum time that the connector should wait after trying to connect to the database server before timing out. | | connect.max-retries | Integer | No | 3 | The max retry times that the connector should retry to build database server connection. | | connection.pool.size | Integer | No | 20 | The jdbc connection pool size. | | chunk-key.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. 
Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. | | chunk-key.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | | sample-sharding.threshold | Integer | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | | inverse-sampling.rate | Integer | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. 
The default value is 1000. | | exactly_once | Boolean | No | false | Enable exactly once semantic. | | format | Enum | No | DEFAULT | Optional output format for Opengauss CDC, valid enumerations are `DEFAULT`, `COMPATIBLE_DEBEZIUM_JSON`. | | debezium | Config | No | - | Pass-through [Debezium's properties](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/postgresql.adoc#connector-configuration-properties) to Debezium Embedded Engine which is used to capture data changes from Opengauss server. | | common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ## Task Example ### Simple > Support multi-table reading ``` env { # You can set engine configuration here execution.parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { Opengauss-CDC { plugin_output = "customers_opengauss_cdc" username = "gaussdb" password = "openGauss@123" database-names = ["opengauss_cdc"] schema-names = ["inventory"] table-names = ["opengauss_cdc.inventory.opengauss_cdc_table_1","opengauss_cdc.inventory.opengauss_cdc_table_2"] url = "jdbc:postgresql://opengauss_cdc_e2e:5432/opengauss_cdc" decoding.plugin.name = "pgoutput" } } transform { } sink { jdbc { plugin_input = "customers_opengauss_cdc" url = "jdbc:postgresql://opengauss_cdc_e2e:5432/opengauss_cdc" driver = "org.postgresql.Driver" username = "dailai" password = "openGauss@123" compatible_mode="postgresLow" generate_sink_sql = true # You need to configure both database and table database = "opengauss_cdc" schema = "inventory" tablePrefix = "sink_" primary_keys = ["id"] } } ``` ### Support custom primary key for table ``` source { Opengauss-CDC { plugin_output = "customers_opengauss_cdc" username = "gaussdb" password = "openGauss@123" database-names = ["opengauss_cdc"] schema-names = ["inventory"] 
table-names = ["opengauss_cdc.inventory.full_types_no_primary_key"] url = "jdbc:postgresql://opengauss_cdc_e2e:5432/opengauss_cdc?loggerLevel=OFF" decoding.plugin.name = "pgoutput" exactly_once = true table-names-config = [ { table = "opengauss_cdc.inventory.full_types_no_primary_key" primaryKeys = ["id"] } ] } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Oracle-CDC.md ================================================ import ChangeLog from '../changelog/connector-cdc-oracle.md'; # Oracle CDC > Oracle CDC source connector ## Support Those Engines > SeaTunnel Zeta
    > Flink
    ## Key features - [ ] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description The Oracle CDC connector allows for reading snapshot data and incremental data from Oracle database. This document describes how to set up the Oracle CDC connector to run SQL queries against Oracle databases. ## Notice The Debezium Oracle connector does not rely on the continuous mining option. The connector is responsible for detecting log switches and adjusting the logs that are mined automatically, which the continuous mining option did for you automatically. So, you can not set this property named `log.mining.continuous.mine` in the debezium. ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------|----------------------------------------------------------|--------------------------|----------------------------------------|--------------------------------------------------------------------| | Oracle | Different dependency version has different driver class. | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@datasource01:1523:xe | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | ## Database Dependency ### Install Jdbc Driver #### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. > 2. To support the i18n character set, copy the `orai18n.jar` to the `$SEATUNNEL_HOME/plugins/` directory. #### For SeaTunnel Zeta Engine > 1. 
You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) has been placed in directory `${SEATUNNEL_HOME}/lib/`. > 2. To support the i18n character set, copy the `orai18n.jar` to the `$SEATUNNEL_HOME/lib/` directory. ### Enable Oracle Logminer > To enable Oracle CDC (Change Data Capture) using Logminer in Seatunnel, which is a built-in tool provided by Oracle, follow the steps below: #### Enabling Logminer without CDB (Container Database) mode. 1. The operating system creates an empty file directory to store Oracle archived logs and user tablespaces. ```shell mkdir -p /opt/oracle/oradata/recovery_area mkdir -p /opt/oracle/oradata/ORCLCDB chown -R oracle /opt/oracle/*** ``` 2. Login as admin and enable Oracle archived logs. ```sql sqlplus /nolog; connect sys as sysdba; alter system set db_recovery_file_dest_size = 10G; alter system set db_recovery_file_dest = '/opt/oracle/oradata/recovery_area' scope=spfile; shutdown immediate; startup mount; alter database archivelog; alter database open; ALTER DATABASE ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS; archive log list; ``` 3. Login as admin and create an account called logminer_user with the password "oracle", and grant it privileges to read tables and logs. 
```sql CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/ORCLCDB/logminer_tbs.dbf' SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED; CREATE USER logminer_user IDENTIFIED BY oracle DEFAULT TABLESPACE logminer_tbs QUOTA UNLIMITED ON logminer_tbs; GRANT CREATE SESSION TO logminer_user; GRANT SELECT ON V_$DATABASE to logminer_user; GRANT SELECT ON V_$LOG TO logminer_user; GRANT SELECT ON V_$LOGFILE TO logminer_user; GRANT SELECT ON V_$LOGMNR_LOGS TO logminer_user; GRANT SELECT ON V_$LOGMNR_CONTENTS TO logminer_user; GRANT SELECT ON V_$ARCHIVED_LOG TO logminer_user; GRANT SELECT ON V_$ARCHIVE_DEST_STATUS TO logminer_user; GRANT EXECUTE ON DBMS_LOGMNR TO logminer_user; GRANT EXECUTE ON DBMS_LOGMNR_D TO logminer_user; GRANT SELECT ANY TRANSACTION TO logminer_user; GRANT SELECT ON V_$TRANSACTION TO logminer_user; ``` ##### Oracle 11g is not supported ```sql GRANT LOGMINING TO logminer_user; ``` ##### Grant privileges only to the tables that need to be collected ```sql GRANT SELECT ANY TABLE TO logminer_user; GRANT ANALYZE ANY TO logminer_user; ``` #### To enable Logminer in Oracle with CDB (Container Database) + PDB (Pluggable Database) mode 1. The operating system creates an empty file directory to store Oracle archived logs and user tablespaces. ```shell mkdir -p /opt/oracle/oradata/recovery_area mkdir -p /opt/oracle/oradata/ORCLCDB mkdir -p /opt/oracle/oradata/ORCLCDB/ORCLPDB1 chown -R oracle /opt/oracle/*** ``` 2. Login as admin and enable logging ```sql sqlplus /nolog connect sys as sysdba; # Password: oracle alter system set db_recovery_file_dest_size = 10G; alter system set db_recovery_file_dest = '/opt/oracle/oradata/recovery_area' scope=spfile; shutdown immediate startup mount alter database archivelog; alter database open; archive log list; ``` 3. Executing in CDB ```sql ALTER TABLE TEST.* ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS; ALTER TABLE TEST.T2 ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS; ``` 4. 
Creating debezium account > Operating in CDB ```sql sqlplus sys/top_secret@//localhost:1521/ORCLCDB as sysdba CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/ORCLCDB/logminer_tbs.dbf' SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED; exit; ``` > Operating in PDB ```sql sqlplus sys/top_secret@//localhost:1521/ORCLPDB1 as sysdba CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/ORCLCDB/ORCLPDB1/logminer_tbs.dbf' SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED; exit; ``` 5. Operating in CDB ```sql sqlplus sys/top_secret@//localhost:1521/ORCLCDB as sysdba CREATE USER c##dbzuser IDENTIFIED BY dbz DEFAULT TABLESPACE logminer_tbs QUOTA UNLIMITED ON logminer_tbs CONTAINER=ALL; GRANT CREATE SESSION TO c##dbzuser CONTAINER=ALL; GRANT SET CONTAINER TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$DATABASE to c##dbzuser CONTAINER=ALL; GRANT FLASHBACK ANY TABLE TO c##dbzuser CONTAINER=ALL; GRANT SELECT ANY TABLE TO c##dbzuser CONTAINER=ALL; GRANT SELECT_CATALOG_ROLE TO c##dbzuser CONTAINER=ALL; GRANT EXECUTE_CATALOG_ROLE TO c##dbzuser CONTAINER=ALL; GRANT SELECT ANY TRANSACTION TO c##dbzuser CONTAINER=ALL; GRANT LOGMINING TO c##dbzuser CONTAINER=ALL; GRANT CREATE TABLE TO c##dbzuser CONTAINER=ALL; GRANT LOCK ANY TABLE TO c##dbzuser CONTAINER=ALL; GRANT CREATE SEQUENCE TO c##dbzuser CONTAINER=ALL; GRANT EXECUTE ON DBMS_LOGMNR TO c##dbzuser CONTAINER=ALL; GRANT EXECUTE ON DBMS_LOGMNR_D TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$LOG TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$LOG_HISTORY TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$LOGMNR_LOGS TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$LOGMNR_CONTENTS TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$LOGMNR_PARAMETERS TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$LOGFILE TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$ARCHIVED_LOG TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$ARCHIVE_DEST_STATUS TO c##dbzuser CONTAINER=ALL; GRANT analyze any TO c##dbzuser CONTAINER=ALL; exit; ``` ## Data 
Type Mapping | Oracle Data type | SeaTunnel Data type | |--------------------------------------------------------------------------------------|---------------------| | INTEGER | INT | | FLOAT | DECIMAL(38, 18) | | NUMBER(precision <= 9, scale == 0) | INT | | NUMBER(9 < precision <= 18, scale == 0) | BIGINT | | NUMBER(18 < precision, scale == 0) | DECIMAL(38, 0) | | NUMBER(precision == 0, scale == 0) | DECIMAL(38, 18) | | NUMBER(scale != 0) | DECIMAL(38, 18) | | BINARY_DOUBLE | DOUBLE | | BINARY_FLOAT
    REAL | FLOAT | | CHAR
    NCHAR
    NVARCHAR2
    VARCHAR2
    LONG
    ROWID
    NCLOB
    CLOB
    | STRING | | DATE | DATE | | TIMESTAMP
    TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | | BLOB
    RAW
    LONG RAW
    BFILE | BYTES | ## Source Options | Name | Type | Required | Default | Description | |-------------------------------------------|----------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: `idbc:oracle:thin:datasource01:1523:xe`. | | username | String | Yes | - | Name of the database to use when connecting to the database server. | | password | String | Yes | - | Password to use when connecting to the database server. | | database-names | List | No | - | Database name of the database to monitor. | | schema-names | List | No | - | Schema name of the database to monitor. | | table-names | List | Yes | - | Table name of the database to monitor. The table name needs to include the database name, for example: `database_name.table_name` | | table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys": ["key1"],"snapshotSplitColumn": "key2"}] | | startup.mode | Enum | No | INITIAL | Optional startup mode for Oracle CDC consumer, valid enumerations are `initial`, `earliest`, `latest`, `timestamp` and `specific`.
    `initial`: Synchronize historical data at startup, and then synchronize incremental data.
    `earliest`: Startup from the earliest offset possible.
    `latest`: Startup from the latest offset.
    `specific`: Startup from user-supplied specific offsets. | | startup.timestamp | Long | No | - | Start from the specified timestamp (milliseconds since Unix epoch). This timestamp is converted with `server-time-zone` when `startup.mode = timestamp`. **Note, This option is required when the `startup.mode` option used `timestamp`.** | | startup.specific-offset.file | String | No | - | Start from the specified binlog file name. **Note, This option is required when the `startup.mode` option used `specific`.** | | startup.specific-offset.pos | Long | No | - | Start from the specified binlog file position. **Note, This option is required when the `startup.mode` option used `specific`.** | | stop.mode | Enum | No | NEVER | Optional stop mode for Oracle CDC consumer, valid enumerations are `never`, `latest` or `specific`.
    `never`: Real-time jobs don't stop the source.
    `latest`: Stop from the latest offset.
    `specific`: Stop from user-supplied specific offset. | | stop.specific-offset.file | String | No | - | Stop from the specified binlog file name. **Note, This option is required when the `stop.mode` option used `specific`.** | | stop.specific-offset.pos | Long | No | - | Stop from the specified binlog file position. **Note, This option is required when the `stop.mode` option used `specific`.** | | snapshot.split.size | Integer | No | 8096 | The split size (number of rows) of table snapshot, captured tables are split into multiple splits when read the snapshot of table. | | snapshot.fetch.size | Integer | No | 1024 | The maximum fetch size for per poll when read table snapshot. | | server-time-zone | String | No | UTC | The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone. This value is also used when converting `startup.timestamp` to SCN. Set it explicitly when database time zone and JVM time zone are different. | | connect.timeout.ms | Duration | No | 30000 | The maximum time that the connector should wait after trying to connect to the database server before timing out. | | connect.max-retries | Integer | No | 3 | The max retry times that the connector should retry to build database server connection. | | connection.pool.size | Integer | No | 20 | The jdbc connection pool size. | | chunk-key.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. 
Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. | | chunk-key.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | | sample-sharding.threshold | Integer | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | | inverse-sampling.rate | Integer | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. 
The default value is 1000. | | exactly_once | Boolean | No | false | Enable exactly once semantic. | | use_select_count | Boolean | No | false | Use `select count` to get the table row count, rather than other methods, in the full stage. Use this when running `select count` directly is faster than updating the statistics with an analyze table SQL statement. | | skip_analyze | Boolean | No | false | Skip the table analysis used to get the table row count in the full stage. Use this when you already schedule analyze table SQL to update the related table statistics periodically, or when your table data does not change frequently. | | format | Enum | No | DEFAULT | Optional output format for Oracle CDC, valid enumerations are `DEFAULT`, `COMPATIBLE_DEBEZIUM_JSON`. | | schema-changes.enabled | Boolean | No | false | Schema evolution is disabled by default. Now we only support `add column`, `drop column`, `rename column` and `modify column`. | | debezium | Config | No | - | Pass-through [Debezium's properties](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/oracle.adoc#connector-properties) to Debezium Embedded Engine which is used to capture data changes from Oracle server. | | common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | | decimal_type_narrowing | Boolean | No | true | Decimal type narrowing. If true, the decimal type will be narrowed to the int or long type when this can be done without loss of precision. Only supported for Oracle for now. Please refer to `decimal_type_narrowing` below | ### decimal_type_narrowing Decimal type narrowing. If true, the decimal type will be narrowed to the int or long type when this can be done without loss of precision. Only supported for Oracle for now. 
eg: decimal_type_narrowing = true | Oracle | SeaTunnel | |---------------|-----------| | NUMBER(1, 0) | Boolean | | NUMBER(6, 0) | INT | | NUMBER(10, 0) | BIGINT | decimal_type_narrowing = false | Oracle | SeaTunnel | |---------------|----------------| | NUMBER(1, 0) | Decimal(1, 0) | | NUMBER(6, 0) | Decimal(6, 0) | | NUMBER(10, 0) | Decimal(10, 0) | ## Task Example ### Simple > Support multi-table reading ```conf source { # This is a example source plugin **only for test and demonstrate the feature source plugin** Oracle-CDC { plugin_output = "customers" username = "system" password = "oracle" database-names = ["XE"] schema-names = ["DEBEZIUM"] table-names = ["XE.DEBEZIUM.FULL_TYPES", "XE.DEBEZIUM.FULL_TYPES2"] url = "jdbc:oracle:thin:@oracle-host:1521:xe" source.reader.close.timeout = 120000 } } ``` > Use the select count(*) instead of analysis table for count table rows in full stage ```conf source { # This is a example source plugin **only for test and demonstrate the feature source plugin** Oracle-CDC { plugin_output = "customers" use_select_count = true username = "system" password = "oracle" database-names = ["XE"] schema-names = ["DEBEZIUM"] table-names = ["XE.DEBEZIUM.FULL_TYPES"] url = "jdbc:oracle:thin:system/oracle@oracle-host:1521:xe" source.reader.close.timeout = 120000 } } ``` > Use the select NUM_ROWS from all_tables for the table rows but skip the analyze table. 
```conf source { # This is a example source plugin **only for test and demonstrate the feature source plugin** Oracle-CDC { plugin_output = "customers" skip_analyze = true username = "system" password = "oracle" database-names = ["XE"] schema-names = ["DEBEZIUM"] table-names = ["XE.DEBEZIUM.FULL_TYPES"] url = "jdbc:oracle:thin:system/oracle@oracle-host:1521:xe" source.reader.close.timeout = 120000 } } ``` ### Support custom primary key for table ```conf source { Oracle-CDC { plugin_output = "customers" url = "jdbc:oracle:thin:system/oracle@oracle-host:1521:xe" source.reader.close.timeout = 120000 username = "system" password = "oracle" database-names = ["XE"] schema-names = ["DEBEZIUM"] table-names = ["XE.DEBEZIUM.FULL_TYPES"] table-names-config = [ { table = "XE.DEBEZIUM.FULL_TYPES" primaryKeys = ["ID"] } ] } } ``` ### Support debezium-compatible format send to kafka > Must be used with kafka connector sink, see [compatible debezium format](../formats/cdc-compatible-debezium-json.md) for details ## Changelog ================================================ FILE: docs/en/connectors/source/Oracle.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Oracle > JDBC Oracle Source Connector ## Description Read external data source data through JDBC. ## Support Those Engines > Spark
    > Flink
    > SeaTunnel Zeta
    ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) > supports query SQL and can achieve projection effect. ## Supported DataSource Info | Datasource | Supported Versions | Driver | Url | Maven | |------------|----------------------------------------------------------|--------------------------|----------------------------------------|--------------------------------------------------------------------| | Oracle | Different dependency version has different driver class. | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@datasource01:1523:xe | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | ## Database Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. > 2. To support the i18n character set, copy the `orai18n.jar` to the `$SEATUNNEL_HOME/plugins/` directory. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) has been placed in directory `${SEATUNNEL_HOME}/lib/`. > 2. To support the i18n character set, copy the `orai18n.jar` to the `$SEATUNNEL_HOME/lib/` directory. 
## Data Type Mapping | Oracle Data Type | SeaTunnel Data Type | |----------------------------------------------------------------------------------------------------------|---------------------| | INTEGER | DECIMAL(38,0) | | FLOAT | DECIMAL(38, 18) | | NUMBER(precision <= 9, scale == 0) | INT | | NUMBER(9 < precision <= 18, scale == 0) | BIGINT | | NUMBER(18 < precision, scale == 0) | DECIMAL(38, 0) | | NUMBER(scale != 0) | DECIMAL(38, 18) | | BINARY_DOUBLE | DOUBLE | | BINARY_FLOAT
    REAL | FLOAT | | CHAR
    NCHAR
    VARCHAR
    NVARCHAR2
    VARCHAR2
    LONG
    ROWID
    NCLOB
    CLOB
    XML
    | STRING | | DATE | TIMESTAMP | | TIMESTAMP
    TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | | BLOB
    RAW
    LONG RAW
    BFILE | BYTES | ## Source Options | Name | Type | Required | Default | Description | |------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oracle:thin:@datasource01:1523:xe | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
    if you use Oracle the value is `oracle.jdbc.OracleDriver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | Yes | - | Query statement | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | | partition_column | String | No | - | The column name used to partition the data for parallel reading. Only a numeric type column (numeric type primary key) is supported, and only one column can be configured. | | partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan. If not set, SeaTunnel will query the database to get the min value. | | partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan. If not set, SeaTunnel will query the database to get the max value. | | partition_num | Int | No | job parallelism | The number of partitions. Only positive integers are supported. The default value is the job parallelism. | | fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure
    the row fetch size used in the query to improve performance by
    reducing the number of database hits required to satisfy the selection criteria.
    Zero means use the JDBC default value. | | properties | Map | No | - | Additional connection configuration parameters. When properties and URL have the same parameters, the priority is determined by the
    specific implementation of the driver. For example, in Oracle, properties take precedence over the URL. | | use_regex | Boolean | No | false | Control regular expression matching for table_path. When set to `true`, the table_path will be treated as a regular expression pattern. When set to `false` or not specified, the table_path will be treated as an exact path (no regex matching). | | table_path | String | No | - | The path to the full path of table, you can use this configuration instead of `query`.
    example:
    "test_schema.table1" | | table_list | Array | No | - | The list of tables to be read, you can use this configuration instead of `table_path`. Example: ```[{ table_path = "testdb.table1"}, {table_path = "testdb.table2", query = "select id, name from testdb.table2"}]``` | | where_condition | String | No | - | Common row filter conditions for all tables/queries, must start with `where`. For example: `where id > 100` | | split.size | Int | No | 8096 | The split size (number of rows) of the table; captured tables are split into multiple splits when the table is read. | | split.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `split.sample-sharding.threshold`. The default value is 0.05. | | split.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `split.sample-sharding.threshold`. The default value is 100.0. 
| | split.sample-sharding.threshold | Int | No | 10000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `split.even-distribution.factor.upper-bound` and `split.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | | split.inverse-sampling.rate | Int | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | | decimal_type_narrowing | Boolean | No | true | Decimal type narrowing. If true, the decimal type will be narrowed to the int or long type when this can be done without loss of precision. Only supported for Oracle for now. Please refer to `decimal_type_narrowing` below | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ### decimal_type_narrowing Decimal type narrowing. If true, the decimal type will be narrowed to the int or long type when this can be done without loss of precision. Only supported for Oracle for now. 
eg: decimal_type_narrowing = true | Oracle | SeaTunnel | |---------------|-----------| | NUMBER(1, 0) | Boolean | | NUMBER(6, 0) | INT | | NUMBER(10, 0) | BIGINT | decimal_type_narrowing = false | Oracle | SeaTunnel | |---------------|----------------| | NUMBER(1, 0) | Decimal(1, 0) | | NUMBER(6, 0) | Decimal(6, 0) | | NUMBER(10, 0) | Decimal(10, 0) | ## Parallel Reader The JDBC Source connector supports parallel reading of data from tables. SeaTunnel will use certain rules to split the data in the table, which will be handed over to readers for reading. The number of readers is determined by the `parallelism` option. **Split Key Rules:** 1. If `partition_column` is not null, it will be used to calculate the splits. The column type must be one of the **Supported split data type**. 2. If `partition_column` is null, SeaTunnel will read the schema from the table and get the Primary Key and Unique Index. If there is more than one column in the Primary Key and Unique Index, the first column whose type is in the **supported split data type** will be used to split the data. For example, if the table has Primary Key(nn guid, name varchar), `guid` is not in the **supported split data type**, so the column `name` will be used to split the data. **Supported split data type:** * String * Number(int, bigint, decimal, ...) * Date ### Options Related To Split #### split.size How many rows in one split; captured tables are split into multiple splits when the table is read. #### split.even-distribution.factor.lower-bound > Not recommended for use The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. 
Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `split.sample-sharding.threshold`. The default value is 0.05. #### split.even-distribution.factor.upper-bound > Not recommended for use The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `split.sample-sharding.threshold`. The default value is 100.0. #### split.sample-sharding.threshold This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `split.even-distribution.factor.upper-bound` and `split.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. #### split.inverse-sampling.rate The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. 
#### partition_column [string] The column name used to split the data. #### partition_upper_bound [BigDecimal] The partition_column max value for scan. If not set, SeaTunnel will query the database to get the max value. #### partition_lower_bound [BigDecimal] The partition_column min value for scan. If not set, SeaTunnel will query the database to get the min value. #### partition_num [int] > Not recommended for use. The correct approach is to control the number of splits through `split.size`. How many splits the data should be divided into. Only positive integers are supported. The default value is the job parallelism. ## tips > If the table can not be split (for example, the table has no Primary Key or Unique Index, and `partition_column` is not set), it will run in single concurrency. > > Use `table_path` to replace `query` for single table reading. If you need to read multiple tables, use `table_list`. ## Task Example ### Simple > This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. 
``` # Defining the runtime environment env { parallelism = 4 job.mode = "BATCH" } source{ Jdbc { url = "jdbc:oracle:thin:@datasource01:1523:xe" driver = "oracle.jdbc.OracleDriver" username = "root" password = "123456" query = "SELECT * FROM TEST_TABLE" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2/sql } sink { Console {} } ``` ### parallel by partition_column > Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table ``` env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:oracle:thin:@datasource01:1523:xe" driver = "oracle.jdbc.OracleDriver" connection_check_timeout_sec = 100 username = "root" password = "123456" # Define query logic as required query = "SELECT * FROM TEST_TABLE" # Parallel sharding reads fields partition_column = "ID" # Number of fragments partition_num = 10 properties { database.oracle.jdbc.timezoneAsRegion = "false" } } } sink { Console {} } ``` ### parallel by Primary Key or Unique Index > Configuring `table_path` will turn on auto split, you can configure `split.*` to adjust the split strategy ``` env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:oracle:thin:@datasource01:1523:xe" driver = "oracle.jdbc.OracleDriver" connection_check_timeout_sec = 100 username = "root" password = "123456" table_path = "DA.SCHEMA1.TABLE1" query = "select * from SCHEMA1.TABLE1" split.size = 10000 } } sink { Console {} } ``` ### Parallel Boundary > It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured ``` source { Jdbc { url = "jdbc:oracle:thin:@datasource01:1523:xe" driver = "oracle.jdbc.OracleDriver" connection_check_timeout_sec = 100 username = "root" 
password = "123456" # Define query logic as required query = "SELECT * FROM TEST_TABLE" partition_column = "ID" # Read start boundary partition_lower_bound = 1 # Read end boundary partition_upper_bound = 500 partition_num = 10 } } ``` ### Multiple table read ***Configuring `table_list` will turn on auto split, you can configure `split.*` to adjust the split strategy*** ```hocon env { job.mode = "BATCH" parallelism = 4 } source { Jdbc { url = "jdbc:oracle:thin:@datasource01:1523:xe" driver = "oracle.jdbc.OracleDriver" connection_check_timeout_sec = 100 username = "root" password = "123456" "table_list"=[ { "table_path"="XE.TEST.USER_INFO" }, { "table_path"="XE.TEST.YOURTABLENAME" } ] #where_condition= "where id > 100" split.size = 10000 #split.even-distribution.factor.upper-bound = 100 #split.even-distribution.factor.lower-bound = 0.05 #split.sample-sharding.threshold = 1000 #split.inverse-sampling.rate = 1000 } } sink { Console {} } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/OssFile.md ================================================ import ChangeLog from '../changelog/connector-file-oss.md'; # OssFile > Oss file source connector ## Support Those Engines > Spark
    > Flink
    > SeaTunnel Zeta
    ## Usage Dependency ### For Spark/Flink Engine 1. You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. 2. You must ensure `hadoop-aliyun-xx.jar`, `aliyun-sdk-oss-xx.jar` and `jdom-xx.jar` in `${SEATUNNEL_HOME}/plugins/` dir and the version of `hadoop-aliyun` jar need equals your hadoop version which used in spark/flink and `aliyun-sdk-oss-xx.jar` and `jdom-xx.jar` version needs to be the version corresponding to the `hadoop-aliyun` version. Eg: `hadoop-aliyun-3.1.4.jar` dependency `aliyun-sdk-oss-3.4.1.jar` and `jdom-1.1.jar`. ### For SeaTunnel Zeta Engine 1. You must ensure `seatunnel-hadoop3-3.1.4-uber.jar`, `aliyun-sdk-oss-3.4.1.jar`, `hadoop-aliyun-3.1.4.jar` and `jdom-1.1.jar` in `${SEATUNNEL_HOME}/lib/` dir. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. 
- [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## Data Type Mapping Data type mapping is related to the type of file being read, We supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `markdown` ### JSON File Type If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. For example: upstream data is the following: ```json {"code": 200, "data": "get success", "success": true} ``` You can also save multiple pieces of data in one file and split them by newline: ```json lines {"code": 200, "data": "get success", "success": true} {"code": 300, "data": "get failed", "success": false} ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | ### Text Or CSV File Type If you set the `file_format_type` to `text`,`excel`,`csv`,`xml`. Then it's required to set the `schema` field to tell connector how to parse data to the row. 
If you set the `schema` field, you should also set the option `field_delimiter`, except the `file_format_type` is `csv`, `xml`, `excel` you can set schema and delimiter as the following: ```hocon field_delimiter = "#" schema { fields { name = string age = int gender = string } } ``` connector will generate data as the following: | name | age | gender | |---------------|-----|--------| | tyrantlucifer | 26 | male | ### Orc File Type If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. | Orc Data type | SeaTunnel Data type | |----------------------------------|----------------------------------------------------------------| | BOOLEAN | BOOLEAN | | INT | INT | | BYTE | BYTE | | SHORT | SHORT | | LONG | LONG | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | BINARY | BINARY | | STRING
    VARCHAR
    CHAR
    | STRING | | DATE | LOCAL_DATE_TYPE | | TIMESTAMP | LOCAL_DATE_TIME_TYPE | | DECIMAL | DECIMAL | | LIST(STRING) | STRING_ARRAY_TYPE | | LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE | | LIST(TINYINT) | BYTE_ARRAY_TYPE | | LIST(SMALLINT) | SHORT_ARRAY_TYPE | | LIST(INT) | INT_ARRAY_TYPE | | LIST(BIGINT) | LONG_ARRAY_TYPE | | LIST(FLOAT) | FLOAT_ARRAY_TYPE | | LIST(DOUBLE) | DOUBLE_ARRAY_TYPE | | Map | MapType, This type of K and V will transform to SeaTunnel type | | STRUCT | SeaTunnelRowType | ### Parquet File Type If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. | Parquet Data type | SeaTunnel Data type | |----------------------|----------------------------------------------------------------| | INT_8 | BYTE | | INT_16 | SHORT | | DATE | DATE | | TIMESTAMP_MILLIS | TIMESTAMP | | INT64 | LONG | | INT96 | TIMESTAMP | | BINARY | BYTES | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | BOOLEAN | BOOLEAN | | FIXED_LEN_BYTE_ARRAY | TIMESTAMP
    DECIMAL | | DECIMAL | DECIMAL | | LIST(STRING) | STRING_ARRAY_TYPE | | LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE | | LIST(TINYINT) | BYTE_ARRAY_TYPE | | LIST(SMALLINT) | SHORT_ARRAY_TYPE | | LIST(INT) | INT_ARRAY_TYPE | | LIST(BIGINT) | LONG_ARRAY_TYPE | | LIST(FLOAT) | FLOAT_ARRAY_TYPE | | LIST(DOUBLE) | DOUBLE_ARRAY_TYPE | | Map | MapType, This type of K and V will transform to SeaTunnel type | | STRUCT | SeaTunnelRowType | ## Options | name | type | required | default value | Description | |----------------------------|---------|----------|---------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | path | string | yes | - | The Oss path that needs to be read can have sub paths, but the sub paths need to meet certain format requirements. Specific requirements can be referred to "parse_partition_from_path" option | | file_format_type | string | yes | - | File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` | | bucket | string | yes | - | The bucket address of oss file system, for example: `oss://seatunnel-test`. | | endpoint | string | yes | - | fs oss endpoint | | read_columns | list | no | - | The read column list of the data source, user can use it to implement field projection. The file type supported column projection as the following shown: `text` `csv` `parquet` `orc` `json` `excel` `xml` . If the user wants to use this feature when reading `text` `json` `csv` files, the "schema" option must be configured. | | access_key | string | no | - | | | access_secret | string | no | - | | | delimiter | string | no | \001 | Field delimiter, used to tell connector how to slice and dice fields when reading text files. 
Default `\001`, the same as hive's default delimiter. | | row_delimiter | string | no | \n | Row delimiter, used to tell connector how to slice and dice rows when reading text files. Default `\n`. | | parse_partition_from_path | boolean | no | true | Control whether to parse the partition keys and values from the file path. For example, if you read a file from path `oss://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`, every record read from the file will have these two fields added: name="tyrantlucifer", age=26 | | date_format | string | no | yyyy-MM-dd | Date type format, used to tell connector how to convert string to date, supported as the following formats:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`. default `yyyy-MM-dd` | | datetime_format | string | no | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` | | time_format | string | no | HH:mm:ss | Time type format, used to tell connector how to convert string to time, supported as the following formats:`HH:mm:ss` `HH:mm:ss.SSS` | | filename_extension | string | no | - | Filter filename extension, which used for filtering files with specific extension. Example: `csv` `.txt` `json` `.xml`. | | skip_header_row_number | long | no | 0 | Skip the first few lines, but only for the txt and csv. For example, set like following:`skip_header_row_number = 2`. Then SeaTunnel will skip the first 2 lines from source files | | csv_use_header_line | boolean | no | false | Whether to use the header line to parse the file, only used when the file_format is `csv` and the file contains the header line that match RFC 4180 | | schema | config | no | - | The schema of upstream data. | | sheet_name | string | no | - | Read the sheet of the workbook. Only used when file_format is excel. 
| | xml_row_tag | string | no | - | Specifies the tag name of the data rows within the XML file, only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Specifies whether to process data using the tag attribute format, only used when file_format is xml. | | csv_use_header_line | boolean | no | false | Whether to use the header line to parse the file, only used when the file_format is `csv` and the file contains the header line that matches RFC 4180 | | compress_codec | string | no | none | Which compress codec the files used. | | encoding | string | no | UTF-8 | Only used when file_format_type is json,text,csv,xml. The encoding of the file to read. | | null_format | string | no | - | Only used when file_format_type is text. null_format to define which strings can be represented as null. e.g: `\N` | | binary_chunk_size | int | no | 1024 | Only used when file_format_type is binary. The chunk size (in bytes) for reading binary files. Default is 1024 bytes. Larger values may improve performance for large files but use more memory. | | binary_complete_file_mode | boolean | no | false | Only used when file_format_type is binary. Whether to read the complete file as a single chunk instead of splitting into chunks. When enabled, the entire file content will be read into memory at once. Default is false. | | file_filter_pattern | string | no | | Filter pattern, which used for filtering files. | | common-options | config | no | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. | | file_filter_modified_start | string | no | - | File modification time filter. The connector will filter some files based on the last modification start time (include start time). The default data format is `yyyy-MM-dd HH:mm:ss`. | | file_filter_modified_end | string | no | - | File modification time filter. The connector will filter some files based on the last modification end time (not include end time). The default data format is `yyyy-MM-dd HH:mm:ss`. 
| | quote_char | string | no | " | A single character that encloses CSV fields, allowing fields with commas, line breaks, or quotes to be read correctly. | | escape_char | string | no | - | A single character that allows the quote or other special characters to appear inside a CSV field without ending the field. | | metalake_type | string | no | gravitino | The type of metalake service, currently supports `gravitino`. | ### file_format_type [string] File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` If you assign file type to `markdown`, SeaTunnel can parse markdown files and extract structured data. The markdown parser extracts various elements including headings, paragraphs, lists, code blocks, tables, and more. Each element is converted to a row with the following schema: - `element_id`: Unique identifier for the element - `element_type`: Type of the element (Heading, Paragraph, ListItem, etc.) - `heading_level`: Level of heading (1-6, null for non-heading elements) - `text`: Text content of the element - `page_number`: Page number (default: 1) - `position_index`: Position index within the document - `parent_id`: ID of the parent element - `child_ids`: Comma-separated list of child element IDs Note: Markdown format only supports reading, not writing. ### compress_codec [string] The compress codec of files and the details that supported as the following shown: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: automatically recognizes the compression type, no additional settings required. ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. ### binary_chunk_size [int] Only used when file_format_type is binary. The chunk size (in bytes) for reading binary files. Default is 1024 bytes. Larger values may improve performance for large files but use more memory. 
### binary_complete_file_mode [boolean] Only used when file_format_type is binary. Whether to read the complete file as a single chunk instead of splitting into chunks. When enabled, the entire file content will be read into memory at once. Default is false. ### quote_char [string] A single character that encloses CSV fields, allowing fields with commas, line breaks, or quotes to be read correctly. ### escape_char [string] A single character that allows the quote or other special characters to appear inside a CSV field without ending the field. ### file_filter_pattern [string] Filter pattern, which used for filtering files. If you only want to filter based on file names, simply write the regular file names; If you want to filter based on the file directory at the same time, the expression needs to start with `path`. The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. There are some examples. If the `path` is `/data/seatunnel`, and the file structure example is: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv /data/seatunnel/20241012/logo.png ``` Matching Rules Example: **Example 1**: *Match all .txt files*,Regular Expression: ``` .*.txt ``` The result of this example matching is: ``` /data/seatunnel/20241001/report.txt ``` **Example 2**: *Match all file starting with abc*,Regular Expression: ``` abc.* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **Example 3**: *Match all files starting with abc in folder 20241007,And the fourth character is either h or g*, the Regular Expression: ``` /data/seatunnel/20241007/abc[h,g].* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv ``` **Example 4**: *Match third level folders starting with 202410 and files ending with 
.csv*, the Regular Expression: ``` /data/seatunnel/202410\d*/.*.csv ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### schema [config] Only need to be configured when the file_format_type are text, json, excel, xml or csv ( Or other format we can't read the schema from metadata). #### fields [Config] The schema of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). #### schema_url [string] Get the http url of metadata information through restApi, such as: `http://localhost:8090/api/metalakes/laowang_test/catalogs/221-pgsql/schemas/ykw/tables/all_type` > When using Gravitino as the metadata source, the column types from Gravitino will be automatically converted to SeaTunnel data types. For detailed type mapping information, please refer to [Gravitino Type Mapping](../../introduction/concepts/gravitino-type-mapping.md). ### metalake_type [string] The type of metalake service, currently only supports `gravitino`. When using `schema_url` to obtain metadata from Gravitino, you can specify this parameter (default is `gravitino`). For more information about Metalake, please refer to [Metalake](../../introduction/concepts/metalake.md). 
## How to Create a Oss Data Synchronization Jobs The following example demonstrates how to create a data synchronization job that reads data from Oss and prints it on the local client: ```bash # Set the basic configuration of the task to be performed env { parallelism = 1 job.mode = "BATCH" } # Create a source to connect to Oss source { OssFile { path = "/seatunnel/orc" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "orc" } } # Console printing of the read Oss data sink { Console { } } ``` ```bash # Set the basic configuration of the task to be performed env { parallelism = 1 job.mode = "BATCH" } # Create a source to connect to Oss source { OssFile { path = "/seatunnel/json" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "json" schema { fields { id = int name = string } } } } # Console printing of the read Oss data sink { Console { } } ``` ### Multiple Table No need to config schema file type, eg: `orc`. 
``` env { parallelism = 1 spark.app.name = "SeaTunnel" spark.executor.instances = 2 spark.executor.cores = 1 spark.executor.memory = "1g" spark.master = local job.mode = "BATCH" } source { OssFile { tables_configs = [ { schema = { table = "fake01" } bucket = "oss://whale-ops" access_key = "xxxxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxx" endpoint = "https://oss-accelerate.aliyuncs.com" path = "/test/seatunnel/read/orc" file_format_type = "orc" }, { schema = { table = "fake02" } bucket = "oss://whale-ops" access_key = "xxxxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxx" endpoint = "https://oss-accelerate.aliyuncs.com" path = "/test/seatunnel/read/orc" file_format_type = "orc" } ] plugin_output = "fake" } } sink { Assert { rules { table-names = ["fake01", "fake02"] } } } ``` Need config schema file type, eg: `json` ``` env { execution.parallelism = 1 spark.app.name = "SeaTunnel" spark.executor.instances = 2 spark.executor.cores = 1 spark.executor.memory = "1g" spark.master = local job.mode = "BATCH" } source { OssFile { tables_configs = [ { bucket = "oss://whale-ops" access_key = "xxxxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxx" endpoint = "https://oss-accelerate.aliyuncs.com" path = "/test/seatunnel/read/json" file_format_type = "json" schema = { table = "fake01" fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { C_MAP = "map" C_ARRAY = "array" C_STRING = string C_BOOLEAN = boolean C_TINYINT = tinyint C_SMALLINT = smallint C_INT = int C_BIGINT = bigint C_FLOAT = float C_DOUBLE = double C_BYTES = bytes C_DATE = date C_DECIMAL = "decimal(38, 18)" C_TIMESTAMP = timestamp } } } }, { bucket = "oss://whale-ops" access_key = "xxxxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxx" endpoint = 
"https://oss-accelerate.aliyuncs.com" path = "/test/seatunnel/read/json" file_format_type = "json" schema = { table = "fake02" fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { C_MAP = "map" C_ARRAY = "array" C_STRING = string C_BOOLEAN = boolean C_TINYINT = tinyint C_SMALLINT = smallint C_INT = int C_BIGINT = bigint C_FLOAT = float C_DOUBLE = double C_BYTES = bytes C_DATE = date C_DECIMAL = "decimal(38, 18)" C_TIMESTAMP = timestamp } } } } ] plugin_output = "fake" } } sink { Assert { rules { table-names = ["fake01", "fake02"] } } } ``` ### Filter File ```hocon env { parallelism = 1 job.mode = "BATCH" } source { OssFile { path = "/seatunnel/orc" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "orc" // file example abcD2024.csv file_filter_pattern = "abc[DX]*.*" // file filter by modified date between 20240101 and 20240105(not include), actually 20240104 is end date file_filter_modified_start = "2024-01-01 00:00:00" file_filter_modified_end = "2024-01-05 00:00:00" } } sink { Console { } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/OssJindoFile.md ================================================ import ChangeLog from '../changelog/connector-file-oss-jindo.md'; # OssJindoFile > OssJindo file source connector ## Support Those Engines > Spark
    > Flink
    > SeaTunnel Zeta
    ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## Description Read data from aliyun oss file system using jindo api. :::tip You need to download [jindosdk-4.6.1.tar.gz](https://jindodata-binary.oss-cn-shanghai.aliyuncs.com/release/4.6.1/jindosdk-4.6.1.tar.gz) and then unzip it, copy jindo-sdk-4.6.1.jar and jindo-core-4.6.1.jar from lib to ${SEATUNNEL_HOME}/lib. If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. We made some trade-offs in order to support more file types, so we used the HDFS protocol for internal access to OSS and this connector need some hadoop dependencies. It only supports hadoop version **2.9.X+**. 
::: ## Options | name | type | required | default value | |----------------------------|---------|----------|-----------------------------| | path | string | yes | - | | file_format_type | string | yes | - | | bucket | string | yes | - | | access_key | string | yes | - | | access_secret | string | yes | - | | endpoint | string | yes | - | | read_columns | list | no | - | | delimiter/field_delimiter | string | no | \001 for text and , for csv | | row_delimiter | string | no | \n | | parse_partition_from_path | boolean | no | true | | date_format | string | no | yyyy-MM-dd | | datetime_format | string | no | yyyy-MM-dd HH:mm:ss | | time_format | string | no | HH:mm:ss | | skip_header_row_number | long | no | 0 | | schema | config | no | - | | sheet_name | string | no | - | | xml_row_tag | string | no | - | | xml_use_attr_format | boolean | no | - | | csv_use_header_line | boolean | no | false | | file_filter_pattern | string | no | | | compress_codec | string | no | none | | archive_compress_codec | string | no | none | | encoding | string | no | UTF-8 | | null_format | string | no | - | | common-options | | no | - | | file_filter_modified_start | string | no | - | | file_filter_modified_end | string | no | - | | quote_char | string | no | " | | escape_char | string | no | - | ### path [string] The source file path. ### file_format_type [string] File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. 
For example: upstream data is the following: ```json {"code": 200, "data": "get success", "success": true} ``` You can also save multiple pieces of data in one file and split them by newline: ```json lines {"code": 200, "data": "get success", "success": true} {"code": 300, "data": "get failed", "success": false} ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. If you assign file type to `text` `csv`, you can choose to specify the schema information or not. For example, upstream data is the following: ```text tyrantlucifer#26#male ``` If you do not assign data schema connector will treat the upstream data as the following: | content | |-----------------------| | tyrantlucifer#26#male | If you assign data schema, you should also assign the option `field_delimiter` too except CSV file type you should assign schema and delimiter as the following: ```hocon field_delimiter = "#" schema { fields { name = string age = int gender = string } } ``` connector will generate data as the following: | name | age | gender | |---------------|-----|--------| | tyrantlucifer | 26 | male | If you assign file type to `binary`, SeaTunnel can synchronize files in any format, such as compressed packages, pictures, etc. In short, any files can be synchronized to the target place. Under this requirement, you need to ensure that the source and sink use `binary` format for file synchronization at the same time. You can find the specific usage in the example below. If you assign file type to `markdown`, SeaTunnel can parse markdown files and extract structured data. 
The markdown parser extracts various elements including headings, paragraphs, lists, code blocks, tables, and more. Each element is converted to a row with the following schema: - `element_id`: Unique identifier for the element - `element_type`: Type of the element (Heading, Paragraph, ListItem, etc.) - `heading_level`: Level of heading (1-6, null for non-heading elements) - `text`: Text content of the element - `page_number`: Page number (default: 1) - `position_index`: Position index within the document - `parent_id`: ID of the parent element - `child_ids`: Comma-separated list of child element IDs Note: Markdown format only supports reading, not writing. ### bucket [string] The bucket address of oss file system, for example: `oss://tyrantlucifer-image-bed` ### access_key [string] The access key of oss file system. ### access_secret [string] The access secret of oss file system. ### endpoint [string] The endpoint of oss file system. ### read_columns [list] The read column list of the data source, user can use it to implement field projection. ### delimiter/field_delimiter [string] **delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead. Only need to be configured when file_format is text. Field delimiter, used to tell connector how to slice and dice fields. 
default `\001`, the same as hive's default delimiter ### row_delimiter [string] Only need to be configured when file_format is text Row delimiter, used to tell connector how to slice and dice rows default `\n` ### parse_partition_from_path [boolean] Control whether parse the partition keys and values from file path For example if you read a file from path `oss://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` Every record data from file will be added these two fields: | name | age | |---------------|-----| | tyrantlucifer | 26 | Tips: **Do not define partition fields in schema option** ### date_format [string] Date type format, used to tell connector how to convert string to date, supported as the following formats: `yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` default `yyyy-MM-dd` ### datetime_format [string] Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats: `yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` default `yyyy-MM-dd HH:mm:ss` ### time_format [string] Time type format, used to tell connector how to convert string to time, supported as the following formats: `HH:mm:ss` `HH:mm:ss.SSS` default `HH:mm:ss` ### skip_header_row_number [long] Skip the first few lines, but only for the txt and csv. For example, set like following: `skip_header_row_number = 2` then SeaTunnel will skip the first 2 lines from source files ### schema [config] Only need to be configured when the file_format_type are text, json, excel, xml or csv ( Or other format we can't read the schema from metadata). #### fields [Config] The schema of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). ### sheet_name [string] Only need to be configured when file_format is excel. Reader the sheet of the workbook. ### file_filter_pattern [string] Filter pattern, which used for filtering files. 
If you only want to filter based on file names, simply write the regular file names; If you want to filter based on the file directory at the same time, the expression needs to start with `path`. The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. There are some examples. File Structure Example: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv /data/seatunnel/20241012/logo.png ``` Matching Rules Example: **Example 1**: *Match all .txt files*,Regular Expression: ``` .*.txt ``` The result of this example matching is: ``` /data/seatunnel/20241001/report.txt ``` **Example 2**: *Match all file starting with abc*,Regular Expression: ``` abc.* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **Example 3**: *Match all files starting with abc in folder 20241007,And the fourth character is either h or g*, the Regular Expression: ``` /data/seatunnel/20241007/abc[h,g].* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv ``` **Example 4**: *Match third level folders starting with 202410 and files ending with .csv*, the Regular Expression: ``` /data/seatunnel/202410\d*/.*.csv ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### filename_extension [string] Filter filename extension, which used for filtering files with specific extension. Example: `csv` `.txt` `json` `.xml`. ### compress_codec [string] The compress codec of files and the details that supported as the following shown: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: automatically recognizes the compression type, no additional settings required. 
### archive_compress_codec [string] The compress codec of archive files; the supported details are shown below: | archive_compress_codec | file_format | archive_compress_suffix | |------------------------|--------------------|-------------------------| | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | | GZ | txt,json,excel,xml | .gz | | NONE | all | .* | Note: gz compressed excel file needs to compress the original file or specify the file suffix, such as e2e.xls -> e2e_test.xls.gz ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. ### null_format [string] Only used when file_format_type is text. null_format to define which strings can be represented as null. e.g: `\N` ### file_filter_modified_start [string] File modification time filter. The connector will filter some files based on the last modification start time (start time included). The default date format is `yyyy-MM-dd HH:mm:ss`. ### file_filter_modified_end [string] File modification time filter. The connector will filter some files based on the last modification end time (end time excluded). The default date format is `yyyy-MM-dd HH:mm:ss`. ### quote_char [string] A single character that encloses CSV fields, allowing fields with commas, line breaks, or quotes to be read correctly. ### escape_char [string] A single character that allows the quote or other special characters to appear inside a CSV field without ending the field. ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. 
## Example ```hocon OssJindoFile { path = "/seatunnel/orc" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "orc" } ``` ```hocon OssJindoFile { path = "/seatunnel/json" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "json" schema { fields { id = int name = string } } } ``` ### Transfer Binary File ```hocon env { parallelism = 1 job.mode = "BATCH" } source { OssJindoFile { bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" path = "/seatunnel/read/binary/" file_format_type = "binary" } } sink { // you can transfer local file to s3/hdfs/oss etc. OssJindoFile { bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" path = "/seatunnel/read/binary2/" file_format_type = "binary" } } ``` ### Filter File ```hocon env { parallelism = 1 job.mode = "BATCH" } source { OssJindoFile { bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" path = "/seatunnel/read/binary/" file_format_type = "binary" // file example abcD2024.csv file_filter_pattern = "abc[DX]*.*" } } sink { Console { } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Paimon.md ================================================ import ChangeLog from '../changelog/connector-paimon.md'; # Paimon > Paimon source connector ## Description Read data from Apache Paimon. 
### Comparison between SeaTunnel and Paimon version | Seatunnel Version | Paimon Version | |-------------------|------------------| | 2.3.2 - 2.3.3 | 0.4-SNAPSHOT | | 2.3.4 | 0.6-SNAPSHOT | | 2.3.5 - 2.3.11 | 0.7.0-incubating | | 2.3.12 - 2.3.13 | 1.1.1 | ### Key Considerations for Upgrading Paimon from `0.7.0-incubating` to `1.1.1` 1. **Backup Recommendations** Although compatibility is ensured, it is strongly recommended to backup critical data, especially the metadata directory, before initiating the upgrade. 2. **Gradual Upgrade Process** - **Test Environment Validation**: First validate the upgrade process in a staging environment. - **Update JAR Files**: Replace Paimon JAR files with version 1.1.1. - **Automatic Format Upgrade**: The system will automatically detect and upgrade older file formats. 3. **Configuration Check** Review your configurations to ensure no deprecated options are in use. While most configurations remain backward-compatible, deprecated settings may require updates. 4. **Post-Upgrade Validation** Verify the following after upgrading: - **Read/Write Operations**: Ensure data ingestion and retrieval workflows function normally. - **Query Performance**: Confirm that query response times meet expectations. - **New Feature Verification**: Test all newly introduced features (e.g., time travel, enhanced compaction) to ensure proper functionality. **Note**: These steps help minimize risks and ensure a smooth transition to the stable version 1.1.1. 
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-------------------------|----------|----------------|---------------| | warehouse | String | Yes | - | | catalog_type | String | No | filesystem | | catalog_uri | String | No | - | | database | String | Yes | - | | table | String | no | - | | table_list | array | no | - | | user | String | No | - | | password | String | No | - | | hdfs_site_path | String | No | - | | query | String | No | - | | paimon.hadoop.conf | Map | No | - | | paimon.hadoop.conf-path | String | No | - | ### warehouse [string] Paimon warehouse path ### catalog_type [string] Catalog type of Paimon, support filesystem and hive ### catalog_uri [string] Catalog uri of Paimon, only needed when catalog_type is hive ### database [string] The database you want to access ### table [string] The table you want to access ### table_list [array] The list of tables to be read, you can use this configuration instead of `table` ### hdfs_site_path [string] The file path of `hdfs-site.xml` ### query [string] The filter condition of the table read. For example: `select * from st_test where id > 100`. If not specified, all rows are read. Currently, where conditions only support <, <=, >, >=, =, !=, or, and,is null, is not null, between...and, in, not in, like, and others are not supported. The Having, Group By, Order By clauses are currently unsupported, because these clauses are not supported by Paimon. 
You can also project specific columns, for example: `select id, name from st_test where id > 100`. Supports dynamic options settings: ```sql SELECT * FROM table /*+ OPTIONS('incremental-between' = 'test-tag1,test-tag2') */; ``` Note: When the field after the where condition is a string or boolean value, its value must be enclosed in single quotes, otherwise an error will be reported. `For example: name='abc' or tag='true'` The field data types currently supported by where conditions are as follows: * string * boolean * tinyint * smallint * int * bigint * float * double * date * timestamp * time ### paimon.hadoop.conf [string] Properties in hadoop conf ### paimon.hadoop.conf-path [string] The specified loading path for the 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' files ## Filesystems The Paimon connector supports reading data from multiple file systems. Currently, the supported file systems are hdfs and s3. If you use the s3 filesystem, you can configure the `fs.s3a.access-key`, `fs.s3a.secret-key`, `fs.s3a.endpoint`, `fs.s3a.path.style.access`, `fs.s3a.aws.credentials.provider` properties in the `paimon.hadoop.conf` option. Besides, the warehouse should start with `s3a://`. 
## Examples ### Simple example ```hocon source { Paimon { warehouse = "/tmp/paimon" database = "default" table = "st_test" } } ``` ### Multiple tables ```hocon source { Paimon { warehouse = "/tmp/paimon" database = "default" table_list = [ { table = "table1" query = "select * from table1 where id > 100" }, { table = "table2" query = "select * from table2 where id > 100" } ] } } ``` ### Filter example ```hocon source { Paimon { warehouse = "/tmp/paimon" database = "full_type" table = "st_test" query = "select c_boolean, c_tinyint from st_test where c_boolean= 'true' and c_tinyint > 116 and c_smallint = 15987 or c_decimal='2924137191386439303744.39292213'" } } ``` ### S3 example ```hocon env { execution.parallelism = 1 job.mode = "BATCH" } source { Paimon { warehouse = "s3a://test/" database = "seatunnel_namespace11" table = "st_test" paimon.hadoop.conf = { fs.s3a.access-key=G52pnxg67819khOZ9ezX fs.s3a.secret-key=SHJuAQqHsLrgZWikvMa3lJf5T0NfM5LMFliJh9HF fs.s3a.endpoint="http://minio4:9000" fs.s3a.path.style.access=true fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider } } } sink { Console{} } ``` ### Hadoop conf example ```hocon source { Paimon { catalog_name="seatunnel_test" warehouse="hdfs:///tmp/paimon" database="seatunnel_namespace1" table="st_test" query = "select * from st_test where pk_id is not null and pk_id < 3" paimon.hadoop.conf = { hadoop_user_name = "hdfs" fs.defaultFS = "hdfs://nameservice1" dfs.nameservices = "nameservice1" dfs.ha.namenodes.nameservice1 = "nn1,nn2" dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" dfs.client.use.datanode.hostname = "true" } } } ``` ### Hive catalog example ```hocon source { Paimon { catalog_name="seatunnel_test" catalog_type="hive" catalog_uri="thrift://hadoop04:9083" 
warehouse="hdfs:///tmp/seatunnel" database="seatunnel_test" table="st_test3" paimon.hadoop.conf = { fs.defaultFS = "hdfs://nameservice1" dfs.nameservices = "nameservice1" dfs.ha.namenodes.nameservice1 = "nn1,nn2" dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" dfs.client.use.datanode.hostname = "true" } } } ``` ## Changelog If you want to read the changelog of the Paimon table, first set the `changelog-producer` for the Paimon source table and then use the SeaTunnel stream task to read it. ### Note Currently, batch reads are always the latest snapshot read, so to read full changelog data, you need to use stream reads and start stream reads before writing data to the Paimon table, and to ensure order, the parallelism of the stream read task should be set to 1. ### Streaming read example ```hocon env { parallelism = 1 job.mode = "Streaming" } source { Paimon { warehouse = "/tmp/paimon" database = "full_type" table = "st_test" } } sink { Paimon { warehouse = "/tmp/paimon" database = "full_type" table = "st_test_sink" paimon.table.primary-keys = "c_tinyint" } } ``` ### paimon enable privilege example ```hocon source { Paimon { warehouse = "/tmp/paimon" database = "default" table = "st_test" user = "paimon" password = "******" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Persistiq.md ================================================ import ChangeLog from '../changelog/connector-http-persistiq.md'; # Persistiq > Persistiq source connector ## Description Used to read data from Persistiq. 
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [schema projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------------|---------|----------|---------------| | url | String | Yes | - | | password | String | Yes | - | | method | String | No | get | | schema | Config | No | - | | schema.fields | Config | No | - | | format | String | No | json | | params | Map | No | - | | body | String | No | - | | json_field | Config | No | - | | content_json | String | No | - | | poll_interval_millis | int | No | - | | retry | int | No | - | | retry_backoff_multiplier_ms | int | No | 100 | | retry_backoff_max_ms | int | No | 10000 | | enable_multi_lines | boolean | No | false | | common-options | config | No | - | ### url [String] http request url ### password [String] API key for login, you can get it at Persistiq website ### method [String] http request method, only supports GET, POST method ### params [Map] http params ### body [String] http body ### poll_interval_millis [int] request http api interval(millis) in stream mode ### retry [int] The max retry times if request http return to `IOException` ### retry_backoff_multiplier_ms [int] The retry-backoff times(millis) multiplier if request http failed ### retry_backoff_max_ms [int] The maximum retry-backoff times(millis) if request http failed ### format [String] the format of upstream data, now only support `json` `text`, default `json`. 
When you set format to `json`, you should also set the schema option, for example: upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | When you set format to `text`, the connector will do nothing for upstream data, for example: upstream data is the following: ```json { "code": 200, "data": "get success", "success": true } ``` connector will generate data as the following: | content | |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | ### schema [Config] #### fields [Config] The schema fields of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). ### content_json [String] This parameter extracts part of the JSON data. If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. Suppose the returned data looks something like this: 
```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can configure `content_field = "$.store.book.*"` and the result returned looks like this: ```json [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ] ``` Then you can get the desired result with a simpler schema,like ```hocon Http { url = "http://example.com/xyz" method = "GET" format = "json" content_field = "$.store.book.*" schema = { fields { category = string author = string title = string price = string } } } ``` Here is an example: - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). ### json_field [Config] This parameter helps you configure the schema,so this parameter must be used with schema. 
If your data looks something like this: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` You can get the contents of 'book' by configuring the task as follows: ```hocon source { Http { url = "http://example.com/xyz" method = "GET" format = "json" json_field = { category = "$.store.book[*].category" author = "$.store.book[*].author" title = "$.store.book[*].title" price = "$.store.book[*].price" } schema = { fields { category = string author = string title = string price = string } } } } ``` - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Example ```hocon Persistiq{ url = "https://api.persistiq.com/v1/users" password = "Your password" content_field = "$.users.*" schema = { fields { id = string name = string email = string activated = boolean default_mailbox_id = string salesforce_id = string } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Phoenix.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Phoenix > Phoenix source connector ## Description Read Phoenix data through [Jdbc connector](Jdbc.md). Support Batch mode and Streaming mode. 
The tested Phoenix version is 4.xx and 5.xx. Under the hood, the connector uses the Phoenix JDBC driver to execute the query statement and read data from HBase. There are two ways to connect to Phoenix with Java JDBC: one is to connect to zookeeper through JDBC, and the other is to connect to queryserver through the JDBC thin client. > Tips: By default, the (thin) driver jar is used. If you want to use the (thick) driver or other versions of Phoenix (thin) driver, you need to recompile the jdbc connector module ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) supports query SQL and can achieve projection effect. - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options ### driver [string] If you use the phoenix (thick) driver, the value is `org.apache.phoenix.jdbc.PhoenixDriver`; if you use the (thin) driver, the value is `org.apache.phoenix.queryserver.client.Driver` ### url [string] If you use the phoenix (thick) driver, the value is `jdbc:phoenix:localhost:2182/hbase`; if you use the (thin) driver, the value is `jdbc:phoenix:thin:url=http://localhost:8765;serialization=PROTOBUF` ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Example Use the thick client driver ``` Jdbc { driver = org.apache.phoenix.jdbc.PhoenixDriver url = "jdbc:phoenix:localhost:2182/hbase" query = "select age, name from test.source" } ``` Use the thin client driver ``` Jdbc { driver = org.apache.phoenix.queryserver.client.Driver url = "jdbc:phoenix:thin:url=http://spark_e2e_phoenix_sink:8765;serialization=PROTOBUF" query = "select age, name from test.source" } 
``` ## Changelog ================================================ FILE: docs/en/connectors/source/PostgreSQL-CDC.md ================================================ import ChangeLog from '../changelog/connector-cdc-postgres.md'; # PostgreSQL CDC > PostgreSQL CDC source connector ## Support Those Engines > SeaTunnel Zeta
    > Flink
    ## Key features - [ ] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description The Postgre CDC connector allows for reading snapshot data and incremental data from Postgre database. This document describes how to set up the Postgre CDC connector to run SQL queries against Postgre databases. ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------|------------------------------------------------------------|-----------------------|---------------------------------------|--------------------------------------------------------------------------| | PostgreSQL | Different dependency version has different driver class. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/org.postgresql/postgresql) | | PostgreSQL | If you want to manipulate the GEOMETRY/GEOGRAPHY type in PostgreSQL. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/net.postgis/postgis-jdbc) | ## Using Dependency ### Install Jdbc Driver #### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. #### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/lib/`. Please download and put PostgreSQL driver in `${SEATUNNEL_HOME}/lib/` dir. 
For example: cp postgresql-xxx.jar `$SEATUNNEL_HOME/lib/` > Here are the steps to enable CDC (Change Data Capture) in PostgreSQL: 1. Ensure the wal_level is set to logical: Modify the postgresql.conf configuration file by adding "wal_level = logical", restart the PostgreSQL server for the changes to take effect. Alternatively, you can use SQL commands to modify the configuration directly: ```sql ALTER SYSTEM SET wal_level TO 'logical'; SELECT pg_reload_conf(); ``` 2. Change the REPLICA policy of the specified table to FULL ```sql ALTER TABLE your_table_name REPLICA IDENTITY FULL; ``` ## Data Type Mapping | PostgreSQL Data type | SeaTunnel Data type | |-----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| | BOOL
    | BOOLEAN | | _BOOL
    | ARRAY<BOOLEAN> | | BYTEA
    | BYTES | | _BYTEA
    | ARRAY<TINYINT> | | INT2
    SMALLSERIAL
    INT4
    SERIAL
    | INT | | _INT2
    _INT4
    | ARRAY<INT> | | INT8
    BIGSERIAL
    | BIGINT | | _INT8
    | ARRAY<BIGINT> | | FLOAT4
    | FLOAT | | _FLOAT4
    | ARRAY<FLOAT> | | FLOAT8
    | DOUBLE | | _FLOAT8
    | ARRAY<DOUBLE> | | NUMERIC(Get the designated column's specified column size>0) | DECIMAL(Get the designated column's specified column size,Gets the number of digits in the specified column to the right of the decimal point) | | NUMERIC(Get the designated column's specified column size<0) | DECIMAL(38, 18) | | BPCHAR
    CHARACTER
    VARCHAR
    TEXT
    GEOMETRY
    GEOGRAPHY
    JSON
    JSONB | STRING | | _BPCHAR
    _CHARACTER
    _VARCHAR
    _TEXT | ARRAY<STRING> | | TIMESTAMP
    | TIMESTAMP | | TIME
    | TIME | | DATE
    | DATE | | OTHER DATA TYPES | NOT SUPPORTED YET | ## Source Options | Name | Type | Required | Default | Description | |-------------------------------------------|----------|----------|----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: `jdbc:postgresql://localhost:5432/postgres_cdc?loggerLevel=OFF`. | | username | String | Yes | - | Name of the database to use when connecting to the database server. | | password | String | Yes | - | Password to use when connecting to the database server. | | database-names | List | No | - | Database name of the database to monitor. | | table-names | List | Yes | - | Table name of the database to monitor. The table name needs to include the database name, for example: `database_name.table_name` | | table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys": ["key1"],"snapshotSplitColumn": "key2"}] | | startup.mode | Enum | No | INITIAL | Optional startup mode for PostgreSQL CDC consumer, valid enumerations are `initial`, `earliest` and `latest`.
    `initial`: Synchronize historical data at startup, and then synchronize incremental data.
    `earliest`: Startup from the earliest offset possible.
    `latest`: Startup from the latest offset. | | snapshot.split.size | Integer | No | 8096 | The split size (number of rows) of table snapshot, captured tables are split into multiple splits when read the snapshot of table. | | snapshot.fetch.size | Integer | No | 1024 | The maximum fetch size for per poll when read table snapshot. | | slot.name | String | No | - | The name of the PostgreSQL logical decoding slot that was created for streaming changes from a particular plug-in for a particular database/schema. The server uses this slot to stream events to the connector that you are configuring. Default is seatunnel. | | decoding.plugin.name | String | No | pgoutput | The name of the Postgres logical decoding plug-in installed on the server,Supported values are decoderbufs, wal2json, wal2json_rds, wal2json_streaming,wal2json_rds_streaming and pgoutput. | | server-time-zone | String | No | UTC | The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone. | | connect.timeout.ms | Duration | No | 30000 | The maximum time that the connector should wait after trying to connect to the database server before timing out. | | connect.max-retries | Integer | No | 3 | The max retry times that the connector should retry to build database server connection. | | connection.pool.size | Integer | No | 20 | The jdbc connection pool size. | | chunk-key.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. 
Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. | | chunk-key.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | | sample-sharding.threshold | Integer | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | | inverse-sampling.rate | Integer | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. 
The default value is 1000. | | exactly_once | Boolean | No | false | Enable exactly once semantic. | | format | Enum | No | DEFAULT | Optional output format for PostgreSQL CDC, valid enumerations are `DEFAULT`, `COMPATIBLE_DEBEZIUM_JSON`. | | debezium | Config | No | - | Pass-through [Debezium's properties](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/postgresql.adoc#connector-configuration-properties) to Debezium Embedded Engine which is used to capture data changes from PostgreSQL server. | | common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ## Task Example ### Simple > Support multi-table reading ``` env { # You can set engine configuration here execution.parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { Postgres-CDC { plugin_output = "customers_Postgre_cdc" username = "postgres" password = "postgres" database-names = ["postgres_cdc"] schema-names = ["inventory"] table-names = ["postgres_cdc.inventory.postgres_cdc_table_1,postgres_cdc.inventory.postgres_cdc_table_2"] url = "jdbc:postgresql://postgres_cdc_e2e:5432/postgres_cdc?loggerLevel=OFF" } } transform { } sink { jdbc { plugin_input = "customers_Postgre_cdc" url = "jdbc:postgresql://postgres_cdc_e2e:5432/postgres_cdc?loggerLevel=OFF" driver = "org.postgresql.Driver" username = "postgres" password = "postgres" generate_sink_sql = true # You need to configure both database and table database = postgres_cdc schema = "inventory" tablePrefix = "sink_" primary_keys = ["id"] } } ``` ### Support custom primary key for table ``` source { Postgres-CDC { plugin_output = "customers_mysql_cdc" username = "postgres" password = "postgres" database-names = ["postgres_cdc"] schema-names = ["inventory"] table-names = ["postgres_cdc.inventory.full_types_no_primary_key"] url 
= "jdbc:postgresql://postgres_cdc_e2e:5432/postgres_cdc?loggerLevel=OFF" decoding.plugin.name = "decoderbufs" exactly_once = false table-names-config = [ { table = "postgres_cdc.inventory.full_types_no_primary_key" primaryKeys = ["id"] } ] } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/PostgreSQL.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # PostgreSQL > JDBC PostgreSQL Source Connector ## Support Those Engines > Spark
    > Flink
    > SeaTunnel Zeta
    ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/org.postgresql/postgresql) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) > supports query SQL and can achieve projection effect. ## Description Read external data source data through JDBC. ## Supported DataSource Info | Datasource | Supported Versions | Driver | Url | Maven | |------------|------------------------------------------------------------|-----------------------|---------------------------------------|--------------------------------------------------------------------------| | PostgreSQL | Different dependency version has different driver class. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/org.postgresql/postgresql) | | PostgreSQL | If you want to manipulate the GEOMETRY type in PostgreSQL. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/net.postgis/postgis-jdbc) | ## Database Dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
    > For example PostgreSQL datasource: cp postgresql-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/
    > If you want to manipulate the GEOMETRY type in PostgreSQL, add postgresql-xxx.jar and postgis-jdbc-xxx.jar to $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping | PostgreSQL Data type | SeaTunnel Data type | |--------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| | BOOL
    | BOOLEAN | | _BOOL
    | ARRAY<BOOLEAN> | | BYTEA
    | BYTES | | _BYTEA
    | ARRAY<TINYINT> | | INT2
    SMALLSERIAL | SMALLINT | | _INT2 | ARRAY<SMALLINT> | | INT4
    SERIAL
    | INT | | _INT4
    | ARRAY<INT> | | INT8
    BIGSERIAL
    | BIGINT | | _INT8
    | ARRAY<BIGINT> | | FLOAT4
    | FLOAT | | _FLOAT4
    | ARRAY<FLOAT> | | FLOAT8
    | DOUBLE | | _FLOAT8
    | ARRAY<DOUBLE> | | NUMERIC(Get the designated column's specified column size>0) | DECIMAL(Get the designated column's specified column size,Gets the number of digits in the specified column to the right of the decimal point) | | NUMERIC(Get the designated column's specified column size<0) | DECIMAL(38, 18) | | BPCHAR
    CHARACTER
    VARCHAR
    TEXT
    GEOMETRY
    GEOGRAPHY
    JSON
    JSONB
    UUID | STRING | | _BPCHAR
    _CHARACTER
    _VARCHAR
    _TEXT | ARRAY<STRING> | | TIMESTAMP(s)
    TIMESTAMPTZ(s) | TIMESTAMP(s) | | TIME(s)
    TIMETZ(s) | TIME(s) | | DATE
    | DATE | ## Options | Name | Type | Required | Default | Description | |--------------------------------------------|------------|----------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:postgresql://localhost:5432/test | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
    if you use PostgreSQL the value is `org.postgresql.Driver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | Yes | - | Query statement | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | | partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. | | partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | | partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | | partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | | fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure
    the row fetch size used in the query to improve performance by
    reducing the number of database hits required to satisfy the selection criteria.
    Zero means use the JDBC default value. | | properties | Map | No | - | Additional connection configuration parameters. When properties and URL have the same parameters, the priority is determined by the
    specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | use_regex | Boolean | No | false | Control regular expression matching for table_path. When set to `true`, the table_path will be treated as a regular expression pattern. When set to `false` or not specified, the table_path will be treated as an exact path (no regex matching). | | table_path | String | No | - | The full path of the table; you can use this configuration instead of `query`.
    example:
    "testdb.test_schema.table1" | | table_list | Array | No | - | The list of tables to be read, you can use this configuration instead of `table_path` example: ```[{ table_path = "testdb.table1"}, {table_path = "testdb.table2", query = "select * id, name from testdb.table2"}]``` | | where_condition | String | No | - | Common row filter conditions for all tables/queries, must start with `where`. for example `where id > 100` | | split.size | Int | No | 8096 | The split size (number of rows) of table, captured tables are split into multiple splits when read of table. | | split.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | | split.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. 
| | split.sample-sharding.threshold | Int | No | 10000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | | split.inverse-sampling.rate | Int | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ## Parallel Reader The JDBC Source connector supports parallel reading of data from tables. SeaTunnel will use certain rules to split the data in the table, which will be handed over to readers for reading. The number of readers is determined by the `parallelism` option. **Split Key Rules:** 1. If `partition_column` is not null, It will be used to calculate split. The column must in **Supported split data type**. 2. If `partition_column` is null, seatunnel will read the schema from table and get the Primary Key and Unique Index. If there are more than one column in Primary Key and Unique Index, The first column which in the **supported split data type** will be used to split data. 
For example, the table have Primary Key(nn guid, name varchar), because `guid` id not in **supported split data type**, so the column `name` will be used to split data. **Supported split data type:** * String * Number(int, bigint, decimal, ...) * Date ### Options Related To Split #### split.size How many rows in one split, captured tables are split into multiple splits when read of table. #### split.even-distribution.factor.lower-bound > Not recommended for use The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. #### split.even-distribution.factor.upper-bound > Not recommended for use The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. #### split.sample-sharding.threshold This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. 
When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. #### split.inverse-sampling.rate The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. #### partition_column [string] The column name for split data. #### partition_upper_bound [BigDecimal] The partition_column max value for scan, if not set SeaTunnel will query database get max value. #### partition_lower_bound [BigDecimal] The partition_column min value for scan, if not set SeaTunnel will query database get min value. #### partition_num [int] > Not recommended for use, The correct approach is to control the number of split through `split.size` How many splits do we need to split into, only support positive integer. default value is job parallelism. ## tips > If the table can not be split(for example, table have no Primary Key or Unique Index, and `partition_column` is not set), it will run in single concurrency. > > Use `table_path` to replace `query` for single table reading. If you need to read multiple tables, use `table_list`. ## Task Example ### Simple > This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. 
``` # Defining the runtime environment env { parallelism = 4 job.mode = "BATCH" } source{ Jdbc { url = "jdbc:postgresql://localhost:5432/test" driver = "org.postgresql.Driver" user = "root" password = "test" query = "select * from source limit 16" } } transform { # please go to https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} } ``` ### parallel by partition_column > Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table ``` env { parallelism = 4 job.mode = "BATCH" } source{ jdbc{ url = "jdbc:postgresql://localhost:5432/test" driver = "org.postgresql.Driver" user = "root" password = "test" query = "select * from source" partition_column= "id" partition_num = 5 } } sink { Console {} } ``` ### parallel by Primary Key or Unique Index > Configuring `table_path` will turn on auto split, you can configure `split.*` to adjust the split strategy ``` env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:postgresql://localhost:5432/test" driver = "org.postgresql.Driver" connection_check_timeout_sec = 100 user = "root" password = "123456" table_path = "test.public.AllDataType_1" query = "select * from public.AllDataType_1" split.size = 10000 } } sink { Console {} } ``` ### Parallel Boundary > It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured ``` source{ jdbc{ url = "jdbc:postgresql://localhost:5432/test" driver = "org.postgresql.Driver" user = "root" password = "test" query = "select * from source" partition_column= "id" # The name of the table returned plugin_output = "jdbc" partition_lower_bound = 1 partition_upper_bound = 50 partition_num = 5 } } ``` ### Multiple table read ***Configuring `table_list` will turn on auto split, you can configure `split.*` to adjust the split strategy*** ```hocon env { job.mode = 
"BATCH" parallelism = 4 } source { Jdbc { url="jdbc:postgresql://datasource01:5432/demo" user="iDm82k6Q0Tq+wUprWnPsLQ==" driver="org.postgresql.Driver" password="iDm82k6Q0Tq+wUprWnPsLQ==" "table_list"=[ { "table_path"="demo.public.AllDataType_1" }, { "table_path"="demo.public.alldatatype" } ] #where_condition= "where id > 100" split.size = 10000 #split.even-distribution.factor.upper-bound = 100 #split.even-distribution.factor.lower-bound = 0.05 #split.sample-sharding.threshold = 1000 #split.inverse-sampling.rate = 1000 } } sink { Console {} } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Prometheus.md ================================================ import ChangeLog from '../changelog/connector-prometheus.md'; # Prometheus > Prometheus source connector ## Description Used to read data from Prometheus. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------------|---------|----------|-----------------| | url | String | Yes | - | | query | String | Yes | - | | query_type | String | Yes | Instant | | content_field | String | Yes | $.data.result.* | | schema.fields | Config | Yes | - | | format | String | No | json | | params | Map | Yes | - | | poll_interval_millis | int | No | - | | retry | int | No | - | | retry_backoff_multiplier_ms | int | No | 100 | | retry_backoff_max_ms | int | No | 10000 | | enable_multi_lines | boolean | No | false | | common-options | config | No | - | ### url [String] http request url ### query [String] Prometheus expression query string ### query_type [String] Instant/Range 1. Instant : The following endpoint evaluates an instant query at a single point in time 2. 
Range : The following endpoint evaluates an expression query over a range of time https://prometheus.io/docs/prometheus/latest/querying/api/ ### params [Map] http request params ### poll_interval_millis [int] request http api interval(millis) in stream mode ### retry [int] The max retry times if request http return to `IOException` ### retry_backoff_multiplier_ms [int] The retry-backoff times(millis) multiplier if request http failed ### retry_backoff_max_ms [int] The maximum retry-backoff times(millis) if request http failed ### format [String] the format of upstream data, default `json`. ### schema [Config] Fill in a fixed value ```hocon schema = { fields { metric = "map" value = double time = long } } ``` #### fields [Config] the schema fields of upstream data ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Example ### Instant ```hocon source { Prometheus { plugin_output = "http" url = "http://mockserver:1080" query = "up" query_type = "Instant" content_field = "$.data.result.*" format = "json" schema = { fields { metric = "map" value = double time = long } } } } ``` ### Range ```hocon source { Prometheus { plugin_output = "http" url = "http://mockserver:1080" query = "up" query_type = "Range" content_field = "$.data.result.*" format = "json" start = "2024-07-22T20:10:30.781Z" end = "2024-07-22T20:11:00.781Z" step = "15s" schema = { fields { metric = "map" value = double time = long } } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Pulsar.md ================================================ import ChangeLog from '../changelog/connector-pulsar.md'; # Apache Pulsar > Apache Pulsar source connector ## Description Source connector for Apache Pulsar. 
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |--------------------------|---------|----------|---------------| | topic | String | No | - | | topic-pattern | String | No | - | | topic-discovery.interval | Long | No | -1 | | subscription.name | String | Yes | - | | client.service-url | String | Yes | - | | admin.service-url | String | Yes | - | | auth.plugin-class | String | No | - | | auth.params | String | No | - | | poll.timeout | Integer | No | 100 | | poll.interval | Long | No | 50 | | poll.batch.size | Integer | No | 500 | | cursor.startup.mode | Enum | No | LATEST | | cursor.startup.timestamp | Long | No | - | | cursor.reset.mode | Enum | No | LATEST | | cursor.stop.mode | Enum | No | NEVER | | cursor.stop.timestamp | Long | No | - | | schema | config | No | - | | common-options | | no | - | | format | String | no | json | ### topic [String] Topic name(s) to read data from when the table is used as source. It also supports topic list for source by separating topic by semicolon like 'topic-1;topic-2'. **Note, only one of "topic-pattern" and "topic" can be specified for sources.** ### topic-pattern [String] The regular expression for a pattern of topic names to read from. All topics with names that match the specified regular expression will be subscribed by the consumer when the job starts running. 
**Note, only one of "topic-pattern" and "topic" can be specified for sources.** ### topic-discovery.interval [Long] The interval (in ms) for the Pulsar source to discover the new topic partitions. A non-positive value disables the topic partition discovery. **Note, This option only works if the 'topic-pattern' option is used.** ### subscription.name [String] Specify the subscription name for this consumer. This argument is required when constructing the consumer. ### client.service-url [String] Service URL provider for Pulsar service. To connect to Pulsar using client libraries, you need to specify a Pulsar protocol URL. You can assign Pulsar protocol URLs to specific clusters and use the Pulsar scheme. For example, `localhost`: `pulsar://localhost:6650,localhost:6651`. ### admin.service-url [String] The Pulsar service HTTP URL for the admin endpoint. For example, `http://my-broker.example.com:8080`, or `https://my-broker.example.com:8443` for TLS. ### auth.plugin-class [String] Name of the authentication plugin. ### auth.params [String] Parameters for the authentication plugin. For example, `key1:val1,key2:val2` ### poll.timeout [Integer] The maximum time (in ms) to wait when fetching records. A longer time increases throughput but also latency. ### poll.interval [Long] The interval time(in ms) when fetcing records. A shorter time increases throughput, but also increases CPU load. ### poll.batch.size [Integer] The maximum number of records to fetch to wait when polling. A longer time increases throughput but also latency. ### cursor.startup.mode [Enum] Startup mode for Pulsar consumer, valid values are `'EARLIEST'`, `'LATEST'`, `'SUBSCRIPTION'`, `'TIMESTAMP'`. ### cursor.startup.timestamp [Long] Start from the specified epoch timestamp (in milliseconds). **Note, This option is required when the "cursor.startup.mode" option used `'TIMESTAMP'`.** ### cursor.reset.mode [Enum] Cursor reset strategy for Pulsar consumer valid values are `'EARLIEST'`, `'LATEST'`. 
**Note, This option only works if the "cursor.startup.mode" option used `'SUBSCRIPTION'`.** ### cursor.stop.mode [String] Stop mode for Pulsar consumer, valid values are `'NEVER'`, `'LATEST'`and `'TIMESTAMP'`. **Note, When `'NEVER' `is specified, it is a real-time job, and other mode are off-line jobs.** ### cursor.stop.timestamp [Long] Stop from the specified epoch timestamp (in milliseconds). **Note, This option is required when the "cursor.stop.mode" option used `'TIMESTAMP'`.** ### schema [Config] The structure of the data, including field names and field types. reference to [Schema-Feature](../../introduction/concepts/schema-feature.md) ## format [String] Data format. The default format is json, reference [formats](../formats). ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. ## Example ```Jdbc { source { Pulsar { topic = "example" subscription.name = "seatunnel" client.service-url = "pulsar://localhost:6650" admin.service-url = "http://my-broker.example.com:8080" plugin_output = "test" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Qdrant.md ================================================ import ChangeLog from '../changelog/connector-qdrant.md'; # Qdrant > Qdrant source connector ## Description [Qdrant](https://qdrant.tech/) is a high-performance vector search engine and vector database. This connector can be used to read data from a Qdrant collection. ## Options | name | type | required | default value | |-----------------|--------|----------|---------------| | collection_name | string | yes | - | | schema | config | yes | - | | host | string | no | localhost | | port | int | no | 6334 | | api_key | string | no | - | | use_tls | int | no | false | | common-options | | no | - | ### collection_name [string] The name of the Qdrant collection to read data from. 
### schema [config] The schema of the table to read data into. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). Eg: ```hocon schema = { fields { age = int address = string some_vector = float_vector } } ``` Each entry in Qdrant is called a point. The `float_vector` type columns are read from the vectors of each point, others are read from the JSON payload associated with the point. If a column is marked as primary key, the ID of the Qdrant point is written into it. It can be of type `"string"` or `"int"`. Since Qdrant only [allows](https://qdrant.tech/documentation/concepts/points/#point-ids) positive integers and UUIDs as point IDs. If the collection was created with a single default/unnamed vector, use `default_vector` as the vector name. ```hocon schema = { fields { age = int address = string default_vector = float_vector } } ``` The ID of the point in Qdrant will be written into the column which is marked as the primary key. It can be of type `int` or `string`. ### host [string] The host name of the Qdrant instance. Defaults to "localhost". ### port [int] The gRPC port of the Qdrant instance. ### api_key [string] The API key to use for authentication if set. ### use_tls [bool] Whether to use TLS(SSL) connection. Required if using Qdrant cloud(https). ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. ## Changelog ================================================ FILE: docs/en/connectors/source/Rabbitmq.md ================================================ import ChangeLog from '../changelog/connector-rabbitmq.md'; # Rabbitmq > Rabbitmq source connector ## Description Used to read data from Rabbitmq. 
## Key features - [ ] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) :::tip The source must be non-parallel (parallelism set to 1) in order to achieve exactly-once. This limitation is mainly due to RabbitMQ’s approach to dispatching messages from a single queue to multiple consumers. ::: ## Options | name | type | required | default value | | -------------------------- | ------- | -------- | ------------- | | host | string | yes | - | | port | int | yes | - | | virtual_host | string | yes | - | | username | string | yes | - | | password | string | yes | - | | queue_name | string | yes | - | | schema | config | yes | - | | url | string | no | - | | routing_key | string | no | - | | exchange | string | no | - | | network_recovery_interval | int | no | - | | topology_recovery_enabled | boolean | no | - | | automatic_recovery_enabled | boolean | no | - | | connection_timeout | int | no | - | | requested_channel_max | int | no | - | | requested_frame_max | int | no | - | | requested_heartbeat | int | no | - | | prefetch_count | int | no | - | | delivery_timeout | long | no | - | | common-options | | no | - | | durable | boolean | no | true | | exclusive | boolean | no | false | | auto_delete | boolean | no | false | ### host [string] the default host to use for connections ### port [int] the default port to use for connections ### virtual_host [string] virtual host – the virtual host to use when connecting to the broker ### username [string] the AMQP user name to use when connecting to the broker ### password [string] the password to use when connecting to the broker ### url 
[string] convenience method for setting the fields in an AMQP URI: host, port, username, password and virtual host ### queue_name [string] the queue to publish the message to ### routing_key [string] the routing key to publish the message to ### exchange [string] the exchange to publish the message to ### schema [Config] #### fields [Config] the schema fields of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). ### network_recovery_interval [int] how long will automatic recovery wait before attempting to reconnect, in ms ### topology_recovery [string] if true, enables topology recovery ### automatic_recovery [string] if true, enables connection recovery ### connection_timeout [int] connection tcp establishment timeout in milliseconds; zero for infinite ### requested_channel_max [int] initially requested maximum channel number; zero for unlimited **Note: Note the value must be between 0 and 65535 (unsigned short in AMQP 0-9-1). ### requested_frame_max [int] the requested maximum frame size ### requested_heartbeat [int] Set the requested heartbeat timeout **Note: Note the value must be between 0 and 65535 (unsigned short in AMQP 0-9-1). ### prefetch_count [int] prefetchCount the max number of messages to receive without acknowledgement ### delivery_timeout [long] deliveryTimeout maximum wait time, in milliseconds, for the next message delivery ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ### durable - true: The queue will survive on server restart. - false: The queue will be deleted on server restart. ### exclusive - true: The queue is used only by the current connection and will be deleted when the connection closes. - false: The queue can be used by multiple connections. ### auto-delete - true: The queue will be deleted automatically when the last consumer unsubscribes. 
- false: The queue will not be automatically deleted. ## Example simple: ```hocon source { RabbitMQ { host = "rabbitmq-e2e" port = 5672 virtual_host = "/" username = "guest" password = "guest" queue_name = "test" schema = { fields { id = bigint c_map = "map" c_array = "array" } } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Redis.md ================================================ import ChangeLog from '../changelog/connector-redis.md'; # Redis > Redis source connector ## Description Used to read data from Redis. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |---------------------| ------ |--------------------------------| ------------- | | host | string | yes when mode=single | - | | port | int | no | 6379 | | keys | string | yes | - | | read_key_enabled | boolean| no | false | | key_field_name | string | yes when read_key_enabled=true | key | | batch_size | int | yes | 10 | | data_type | string | yes | - | | user | string | no | - | | auth | string | no | - | | db_num | int | no | 0 | | mode | string | no | single | | hash_key_parse_mode | string | no | all | | nodes | list | yes when mode=cluster | - | | schema | config | yes when format=json | - | | format | string | no | json | | single_field_name | string | yes when read_key_enabled=true | - | | field_delimiter | string | no | ',' | | common-options | | no | - | ### host [string] redis host ### port [int] redis port ### hash_key_parse_mode [string] hash key parse mode, support `all` 
`kv`, used to tell connector how to parse hash key. when setting it to `all`, connector will treat the value of hash key as a row and use the schema config to parse it, when setting it to `kv`, connector will treat each kv in hash key as a row and use the schema config to parse it: for example, if the value of hash key is the following shown: ```text { "001": { "name": "tyrantlucifer", "age": 26 }, "002": { "name": "Zongwen", "age": 26 } } ``` if hash_key_parse_mode is `all` and schema config as the following shown, it will generate the following data: ```hocon schema { fields { 001 { name = string age = int } 002 { name = string age = int } } } ``` | 001 | 002 | | ------------------------------- | ------------------------- | | Row(name=tyrantlucifer, age=26) | Row(name=Zongwen, age=26) | if hash_key_parse_mode is `kv` and schema config as the following shown, it will generate the following data: ```hocon schema { fields { hash_key = string name = string age = int } } ``` | hash_key | name | age | | -------- | ------------- | ---- | | 001 | tyrantlucifer | 26 | | 002 | Zongwen | 26 | each kv that in hash key it will be treated as a row and send it to upstream. **Tips: connector will use the first field information of schema config as the field name of each k that in each kv** ### keys [string] keys pattern ### read_key_enabled [boolean] This option determines whether the Redis source connector includes the Redis key in each output record when reading data. When set to `true`, both the key and its associated value are included in the record. By default (`false`), only the value is read and included. If you are using a single-value Redis data type (such as `string`, `int`, etc.) with `read_key_enabled = true`, you must also specify `single_field_name` to map the value to a schema column, and `key_field_name` to map the Redis key. 
Note: When `read_key_enabled = true`, the schema configuration must explicitly include the key field to correctly map the deserialized data. Example : ```hocon schema { fields { key = string value = string } } ``` ### key_field_name [string] Specifies the field name to store the Redis key in the output record when `read_key_enabled = true` or `data_type = hash`. - When read_key_enabled = true, the default field name will be `key`. - When data_type = hash and this option is not set, the default field name will be `hash_key`. This field is useful when the default field name conflicts with existing schema fields, or if a more descriptive name is preferred. Example : ```hocon key_field_name = custom_key hash_key_parse_mode = kv format = "json" schema = { fields { custom_key = string name = string } } ``` ### batch_size [int] Indicates the number of keys to attempt to return per iteration, default 10. **Tips: The Redis source connector supports fuzzy key matching; the user needs to ensure that all matched keys are of the same type** ### data_type [string] redis data types, support `key` `hash` `list` `set` `zset` - key > The value of each key will be sent downstream as a single row of data. > For example, the value of key is `SeaTunnel test message`, the data received downstream is `SeaTunnel test message` and only one message will be received. - hash > The hash key-value pairs will be formatted as json to be sent downstream as a single row of data. > For example, the value of hash is `name:tyrantlucifer age:26`, the data received downstream is `{"name":"tyrantlucifer", "age":"26"}` and only one message will be received. - list > Each element in the list will be sent downstream as a single row of data. > For example, the value of list is `[tyrantlucifer, CalvinKirs]`, the data received downstream are `tyrantlucifer` and `CalvinKirs` and only two messages will be received.
- set > Each element in the set will be sent downstream as a single row of data > For example, the value of set is `[tyrantlucifer, CalvinKirs]`, the data received downstream are `tyrantlucifer` and `CalvinKirs` and only two messages will be received. - zset > Each element in the sorted set will be sent downstream as a single row of data > For example, the value of sorted set is `[tyrantlucifer, CalvinKirs]`, the data received downstream are `tyrantlucifer` and `CalvinKirs` and only two messages will be received. ### user [string] redis authentication user, you need it when you connect to an encrypted cluster ### auth [string] redis authentication password, you need it when you connect to an encrypted cluster ### db_num [int] Redis database index ID. It is connected to db 0 by default ### mode [string] redis mode, `single` or `cluster`, default is `single` ### nodes [list] redis nodes information, used in cluster mode, must be in the following format: ["host1:port1", "host2:port2"] ### format [string] the format of upstream data, currently only `json` and `text` are supported, default `json`. When format is `json`, you should also assign the schema option, for example: upstream data is the following: ```json {"code": 200, "data": "get success", "success": true} ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | | ---- | ----------- | ------- | | 200 | get success | true | When format is `text`, you can choose to specify the schema information or not.
For example, upstream data is the following: ```text 200#get success#true ``` If you do not assign data schema connector will treat the upstream data as the following: | content | | -------------------------------------------------------- | | 200#get success#true | If you assign data schema, you should also assign the option `schema` and `field_delimiter` as following: ```hocon field_delimiter = "#" schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | content | | -------------------------------------------------------- | | {"code": 200, "data": "get success", "success": true} | ### field_delimiter [string] Field delimiter, used to tell connector how to slice and dice fields. Currently, only need to be configured when format is text. default is ",". ### schema [config] #### fields [config] The schema fields of redis data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). ### single_field_name [string] Specifies the field name for Redis values when `read_key_enabled = true` and the value is a single primitive (e.g., `string`, `int`). This name is used in the schema to map the value field. **Note:** This option has no effect when reading complex Redis data types such as hashes or objects that can be directly mapped to a schema. 
Example : ```hocon read_key_enabled = true key_field_name = key single_field_name = value schema { fields { key = string value = string } } ``` ### common options Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details ## Example simple: ```hocon Redis { host = localhost port = 6379 keys = "key_test*" data_type = key format = text } ``` ```hocon Redis { host = localhost port = 6379 keys = "key_test*" data_type = key format = json schema { fields { name = string age = int } } } ``` read string type keys write append to list ```hocon source { Redis { host = "redis-e2e" port = 6379 auth = "U2VhVHVubmVs" keys = "string_test*" data_type = string batch_size = 33 } } sink { Redis { host = "redis-e2e" port = 6379 auth = "U2VhVHVubmVs" key = "string_test_list" data_type = list batch_size = 33 } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Redshift.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Redshift > JDBC Redshift Source Connector ## Description Read external data source data through JDBC. ## Support those engines > Spark
    > Flink
    > Seatunnel Zeta
    ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) > supports query SQL and can achieve projection effect. ## Supported DataSource list | datasource | supported versions | driver | url | maven | |------------|----------------------------------------------------------|---------------------------------|-----------------------------------------|------------------------------------------------------------------------------------| | redshift | Different dependency version has different driver class. | com.amazon.redshift.jdbc.Driver | jdbc:redshift://localhost:5439/database | [Download](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) | ## Database dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
    > For example Redshift datasource: cp RedshiftJDBC42-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping | Redshift Data type | Seatunnel Data type | |-------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| | SMALLINT
    INT2 | SHORT | | INTEGER
    INT
    INT4 | INT | | BIGINT
    INT8
    OID | LONG | | DECIMAL
    NUMERIC | DECIMAL((Get the designated column's specified column size)+1,
    (Gets the designated column's number of digits to right of the decimal point.))) | | REAL
    FLOAT4 | FLOAT | | DOUBLE_PRECISION
    FLOAT8
    FLOAT | DOUBLE | | BOOLEAN
    BOOL | BOOLEAN | | CHAR
    CHARACTER
    NCHAR
    BPCHAR
    VARCHAR
    CHARACTER_VARYING
    NVARCHAR
    TEXT
    SUPER | STRING | | VARBYTE
    BINARY_VARYING | BYTES | | TIME
    TIME_WITH_TIME_ZONE
    TIMETZ | LOCALTIME | | TIMESTAMP
    TIMESTAMP_WITH_OUT_TIME_ZONE
    TIMESTAMPTZ | LOCALDATETIME | ## Example ### Simple > This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. ``` env { parallelism = 2 job.mode = "BATCH" } source{ Jdbc { url = "jdbc:redshift://localhost:5439/dev" driver = "com.amazon.redshift.jdbc.Driver" username = "root" password = "123456" table_path = "public.table2" # Use query to filter rows & columns query = "select id, name from public.table2 where id > 100" #split.size = 8096 #split.even-distribution.factor.upper-bound = 100 #split.even-distribution.factor.lower-bound = 0.05 #split.sample-sharding.threshold = 1000 #split.inverse-sampling.rate = 1000 } } sink { Console {} } ``` ### Multiple table read ***Configuring `table_list` will turn on auto split, you can configure `split.*` to adjust the split strategy*** ```hocon env { job.mode = "BATCH" parallelism = 2 } source { Jdbc { url = "jdbc:redshift://localhost:5439/dev" driver = "com.amazon.redshift.jdbc.Driver" username = "root" password = "123456" table_list = [ { table_path = "public.table1" }, { table_path = "public.table2" # Use query to filter rows & columns query = "select id, name from public.table2 where id > 100" } ] #split.size = 8096 #split.even-distribution.factor.upper-bound = 100 #split.even-distribution.factor.lower-bound = 0.05 #split.sample-sharding.threshold = 1000 #split.inverse-sampling.rate = 1000 } } sink { Console {} } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/RocketMQ.md ================================================ import ChangeLog from '../changelog/connector-rocketmq.md'; # RocketMQ > RocketMQ source connector ## Support Apache RocketMQ Version - 4.9.0 (Or a newer version, for reference) ## Support These Engines > Spark
    > Flink
    > SeaTunnel Zeta
    ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description Source connector for Apache RocketMQ. ## Source Options | Name | Type | Required | Default | Description | |-------------------------------------|---------|----------|----------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | topics | String | yes | - | `RocketMQ topic` name. If there are multiple `topics`, use `,` to split, for example: `"tpc1,tpc2"`. | | name.srv.addr | String | yes | - | `RocketMQ` name server cluster address. | | tags | String | no | - | `RocketMQ tag` name. If there are multiple `tags`, use `,` to split, for example: `"tag1,tag2"`. | | acl.enabled | Boolean | no | false | If true, access control is enabled, and access key and secret key need to be configured. | | access.key | String | no | | | | secret.key | String | no | | When ACL_ENABLED is true, secret key cannot be empty. | | batch.size | int | no | 100 | `RocketMQ` consumer pull batch size | | consumer.group | String | no | SeaTunnel-Consumer-Group | `RocketMQ consumer group id`, used to distinguish different consumer groups. | | commit.on.checkpoint | Boolean | no | true | If true the consumer's offset will be periodically committed in the background. | | schema | | no | - | The structure of the data, including field names and field types. 
For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). | | format | String | no | json | Data format. The default format is json. Optional text format. The default field separator is ",".If you customize the delimiter, add the "field.delimiter" option. | | field.delimiter | String | no | , | Customize the field delimiter for data format | | start.mode | String | no | CONSUME_FROM_GROUP_OFFSETS | The initial consumption pattern of consumers,there are several types: [CONSUME_FROM_LAST_OFFSET],[CONSUME_FROM_FIRST_OFFSET],[CONSUME_FROM_GROUP_OFFSETS],[CONSUME_FROM_TIMESTAMP],[CONSUME_FROM_SPECIFIC_OFFSETS] | | start.mode.offsets | | no | | | | start.mode.timestamp | Long | no | | The time required for consumption mode to be "CONSUME_FROM_TIMESTAMP". | | partition.discovery.interval.millis | long | no | -1 | The interval for dynamically discovering topics and partitions. | | ignore_parse_errors | Boolean | no | false | Optional flag to skip parse errors instead of failing. | | common-options | config | no | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. | ### start.mode.offsets The offset required for consumption mode to be "CONSUME_FROM_SPECIFIC_OFFSETS". 
for example: ```hocon start.mode.offsets = { topic1-0 = 70 topic1-1 = 10 topic1-2 = 10 } ``` ## Task Example ### Simple > Consumer reads Rocketmq data and prints it to the console ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Rocketmq { name.srv.addr = "rocketmq-e2e:9876" topics = "test_topic_json" plugin_output = "rocketmq_table" schema = { fields { id = bigint c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(2, 1)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/category/transform } sink { Console { } } ``` ### Specified format consumption simple > Consume the topic data in json format, pulling 400 records per batch, with consumption starting from the earliest offset ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Rocketmq { name.srv.addr = "localhost:9876" topics = "test_topic" plugin_output = "rocketmq_table" start.mode = "CONSUME_FROM_FIRST_OFFSET" batch.size = "400" consumer.group = "test_topic_group" format = "json" schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/category/transform } sink { Console { } } ``` ### Specified timestamp simple > This is to specify a time to consume, and I dynamically sense the existence of a new partition every
1000 milliseconds to pull the consumption ```hocon env { parallelism = 1 spark.app.name = "SeaTunnel" spark.executor.instances = 2 spark.executor.cores = 1 spark.executor.memory = "1g" spark.master = local job.mode = "BATCH" } source { Rocketmq { name.srv.addr = "localhost:9876" topics = "test_topic" partition.discovery.interval.millis = "1000" start.mode = "CONSUME_FROM_TIMESTAMP" start.mode.timestamp = "1694508382000" consumer.group = "test_topic_group" format = "json" schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/category/transform } sink { Console { } } ``` ### Specified tag example > Here you can specify a tag to consume data.
If there are multiple tags, use `,` to separate them, for example: "tag1,tag2" ```hocon env { parallelism = 1 job.mode = "BATCH" # You can set spark configuration here spark.app.name = "SeaTunnel" spark.executor.instances = 2 spark.executor.cores = 1 spark.executor.memory = "1g" spark.master = local } source { Rocketmq { plugin_output = "rocketmq_table" name.srv.addr = "localhost:9876" topics = "test_topic" format = text # The default field delimiter is "," field.delimiter = "," tags = "test_tag" schema = { fields { id = bigint c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(2, 1)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/category/transform } sink { Console { plugin_input = "rocketmq_table" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/S3File.md ================================================ import ChangeLog from '../changelog/connector-file-s3.md'; # S3File > S3 File Source Connector ## Support Those Engines > Spark
    > Flink
    > SeaTunnel Zeta
    ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] file format type - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## Description Read data from aws s3 file system. ## Supported DataSource Info | Datasource | Supported versions | |------------|--------------------| | S3 | current | ## Dependency > If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x.
    > > If you use SeaTunnel Zeta, It automatically integrated the hadoop jar when you download and install SeaTunnel Zeta. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this.
    > To use this connector you need put hadoop-aws-3.1.4.jar and aws-java-sdk-bundle-1.12.692.jar in ${SEATUNNEL_HOME}/lib dir. ## Data Type Mapping Data type mapping is related to the type of file being read, We supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` ### JSON File Type If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. For example: upstream data is the following: ```json {"code": 200, "data": "get success", "success": true} ``` You can also save multiple pieces of data in one file and split them by newline: ```json lines {"code": 200, "data": "get success", "success": true} {"code": 300, "data": "get failed", "success": false} ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | ### Text Or CSV File Type If you set the `file_format_type` to `text`,`excel`,`csv`,`xml`. Then it's required to set the `schema` field to tell connector how to parse data to the row. If you set the `schema` field, you should also set the option `field_delimiter`, except the `file_format_type` is `csv`, `xml`, `excel` you can set schema and delimiter as the following: ```hocon field_delimiter = "#" schema { fields { name = string age = int gender = string } } ``` connector will generate data as the following: | name | age | gender | |---------------|-----|--------| | tyrantlucifer | 26 | male | ### Orc File Type If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. 
| Orc Data type | SeaTunnel Data type | |----------------------------------|----------------------------------------------------------------| | BOOLEAN | BOOLEAN | | INT | INT | | BYTE | BYTE | | SHORT | SHORT | | LONG | LONG | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | BINARY | BINARY | | STRING
    VARCHAR
    CHAR
    | STRING | | DATE | LOCAL_DATE_TYPE | | TIMESTAMP | LOCAL_DATE_TIME_TYPE | | DECIMAL | DECIMAL | | LIST(STRING) | STRING_ARRAY_TYPE | | LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE | | LIST(TINYINT) | BYTE_ARRAY_TYPE | | LIST(SMALLINT) | SHORT_ARRAY_TYPE | | LIST(INT) | INT_ARRAY_TYPE | | LIST(BIGINT) | LONG_ARRAY_TYPE | | LIST(FLOAT) | FLOAT_ARRAY_TYPE | | LIST(DOUBLE) | DOUBLE_ARRAY_TYPE | | Map | MapType, This type of K and V will transform to SeaTunnel type | | STRUCT | SeaTunnelRowType | ### Parquet File Type If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. | Parquet Data type | SeaTunnel Data type | |----------------------|----------------------------------------------------------------| | INT_8 | BYTE | | INT_16 | SHORT | | DATE | DATE | | TIMESTAMP_MILLIS | TIMESTAMP | | INT64 | LONG | | INT96 | TIMESTAMP | | BINARY | BYTES | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | BOOLEAN | BOOLEAN | | FIXED_LEN_BYTE_ARRAY | TIMESTAMP
    DECIMAL | | DECIMAL | DECIMAL | | LIST(STRING) | STRING_ARRAY_TYPE | | LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE | | LIST(TINYINT) | BYTE_ARRAY_TYPE | | LIST(SMALLINT) | SHORT_ARRAY_TYPE | | LIST(INT) | INT_ARRAY_TYPE | | LIST(BIGINT) | LONG_ARRAY_TYPE | | LIST(FLOAT) | FLOAT_ARRAY_TYPE | | LIST(DOUBLE) | DOUBLE_ARRAY_TYPE | | Map | MapType, This type of K and V will transform to SeaTunnel type | | STRUCT | SeaTunnelRowType | ## Options | name | type | required | default value | Description | |---------------------------------|---------|----------|-------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | path | string | yes | - | The s3 path that needs to be read can have sub paths, but the sub paths need to meet certain format requirements. Specific requirements can be referred to "parse_partition_from_path" option | | file_format_type | string | yes | - | File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` | | bucket | string | yes | - | The bucket address of s3 file system, for example: `s3n://seatunnel-test`, if you use `s3a` protocol, this parameter should be `s3a://seatunnel-test`. | | fs.s3a.endpoint | string | yes | - | fs s3a endpoint | | fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider | The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now. 
More information about the credential provider you can see [Hadoop AWS Document](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Simple_name.2Fsecret_credentials_with_SimpleAWSCredentialsProvider.2A) | | read_columns | list | no | - | The read column list of the data source, user can use it to implement field projection. The file types that support column projection are: `text` `csv` `parquet` `orc` `json` `excel` `xml`. If the user wants to use this feature when reading `text` `json` `csv` files, the "schema" option must be configured. | | access_key | string | no | - | Only used when `fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` | | secret_key | string | no | - | Only used when `fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` | | hadoop_s3_properties | map | no | - | If you need to add other options, you can add them here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) | | delimiter/field_delimiter | string | no | \001 for text and , for csv | Field delimiter, used to tell connector how to slice and dice fields when reading text files. Default `\001`, the same as hive's default delimiter. | | row_delimiter | string | no | \n | Row delimiter, used to tell connector how to slice and dice rows when reading text files. Default `\n`. | | parse_partition_from_path | boolean | no | true | Control whether to parse the partition keys and values from the file path. For example if you read a file from path `s3n://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`, every record read from the file will have these two fields added: name="tyrantlucifer", age=26 | | date_format | string | no | yyyy-MM-dd | Date type format, used to tell connector how to convert string to date, supported as the following formats:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`.
default `yyyy-MM-dd` | | datetime_format | string | no | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` | | time_format | string | no | HH:mm:ss | Time type format, used to tell connector how to convert string to time, supported as the following formats:`HH:mm:ss` `HH:mm:ss.SSS` | | skip_header_row_number | long | no | 0 | Skip the first few lines, but only for the txt and csv. For example, set like following:`skip_header_row_number = 2`. Then SeaTunnel will skip the first 2 lines from source files | | csv_use_header_line | boolean | no | false | Whether to use the header line to parse the file, only used when the file_format is `csv` and the file contains the header line that match RFC 4180 | | schema | config | no | - | The schema of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). | | sheet_name | string | no | - | Reader the sheet of the workbook,Only used when file_format is excel. | | xml_row_tag | string | no | - | Specifies the tag name of the data rows within the XML file, only valid for XML files. | | xml_use_attr_format | boolean | no | - | Specifies whether to process data using the tag attribute format, only valid for XML files. | | csv_use_header_line | boolean | no | false | Whether to use the header line to parse the file, only used when the file_format is `csv` and the file contains the header line that match RFC 4180 | | compress_codec | string | no | none | | | archive_compress_codec | string | no | none | | | enable_file_split | boolean | no | false | Turn on logical file split to improve parallelism for huge files. Only supported for `text`/`csv`/`json`/`parquet` and non-compressed format. | | file_split_size | long | no | 134217728 | Split size in bytes when `enable_file_split=true`. 
For `text`/`csv`/`json`, the split end will be aligned to the next `row_delimiter`. For `parquet`, the split unit is RowGroup and will never break a RowGroup. | | encoding | string | no | UTF-8 | | | null_format | string | no | - | Only used when file_format_type is text. null_format to define which strings can be represented as null. e.g: `\N` | | binary_chunk_size | int | no | 1024 | Only used when file_format_type is binary. The chunk size (in bytes) for reading binary files. Default is 1024 bytes. Larger values may improve performance for large files but use more memory. | | binary_complete_file_mode | boolean | no | false | Only used when file_format_type is binary. Whether to read the complete file as a single chunk instead of splitting into chunks. When enabled, the entire file content will be read into memory at once. Default is false. | | file_filter_pattern | string | no | | Filter pattern, which used for filtering files. | | filename_extension | string | no | - | Filter filename extension, which used for filtering files with specific extension. Example: `csv` `.txt` `json` `.xml`. | | common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. | | quote_char | string | no | " | A single character that encloses CSV fields, allowing fields with commas, line breaks, or quotes to be read correctly. | | escape_char | string | no | - | A single character that allows the quote or other special characters to appear inside a CSV field without ending the field. | | metalake_type | string | no | gravitino | The type of metalake service, currently supports `gravitino`. | ### file_format_type [string] File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` If you assign file type to `markdown`, SeaTunnel can parse markdown files and extract structured data. 
The markdown parser extracts various elements including headings, paragraphs, lists, code blocks, tables, and more. Each element is converted to a row with the following schema: - `element_id`: Unique identifier for the element - `element_type`: Type of the element (Heading, Paragraph, ListItem, etc.) - `heading_level`: Level of heading (1-6, null for non-heading elements) - `text`: Text content of the element - `page_number`: Page number (default: 1) - `position_index`: Position index within the document - `parent_id`: ID of the parent element - `child_ids`: Comma-separated list of child element IDs Note: Markdown format only supports reading, not writing. ### delimiter/field_delimiter [string] **delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead. ### row_delimiter [string] Only need to be configured when file_format is text Row delimiter, used to tell connector how to slice and dice rows default `\n` ### file_filter_pattern [string] Filter pattern, which used for filtering files. If you only want to filter based on file names, simply write the regular file names; If you want to filter based on the file directory at the same time, the expression needs to start with `path`. The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. There are some examples. 
If the `path` is `/data/seatunnel`, and the file structure example is: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv /data/seatunnel/20241012/logo.png ``` Matching Rules Example: **Example 1**: *Match all .txt files*,Regular Expression: ``` .*.txt ``` The result of this example matching is: ``` /data/seatunnel/20241001/report.txt ``` **Example 2**: *Match all file starting with abc*,Regular Expression: ``` abc.* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **Example 3**: *Match all files starting with abc in folder 20241007,And the fourth character is either h or g*, the Regular Expression: ``` /data/seatunnel/20241007/abc[h,g].* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv ``` **Example 4**: *Match third level folders starting with 202410 and files ending with .csv*, the Regular Expression: ``` /data/seatunnel/202410\d*/.*.csv ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### enable_file_split [boolean] Turn on the file splitting function, the default is false. It can be selected when the file type is csv, text, json, parquet and non-compressed format. - `text`/`csv`/`json`: split by `file_split_size` and align to the next `row_delimiter` to avoid breaking records. - `parquet`: split by RowGroup (logical split), never breaks a RowGroup. **Recommendations** - Enable when reading a few large files and you want higher read parallelism. - Disable when reading many small files, or when parallelism is low (splitting adds overhead). 
**Limitations** - Not supported for compressed files (`compress_codec` != `none`) or archive files (`archive_compress_codec` != `none`) — it will fall back to non-splitting and emit a warning log. - For `text`/`csv`/`json`, actual split size may be larger than `file_split_size` because the split end is aligned to the next `row_delimiter`. - For `json`, splitting is only supported for JSON Lines (one JSON object per line). - When splitting is enabled, global record order is not guaranteed because splits can be processed in parallel. Set `parallelism=1` if strict ordering is required. ### file_split_size [long] File split size, which can be filled in when the enable_file_split parameter is true. The unit is the number of bytes. The default value is the number of bytes of 128MB, which is 134217728. **Tuning** - Start with the default (128MB). Decrease it if parallelism is under-utilized; increase it if the number of splits is too large. ### compress_codec [string] The compress codec of files and the details that supported as the following shown: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: automatically recognizes the compression type, no additional settings required. ### archive_compress_codec [string] The compress codec of archive files and the details that supported as the following shown: | archive_compress_codec | file_format | archive_compress_suffix | |------------------------|------------|-------------------------| | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | | GZ | txt,json,excel,xml | .gz | | NONE | all | .* | Note: gz compressed excel file needs to compress the original file or specify the file suffix, such as e2e.xls ->e2e_test.xls.gz ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. 
### binary_chunk_size [int] Only used when file_format_type is binary. The chunk size (in bytes) for reading binary files. Default is 1024 bytes. Larger values may improve performance for large files but use more memory. ### binary_complete_file_mode [boolean] Only used when file_format_type is binary. Whether to read the complete file as a single chunk instead of splitting into chunks. When enabled, the entire file content will be read into memory at once. Default is false. ### quote_char [string] A single character that encloses CSV fields, allowing fields with commas, line breaks, or quotes to be read correctly. ### escape_char [string] A single character that allows the quote or other special characters to appear inside a CSV field without ending the field. ### schema [config] #### fields [Config] The schema of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). #### schema_url [string] Get the http url of metadata information through restApi, such as: `http://localhost:8090/api/metalakes/laowang_test/catalogs/221-pgsql/schemas/ykw/tables/all_type` > When using Gravitino as the metadata source, the column types from Gravitino will be automatically converted to SeaTunnel data types. For detailed type mapping information, please refer to [Gravitino Type Mapping](../../introduction/concepts/gravitino-type-mapping.md). ### metalake_type [string] The type of metalake service, currently only supports `gravitino`. When using `schema_url` to obtain metadata from Gravitino, you can specify this parameter (default is `gravitino`). For more information about Metalake, please refer to [Metalake](../../introduction/concepts/metalake.md). ## Example 1. In this example, We read data from s3 path `s3a://seatunnel-test/seatunnel/text` and the file type is orc in this path. We use `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` to authentication so `access_key` and `secret_key` is required. 
All columns in the file will be read and send to sink. ``` # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { S3File { path = "/seatunnel/text" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" bucket = "s3a://seatunnel-test" file_format_type = "orc" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transforms } sink { Console {} } ``` 2. Use `InstanceProfileCredentialsProvider` to authentication The file type in S3 is json, so need config schema option. ```hocon S3File { path = "/seatunnel/json" bucket = "s3a://seatunnel-test" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" file_format_type = "json" schema { fields { id = int name = string } } } ``` 3. Use `InstanceProfileCredentialsProvider` to authentication The file type in S3 is json and has five fields (`id`, `name`, `age`, `sex`, `type`), so need config schema option. In this job, we only need send `id` and `name` column to mysql. 
``` # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { S3File { path = "/seatunnel/json" bucket = "s3a://seatunnel-test" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" file_format_type = "json" read_columns = ["id", "name"] schema { fields { id = int name = string age = int sex = int type = string } } } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transforms } sink { Console {} } ``` ### Filter File ```hocon env { parallelism = 1 job.mode = "BATCH" } source { S3File { path = "/seatunnel/json" bucket = "s3a://seatunnel-test" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" file_format_type = "json" read_columns = ["id", "name"] // file example abcD2024.csv file_filter_pattern = "abc[DX]*.*" } } sink { Console { } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/SftpFile.md ================================================ import ChangeLog from '../changelog/connector-file-sftp.md'; # SftpFile > Sftp file source connector ## Support Those Engines > Spark
    > Flink
    > SeaTunnel Zeta
    ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [multimodal](../../introduction/concepts/connector-v2-features.md#multimodal) Use binary file format to read and write files in any format, such as videos, pictures, etc. In short, any files can be synchronized to the target place. - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) - [x] file format type - [x] text - [x] csv - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## Description Read data from sftp file server. ## Supported DataSource Info In order to use the SftpFile connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Dependency | |------------|--------------------|-----------------------------------------------------------------------------------------| | SftpFile | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-file-sftp) | :::tip If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. We made some trade-offs in order to support more file types, so we used the HDFS protocol for internal access to Sftp and this connector need some hadoop dependencies. It only supports hadoop version **2.9.X+**. 
::: ## Data Type Mapping The File does not have a specific type list, and we can indicate which SeaTunnel data type the corresponding data needs to be converted to by specifying the Schema in the config. | SeaTunnel Data type | |---------------------| | STRING | | SHORT | | INT | | BIGINT | | BOOLEAN | | DOUBLE | | DECIMAL | | FLOAT | | DATE | | TIME | | TIMESTAMP | | BYTES | | ARRAY | | MAP | ## Source Options | Name | Type | Required | default value | Description | |----------------------------|---------|----------|-------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | host | String | Yes | - | The target sftp host is required | | port | Int | Yes | - | The target sftp port is required | | user | String | Yes | - | The target sftp username is required | | password | String | Yes | - | The target sftp password is required | | path | String | Yes | - | The source file path. | | file_format_type | String | Yes | - | Please check #file_format_type below | | file_filter_pattern | String | No | - | Filter pattern, which used for filtering files. | | filename_extension | string | no | - | Filter filename extension, which used for filtering files with specific extension. Example: `csv` `.txt` `json` `.xml`. | | delimiter/field_delimiter | String | No | \001 for text and ',' for csv | **delimiter** parameter will deprecate after version 2.3.5, please use **field_delimiter** instead.
    Field delimiter, used to tell connector how to slice and dice fields when reading text files.
    Default `\001`, the same as hive's default delimiter | | row_delimiter | string | no | \n | Row delimiter, used to tell connector how to slice and dice rows when reading text files.
    Default `\n` | | parse_partition_from_path | Boolean | No | true | Controls whether to parse the partition keys and values from the file path.
    For example, if you read a file from path `oss://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`
    Every record read from the file will have these two fields added:
    name age
    tyrantlucifer 26
    Tips: **Do not define partition fields in schema option** | | date_format | String | No | yyyy-MM-dd | Date type format, used to tell connector how to convert string to date, supported as the following formats:
    `yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`
    default `yyyy-MM-dd` | | datetime_format | String | No | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:
    `yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss`
    default `yyyy-MM-dd HH:mm:ss` | | time_format | String | No | HH:mm:ss | Time type format, used to tell connector how to convert string to time, supported as the following formats:
    `HH:mm:ss` `HH:mm:ss.SSS`
    default `HH:mm:ss` | | skip_header_row_number | Long | No | 0 | Skip the first few lines, but only for the txt and csv.
    For example, set like following:
    `skip_header_row_number = 2`
    then SeaTunnel will skip the first 2 lines from source files | | read_columns | list | no | - | The read column list of the data source, user can use it to implement field projection. | | sheet_name | String | No | - | Read the specified sheet of the workbook. Only used when file_format is excel. | | xml_row_tag | string | no | - | Specifies the tag name of the data rows within the XML file, only used when file_format is xml. | | xml_use_attr_format | boolean | no | - | Specifies whether to process data using the tag attribute format, only used when file_format is xml. | | csv_use_header_line | boolean | no | false | Whether to use the header line to parse the file, only used when the file_format is `csv` and the file contains a header line that matches RFC 4180 | | schema | Config | No | - | Please check #schema below | | compress_codec | String | No | None | The compress codec of files. The supported codecs are as follows:
    - txt: `lzo` `None`
    - json: `lzo` `None`
    - csv: `lzo` `None`
    - orc: `lzo` `snappy` `lz4` `zlib` `None`
    - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `None`
    Tips: excel type does not support any compression format | | archive_compress_codec | string | no | none | | | encoding | string | no | UTF-8 | | | null_format | string | no | - | Only used when file_format_type is text. Defines which strings are interpreted as null, e.g. `\N` | | binary_chunk_size | int | no | 1024 | Only used when file_format_type is binary. The chunk size (in bytes) for reading binary files. Default is 1024 bytes. Larger values may improve performance for large files but use more memory. | | binary_complete_file_mode | boolean | no | false | Only used when file_format_type is binary. Whether to read the complete file as a single chunk instead of splitting into chunks. When enabled, the entire file content will be read into memory at once. Default is false. | | sync_mode | string | no | full | File sync mode. Supported values: `full`, `update`. When `update`, the source compares files between source/target and only reads new/changed files (currently only supports `file_format_type=binary`). | | target_path | string | no | - | Only used when `sync_mode=update`. Target base path used for comparison (it should usually be the same as sink `path`). | | target_hadoop_conf | map | no | - | Only used when `sync_mode=update`. Extra Hadoop configuration for target filesystem. You can set `fs.defaultFS` in this map to override target defaultFS. | | update_strategy | string | no | distcp | Only used when `sync_mode=update`. Supported values: `distcp` (default), `strict`. | | compare_mode | string | no | len_mtime | Only used when `sync_mode=update`. Supported values: `len_mtime` (default), `checksum` (only valid when `update_strategy=strict`). | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. | | file_filter_modified_start | string | no | - | File modification time filter. 
The connector will filter files based on the last modification start time (start time is inclusive). The default data format is `yyyy-MM-dd HH:mm:ss`. | | file_filter_modified_end | string | no | - | File modification time filter. The connector will filter files based on the last modification end time (end time is exclusive). The default data format is `yyyy-MM-dd HH:mm:ss`. | | quote_char | string | no | " | A single character that encloses CSV fields, allowing fields with commas, line breaks, or quotes to be read correctly. | | escape_char | string | no | - | A single character that allows the quote or other special characters to appear inside a CSV field without ending the field. | | metalake_type | string | no | gravitino | The type of metalake service, currently supports `gravitino`. | ### file_filter_pattern [string] Filter pattern, which is used for filtering files. If you only want to filter based on file names, simply write the regular file names; if you want to filter based on the file directory at the same time, the expression needs to start with `path`. The pattern follows standard regular expressions. For details, please refer to https://en.wikipedia.org/wiki/Regular_expression. Here are some examples. 
If the `path` is `/data/seatunnel`, and the file structure example is: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv /data/seatunnel/20241012/logo.png ``` Matching Rules Example: **Example 1**: *Match all .txt files*,Regular Expression: ``` .*.txt ``` The result of this example matching is: ``` /data/seatunnel/20241001/report.txt ``` **Example 2**: *Match all file starting with abc*,Regular Expression: ``` abc.* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **Example 3**: *Match all files starting with abc in folder 20241007,And the fourth character is either h or g*, the Regular Expression: ``` /data/seatunnel/20241007/abc[h,g].* ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv ``` **Example 4**: *Match third level folders starting with 202410 and files ending with .csv*, the Regular Expression: ``` /data/seatunnel/202410\d*/.*.csv ``` The result of this example matching is: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### file_format_type [string] File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. 
For example: upstream data is the following: ```json {"code": 200, "data": "get success", "success": true} ``` You can also save multiple pieces of data in one file and split them by newline: ```json lines {"code": 200, "data": "get success", "success": true} {"code": 300, "data": "get failed", "success": false} ``` you should assign schema as the following: ```hocon schema { fields { code = int data = string success = boolean } } ``` connector will generate data as the following: | code | data | success | |------|-------------|---------| | 200 | get success | true | If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. If you assign file type to `text` `csv`, you can choose to specify the schema information or not. For example, upstream data is the following: ```text tyrantlucifer#26#male ``` If you do not assign data schema connector will treat the upstream data as the following: | content | |-----------------------| | tyrantlucifer#26#male | If you assign data schema, you should also assign the option `field_delimiter` too except CSV file type you should assign schema and delimiter as the following: ```hocon field_delimiter = "#" schema { fields { name = string age = int gender = string } } ``` connector will generate data as the following: | name | age | gender | |---------------|-----|--------| | tyrantlucifer | 26 | male | If you assign file type to `binary`, SeaTunnel can synchronize files in any format, such as compressed packages, pictures, etc. In short, any files can be synchronized to the target place. Under this requirement, you need to ensure that the source and sink use `binary` format for file synchronization at the same time. If you assign file type to `markdown`, SeaTunnel can parse markdown files and extract structured data. The markdown parser extracts various elements including headings, paragraphs, lists, code blocks, tables, and more. 
Each element is converted to a row with the following schema: - `element_id`: Unique identifier for the element - `element_type`: Type of the element (Heading, Paragraph, ListItem, etc.) - `heading_level`: Level of heading (1-6, null for non-heading elements) - `text`: Text content of the element - `page_number`: Page number (default: 1) - `position_index`: Position index within the document - `parent_id`: ID of the parent element - `child_ids`: Comma-separated list of child element IDs Note: Markdown format only supports reading, not writing. ### compress_codec [string] The compress codec of files and the details that supported as the following shown: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: automatically recognizes the compression type, no additional settings required. ### archive_compress_codec [string] The compress codec of archive files and the details that supported as the following shown: | archive_compress_codec | file_format | archive_compress_suffix | |--------------------|--------------------|---------------------| | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | | GZ | txt,json,excel,xml | .gz | | NONE | all | .* | Note: gz compressed excel file needs to compress the original file or specify the file suffix, such as e2e.xls ->e2e_test.xls.gz ### encoding [string] Only used when file_format_type is json,text,csv,xml. The encoding of the file to read. This param will be parsed by `Charset.forName(encoding)`. ### binary_chunk_size [int] Only used when file_format_type is binary. The chunk size (in bytes) for reading binary files. Default is 1024 bytes. Larger values may improve performance for large files but use more memory. ### binary_complete_file_mode [boolean] Only used when file_format_type is binary. Whether to read the complete file as a single chunk instead of splitting into chunks. 
When enabled, the entire file content will be read into memory at once. Default is false. ### sync_mode [string] File sync mode. Supported values: `full` (default), `update`. When `update`, the source compares files between source/target and only reads new/changed files (currently only supports `file_format_type=binary`). **Performance considerations** - Update mode triggers an extra `getFileStatus` call on the target for each source file. - For remote file systems (FTP/SFTP), this adds per-file network overhead. It is not recommended for massive small-file scenarios. **Requirements / limitations** - `target_path` should typically align with sink `path` (same filesystem and same relative path layout). - When `update_strategy=distcp`, correctness depends on source/target clock synchronization. - When `compare_mode=checksum`, filesystem checksum support is required. If checksum is unavailable, SeaTunnel falls back to content comparison (more expensive) and logs a warning. Example: ```hocon sync_mode = "update" file_format_type = "binary" target_path = "/path/to/your/sink/path" update_strategy = "distcp" compare_mode = "len_mtime" ``` ### target_path [string] Only used when `sync_mode=update`. Target base path used for comparison (it should usually be the same as sink `path`). ### target_hadoop_conf [map] Only used when `sync_mode=update`. Extra Hadoop configuration for target filesystem. You can set `fs.defaultFS` in this map to override target defaultFS. ### update_strategy [string] Only used when `sync_mode=update`. Supported values: `distcp` (default), `strict`. ### compare_mode [string] Only used when `sync_mode=update`. Supported values: `len_mtime` (default), `checksum` (only valid when `update_strategy=strict`). ### quote_char [string] A single character that encloses CSV fields, allowing fields with commas, line breaks, or quotes to be read correctly. 
### escape_char [string] A single character that allows the quote or other special characters to appear inside a CSV field without ending the field. ### schema [config] #### fields [Config] The schema of upstream data. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). #### schema_url [string] Get the http url of metadata information through restApi, such as: `http://localhost:8090/api/metalakes/laowang_test/catalogs/221-pgsql/schemas/ykw/tables/all_type` > When using Gravitino as the metadata source, the column types from Gravitino will be automatically converted to SeaTunnel data types. For detailed type mapping information, please refer to [Gravitino Type Mapping](../../introduction/concepts/gravitino-type-mapping.md). ### metalake_type [string] The type of metalake service, currently only supports `gravitino`. When using `schema_url` to obtain metadata from Gravitino, you can specify this parameter (default is `gravitino`). For more information about Metalake, please refer to [Metalake](../../introduction/concepts/metalake.md). 
## How to Create a Sftp Data Synchronization Jobs The following example demonstrates how to create a data synchronization job that reads data from sftp and prints it on the local client: ```bash # Set the basic configuration of the task to be performed env { parallelism = 1 job.mode = "BATCH" } # Create a source to connect to sftp source { SftpFile { host = "sftp" port = 22 user = seatunnel password = pass path = "tmp/seatunnel/read/json" file_format_type = "json" plugin_output = "sftp" schema = { fields { c_map = "map<string, string>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { C_MAP = "map<string, string>" C_ARRAY = "array<int>" C_STRING = string C_BOOLEAN = boolean C_TINYINT = tinyint C_SMALLINT = smallint C_INT = int C_BIGINT = bigint C_FLOAT = float C_DOUBLE = double C_BYTES = bytes C_DATE = date C_DECIMAL = "decimal(38, 18)" C_TIMESTAMP = timestamp } } } } } # Console printing of the read sftp data sink { Console { parallelism = 1 } } ``` ### Multiple Table ```hocon SftpFile { tables_configs = [ { schema { table = "student" fields { name = string age = int } } path = "/tmp/seatunnel/sink/text" host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao file_format_type = "parquet" }, { schema { table = "teacher" fields { name = string age = int } } path = "/tmp/seatunnel/sink/text" host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao file_format_type = "parquet" } ] } ``` ### Filter File ```hocon env { parallelism = 1 job.mode = "BATCH" } source { SftpFile { host = "sftp" port = 22 user = seatunnel password = pass path = "tmp/seatunnel/read/json" file_format_type = "json" plugin_output = "sftp" // file example abcD2024.csv file_filter_pattern = "abc[DX]*.*" } } sink { Console { } } ``` ### Incremental Sync (sync_mode=update, binary) `sync_mode=update` 
compares files between source and `target_path`, then only reads new/changed files. In most cases, `target_path` should be aligned with sink `path` (same filesystem and same relative paths). ```hocon env { parallelism = 1 job.mode = "BATCH" } source { SftpFile { host = "sftp" port = 22 user = seatunnel password = pass path = "tmp/seatunnel/update/src" file_format_type = "binary" sync_mode = "update" target_path = "tmp/seatunnel/update/dst" update_strategy = "distcp" compare_mode = "len_mtime" } } sink { SftpFile { host = "sftp" port = 22 user = seatunnel password = pass path = "tmp/seatunnel/update/dst" tmp_path = "tmp/seatunnel/update/tmp" file_format_type = "binary" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Sls.md ================================================ import ChangeLog from '../changelog/connector-sls.md'; # Sls > Sls source connector ## Support Those Engines > Spark
    > Flink
    > SeaTunnel Zeta
    ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description Source connector for Aliyun Sls. ## Supported DataSource Info In order to use the Sls connector, the following dependencies are required. They can be downloaded via install-plugin.sh or from the Maven central repository. | Datasource | Supported Versions | Maven | |------------|--------------------|-----------------------------------------------------------------------------------| | Sls | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-sls) | ## Source Options | Name | Type | Required | Default | Description | |-------------------------------------|---------------------------------------------|----------|--------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------| | project | String | Yes | - | [Aliyun Sls Project](https://help.aliyun.com/zh/sls/user-guide/manage-a-project?spm=a2c4g.11186623.0.0.6f9755ebyfaYSl) | | logstore | String | Yes | - | [Aliyun Sls Logstore](https://help.aliyun.com/zh/sls/user-guide/manage-a-logstore?spm=a2c4g.11186623.0.0.13137c08nfuiBC) | | endpoint | String | Yes | - | [Aliyun Access Endpoint](https://help.aliyun.com/zh/sls/developer-reference/api-sls-2020-12-30-endpoint?spm=a2c4g.11186623.0.0.548945a8UyJULa) | | access_key_id | String | Yes | - | [Aliyun AccessKey ID](https://help.aliyun.com/zh/ram/user-guide/create-an-accesskey-pair?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#task-2245479) | | 
access_key_secret | String | Yes | - | [Aliyun AccessKey Secret](https://help.aliyun.com/zh/ram/user-guide/create-an-accesskey-pair?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#task-2245479) | | start_mode | StartMode[earliest],[group_cursor],[latest] | No | group_cursor | The initial consumption pattern of consumers. | | consumer_group | String | No | SeaTunnel-Consumer-Group | Sls consumer group id, used to distinguish different consumer groups. | | auto_cursor_reset | CursorMode[begin],[end] | No | end | When there is no cursor in the consumer group, cursor initialization occurs | | batch_size | Int | No | 1000 | The amount of data pulled from SLS each time | | partition-discovery.interval-millis | Long | No | -1 | The interval for dynamically discovering topics and partitions. | ## Task Example ### Simple > This example reads the data of sls's logstore1 and prints it to the client. If you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../getting-started/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) to run this job. 
[Create RAM user and authorization](https://help.aliyun.com/zh/sls/create-a-ram-user-and-authorize-the-ram-user-to-access-log-service?spm=a2c4g.11186623.0.i4). Please ensure the RAM user has sufficient permissions to access the logstore; for reference, see [RAM Custom Authorization Example](https://help.aliyun.com/zh/sls/use-custom-policies-to-grant-permissions-to-a-ram-user?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#reference-s3z-m1l-z2b) ```hocon # Defining the runtime environment env { parallelism = 2 job.mode = "STREAMING" checkpoint.interval = 30000 } source { Sls { endpoint = "cn-hangzhou-intranet.log.aliyuncs.com" project = "project1" logstore = "logstore1" access_key_id = "xxxxxxxxxxxxxxxxxxxxxxxx" access_key_secret = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" schema = { fields = { id = "int" name = "string" description = "string" weight = "string" } } } } sink { Console { } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Snowflake.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Snowflake > JDBC Snowflake Source Connector > > ## Support those engines > > Spark
    > Flink
    > SeaTunnel Zeta
    > ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) > supports query SQL and can achieve projection effect. > ## Description Read external data source data through JDBC. ## Supported DataSource list | datasource | supported versions | driver | url | maven | |------------|----------------------------------------------------------|-------------------------------------------|------------------------------------------------------------|-----------------------------------------------------------------------------| | snowflake | Different dependency version has different driver class. | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | [Download](https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc) | ## Database dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
    > For example Snowflake datasource: cp snowflake-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ > ## Data Type Mapping | Snowflake Data type | SeaTunnel Data type | |-----------------------------------------------------------------------------|---------------------| | BOOLEAN | BOOLEAN | | TINYINT
    SMALLINT
    BYTEINT
    | SHORT_TYPE | | INT
    INTEGER
    | INT | | BIGINT | LONG | | DECIMAL
    NUMERIC
    NUMBER
    | DECIMAL(x,y) | | DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | | REAL
    FLOAT4 | FLOAT | | DOUBLE
    DOUBLE PRECISION
    FLOAT8
    FLOAT
    | DOUBLE | | CHAR
    CHARACTER
    VARCHAR
    STRING
    TEXT
    VARIANT
    OBJECT | STRING | | DATE | DATE | | TIME | TIME | | DATETIME
    TIMESTAMP
    TIMESTAMP_LTZ
    TIMESTAMP_NTZ
    TIMESTAMP_TZ | TIMESTAMP | | BINARY
    VARBINARY | BYTES | | GEOGRAPHY (WKB or EWKB)
    GEOMETRY (WKB or EWKB) | BYTES | | GEOGRAPHY (GeoJSON, WKT or EWKT)
    GEOMETRY (GeoJSON, WKT or EWKT) | STRING | ## Options | name | type | required | default | description | |------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:snowflake://.snowflakecomputing.com | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
    if you use Snowflake the value is `net.snowflake.client.jdbc.SnowflakeDriver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | Yes | - | Query statement | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | | partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. | | partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | | partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | | partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | | fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
    the row fetch size used in the query to improve performance by
    reducing the number of database hits required to satisfy the selection criteria.
    Zero means use jdbc default value. | | properties | Map | No | - | Additional connection configuration parameters. When properties and URL have the same parameters, the priority is determined by the
    specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ## tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. > > JDBC Driver Connection Parameters are supported in JDBC connection string. E.g, you can add `?GEOGRAPHY_OUTPUT_FORMAT='EWKT'` to specify the Geospatial Data Types. For more information about configurable parameters, and geospatial data types please visit Snowflake official [document](https://docs.snowflake.com/en/sql-reference/data-types-geospatial) ## Task Example ### simple > This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. 
``` # Defining the runtime environment env { parallelism = 2 job.mode = "BATCH" } source { Jdbc { url = "jdbc:snowflake://.snowflakecomputing.com" driver = "net.snowflake.client.jdbc.SnowflakeDriver" connection_check_timeout_sec = 100 username = "root" password = "123456" query = "select * from type_bin limit 16" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} } ``` ### parallel > Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table ``` Jdbc { url = "jdbc:snowflake://.snowflakecomputing.com" driver = "net.snowflake.client.jdbc.SnowflakeDriver" connection_check_timeout_sec = 100 username = "root" password = "123456" # Define query logic as required query = "select * from type_bin" # Parallel sharding reads fields partition_column = "id" # Number of fragments partition_num = 10 } ``` ### parallel boundary > It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured ``` Jdbc { url = "jdbc:snowflake://.snowflakecomputing.com" driver = "net.snowflake.client.jdbc.SnowflakeDriver" connection_check_timeout_sec = 100 username = "root" password = "123456" # Define query logic as required query = "select * from type_bin" partition_column = "id" # Read start boundary partition_lower_bound = 1 # Read end boundary partition_upper_bound = 500 partition_num = 10 } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Socket.md ================================================ import ChangeLog from '../changelog/connector-socket.md'; # Socket > Socket source connector ## Support Those Engines > Spark
    > Flink
    > SeaTunnel Zeta
    ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description Used to read data from Socket. ## Data Type Mapping The File does not have a specific type list, and we can indicate which SeaTunnel data type the corresponding data needs to be converted to by specifying the Schema in the config. | SeaTunnel Data type | |---------------------| | STRING | | SHORT | | INT | | BIGINT | | BOOLEAN | | DOUBLE | | DECIMAL | | FLOAT | | DATE | | TIME | | TIMESTAMP | | BYTES | | ARRAY | | MAP | ## Options | Name | Type | Required | Default | Description | |----------------|---------|----------|---------|--------------------------------------------------------------------------------------------------------------------| | host | String | Yes | _ | socket server host | | port | Integer | Yes | _ | socket server port | | common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. 
| ## How to Create a Socket Data Synchronization Jobs * Configuring the SeaTunnel config file The following example demonstrates how to create a data synchronization job that reads data from Socket and prints it on the local client: ```bash # Set the basic configuration of the task to be performed env { parallelism = 1 job.mode = "BATCH" } # Create a source to connect to socket source { Socket { host = "localhost" port = 9999 } } # Console printing of the read socket data sink { Console { parallelism = 1 } } ``` * Start a port listening ```shell nc -l 9999 ``` * Start a SeaTunnel task * Socket Source send test data ```text ~ nc -l 9999 test hello flink spark ``` * Console Sink print data ```text [test] [hello] [flink] [spark] ``` ## Changelog ================================================ FILE: docs/en/connectors/source/SqlServer-CDC.md ================================================ import ChangeLog from '../changelog/connector-cdc-sqlserver.md'; # SQL Server CDC > Sql Server CDC source connector ## Support SQL Server Version - server:2019 (Or later version for information only) ## Support Those Engines > SeaTunnel Zeta
    > Flink
    ## Key Features - [ ] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description The Sql Server CDC connector allows for reading snapshot data and incremental data from SqlServer database. This document describes how to setup the Sql Server CDC connector to run SQL queries against SqlServer databases. :::tip When discovering table columns via JDBC metadata, SeaTunnel filters metadata rows by the exact schema/table identifier to avoid mixing columns from other tables (some drivers treat `schemaPattern`/`tableNamePattern` as SQL LIKE patterns). For case-sensitive databases, make sure the configured identifier case matches the database. ::: ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------|---------------------------------------------------------------|----------------------------------------------|---------------------------------------------------------------|-----------------------------------------------------------------------| | SqlServer |
  • server:2019 (Or later version for information only)
  • | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433;databaseName=column_type_test | https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc | ## Using Dependency ### Install Jdbc Driver #### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. #### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Data Type Mapping | SQLserver Data Type | SeaTunnel Data Type | |----------------------------------------------------------------------|---------------------| | CHAR
    VARCHAR
    NCHAR
    NVARCHAR
    TEXT
    NTEXT
    XML | STRING | | BINARY
    VARBINARY
    IMAGE | BYTES | | INTEGER
    INT | INT | | SMALLINT
    TINYINT | SMALLINT | | BIGINT | BIGINT | | FLOAT(1~24)
    REAL | FLOAT | | DOUBLE
    FLOAT(>24) | DOUBLE | | NUMERIC(p,s)
    DECIMAL(p,s)
    MONEY
    SMALLMONEY | DECIMAL(p, s) | | TIMESTAMP | BYTES | | DATE | DATE | | TIME(s) | TIME(s) | | DATETIME(s)
    DATETIME2(s)
    DATETIMEOFFSET(s)
    SMALLDATETIME | TIMESTAMP(s) | | BOOLEAN
    BIT
    | BOOLEAN | ## Source Options | Name | Type | Required | Default | Description | |-------------------------------------------|----------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | username | String | Yes | - | Name of the database to use when connecting to the database server. | | password | String | Yes | - | Password to use when connecting to the database server. | | database-names | List | Yes | - | Database name of the database to monitor. | | table-names | List | Yes | - | Table name is a combination of schema name and table name (databaseName.schemaName.tableName). | | table-names-config | List | No | - | Table config list. for example: [{"table": "db1.schema1.table1","primaryKeys": ["key1"],"snapshotSplitColumn": "key2"}] | | url | String | Yes | - | URL has to be with database, like "jdbc:sqlserver://localhost:1433;databaseName=test". | | startup.mode | Enum | No | INITIAL | Optional startup mode for SqlServer CDC consumer, valid enumerations are "initial", "earliest", "latest", "timestamp" and "specific". | | startup.timestamp | Long | No | - | Start from the specified epoch timestamp (in milliseconds). This timestamp is converted with `server-time-zone` when `startup.mode = timestamp`.
    **Note, This option is required when the "startup.mode" option used `'timestamp'`.** | | startup.specific-offset.file | String | No | - | Start from the specified binlog file name.
    **Note, This option is required when the "startup.mode" option used `'specific'`.** | | startup.specific-offset.pos | Long | No | - | Start from the specified binlog file position.
    **Note, This option is required when the "startup.mode" option used `'specific'`.** | | stop.mode | Enum | No | NEVER | Optional stop mode for SqlServer CDC consumer, valid enumerations are "never". | | stop.timestamp | Long | No | - | Stop from the specified epoch timestamp (in milliseconds).
    **Note, This option is required when the "stop.mode" option used `'timestamp'`.** | | stop.specific-offset.file | String | No | - | Stop from the specified binlog file name.
    **Note, This option is required when the "stop.mode" option used `'specific'`.** | | stop.specific-offset.pos | Long | No | - | Stop from the specified binlog file position.
    **Note, This option is required when the "stop.mode" option used `'specific'`.** | | incremental.parallelism | Integer | No | 1 | The number of parallel readers in the incremental phase. | | snapshot.split.size | Integer | No | 8096 | The split size (number of rows) of table snapshot, captured tables are split into multiple splits when read the snapshotof table. | | snapshot.fetch.size | Integer | No | 1024 | The maximum fetch size for per poll when read table snapshot. | | server-time-zone | String | No | UTC | The session time zone in database server. This value is also used when converting `startup.timestamp` to LSN. Set it explicitly when database time zone and JVM time zone are different. | | connect.timeout | Duration | No | 30s | The maximum time that the connector should wait after trying to connect to the database server before timing out. | | connect.max-retries | Integer | No | 3 | The max retry times that the connector should retry to build database server connection. | | connection.pool.size | Integer | No | 20 | The connection pool size. | | chunk-key.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. | | chunk-key.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. 
If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | | sample-sharding.threshold | int | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | | inverse-sampling.rate | int | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | | exactly_once | Boolean | No | false | Enable exactly once semantic. | | debezium.* | config | No | - | Pass-through Debezium's properties to Debezium Embedded Engine which is used to capture data changes from SqlServer server.
    See more about
    the [Debezium's SqlServer Connector properties](https://github.com/debezium/debezium/blob/1.6/documentation/modules/ROOT/pages/connectors/sqlserver.adoc#connector-properties) | | format | Enum | No | DEFAULT | Optional output format for SqlServer CDC, valid enumerations are "DEFAULT"、"COMPATIBLE_DEBEZIUM_JSON". | | common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details. | ### Enable Sql Server CDC 1. Check whether the CDC Agent is enabled > EXEC xp_servicecontrol N'querystate', N'SQLServerAGENT';
    > If the result is running, prove that it is enabled. Otherwise, you need to manually enable it 2. Enable the CDC Agent > /opt/mssql/bin/mssql-conf setup 3. The result is as follows > 1) Evaluation (free, no production use rights, 180-day limit) > 2) Developer (free, no production use rights) > 3) Express (free) > 4) Web (PAID) > 5) Standard (PAID) > 6) Enterprise (PAID) > 7) Enterprise Core (PAID) > 8) I bought a license through a retail sales channel and have a product key to enter. 4. Set the CDC at the library level Set the library level below to enable CDC. At this level, all tables under the libraries of the enabled CDC automatically enable CDC > USE TestDB; -- Replace with the actual database name
    > EXEC sys.sp_cdc_enable_db;
    > SELECT name, is_tracked_by_cdc FROM sys.tables WHERE name = 'table'; -- table Replace with the name of the table you want to check ## Task Example ### initiali read Simple > This is a stream mode cdc initializes read table data will be read incrementally after successful read The following sql DDL is for reference only ``` env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** SqlServer-CDC { plugin_output = "customers" username = "sa" password = "Y.sa123456" startup.mode="initial" database-names = ["column_type_test"] table-names = ["column_type_test.dbo.full_types"] url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" } } transform { } sink { console { plugin_input = "customers" } } ``` ### increment read Simple > This is an incremental read that reads the changed data for printing ``` env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** SqlServer-CDC { # Set up accurate one read exactly_once=true plugin_output = "customers" username = "sa" password = "Y.sa123456" startup.mode="latest" database-names = ["column_type_test"] table-names = ["column_type_test.dbo.full_types"] url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" } } transform { } sink { console { plugin_input = "customers" } } ``` ### Support custom primary key for table ``` env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { SqlServer-CDC { url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = "sa" password = "Y.sa123456" database-names = ["column_type_test"] table-names = ["column_type_test.dbo.simple_types", "column_type_test.dbo.full_types"] table-names-config = [ { table = 
"column_type_test.dbo.full_types" primaryKeys = ["id"] } ] } } sink { console { } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/SqlServer.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # SQL Server > JDBC SQL Server Source Connector ## Support SQL Server Version - server:2008 (Or later version for information only) ## Support Those Engines > Spark
    > Flink
    > SeaTunnel Zeta
    ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) > supports query SQL and can achieve projection effect. ## Description Read external data source data through JDBC. ## Supported DataSource Info | datasource | supported versions | driver | url | maven | |------------|-------------------------|----------------------------------------------|---------------------------------|-----------------------------------------------------------------------------------| | SQL Server | support version >= 2008 | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433 | [Download](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) | ## Database dependency > Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
    > For example SQL Server datasource: cp mssql-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## Data Type Mapping | SQLserver Data type | SeaTunnel Data type | |----------------------------------------------------------------------|---------------------| | BIT | BOOLEAN | | TINYINT
    SMALLINT | SMALLINT | | INTEGER
    INT | INT | | BIGINT | BIGINT | | NUMERIC(p,s)
    DECIMAL(p,s)
    MONEY
    SMALLMONEY | DECIMAL(p,s) | | FLOAT(1~24)
    REAL | FLOAT | | DOUBLE
    FLOAT(>24) | DOUBLE | | CHAR
    NCHAR
    VARCHAR
    NTEXT
    NVARCHAR
    TEXT
    XML | STRING | | DATE | DATE | | TIME(s) | TIME(s) | | DATETIME(s)
    DATETIME2(s)
    DATETIMEOFFSET(s)
    SMALLDATETIME | TIMESTAMP(s) | | BINARY
    VARBINARY
    IMAGE | BYTES | ## Source Options | name | type | required | default | Description | |--------------------------------------------|--------|----------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:sqlserver://127.0.0.1:1434;database=TestDB | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
    if you use SQLserver the value is `com.microsoft.sqlserver.jdbc.SQLServerDriver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | Yes | - | Query statement | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | | partition_column | String | No | - | The column name for parallelism's partition, only support numeric type. | | partition_lower_bound | Long | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | | partition_upper_bound | Long | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | | partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | | fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
    the row fetch size used in the query to improve performance by
    reducing the number of database hits required to satisfy the selection criteria.
    Zero means use jdbc default value. | | properties | Map | No | - | Additional connection configuration parameters. When properties and URL have the same parameters, the priority is determined by the
    specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | use_regex | Boolean| No | false | Control regular expression matching for table_path. When set to `true`, the table_path will be treated as a regular expression pattern. When set to `false` or not specified, the table_path will be treated as an exact path (no regex matching). | | table_path | String | No | - | The path to the full path of table, you can use this configuration instead of `query`.
    example:
    "testdb.test_schema.table1" | | table_list | Array | No | - | The list of tables to be read, you can use this configuration instead of `table_path` example: ```[{ table_path = "testdb.table1"}, {table_path = "testdb.table2", query = "select * id, name from testdb.table2"}]``` | | where_condition | String | No | - | Common row filter conditions for all tables/queries, must start with `where`. for example `where id > 100` | | split.size | Int | No | 8096 | The split size (number of rows) of table, captured tables are split into multiple splits when read of table. | | split.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. | | split.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. 
| | split.sample-sharding.threshold | Int | No | 10000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. | | split.inverse-sampling.rate | Int | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ## Parallel Reader The JDBC Source connector supports parallel reading of data from tables. SeaTunnel will use certain rules to split the data in the table, which will be handed over to readers for reading. The number of readers is determined by the `parallelism` option. **Split Key Rules:** 1. If `partition_column` is not null, It will be used to calculate split. The column must in **Supported split data type**. 2. If `partition_column` is null, seatunnel will read the schema from table and get the Primary Key and Unique Index. If there are more than one column in Primary Key and Unique Index, The first column which in the **supported split data type** will be used to split data. 
For example, the table has Primary Key(nn guid, name varchar), because `guid` is not in **supported split data type**, so the column `name` will be used to split data. **Supported split data type:** * String * Number(int, bigint, decimal, ...) * Date ### Options Related To Split #### split.size How many rows in one split, captured tables are split into multiple splits when read of table. #### split.even-distribution.factor.lower-bound > Not recommended for use The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. #### split.even-distribution.factor.upper-bound > Not recommended for use The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. #### split.sample-sharding.threshold This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. 
When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. #### split.inverse-sampling.rate The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. #### partition_column [string] The column name for split data. #### partition_upper_bound [BigDecimal] The partition_column max value for scan, if not set SeaTunnel will query database get max value. #### partition_lower_bound [BigDecimal] The partition_column min value for scan, if not set SeaTunnel will query database get min value. #### partition_num [int] > Not recommended for use, The correct approach is to control the number of split through `split.size` How many splits do we need to split into, only support positive integer. default value is job parallelism. ## tips > If the table can not be split(for example, table have no Primary Key or Unique Index, and `partition_column` is not set), it will run in single concurrency. > > Use `table_path` to replace `query` for single table reading. If you need to read multiple tables, use `table_list`. 
## Task Example ### Simple > Simple single task to read the data table ``` # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source{ Jdbc { driver = com.microsoft.sqlserver.jdbc.SQLServerDriver url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = SA password = "Y.sa123456" query = "select * from full_types_jdbc" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2/sql } sink { Console {} } ``` ### Parallel > Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table ``` env { parallelism = 10 job.mode = "BATCH" } source { Jdbc { driver = com.microsoft.sqlserver.jdbc.SQLServerDriver url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = SA password = "Y.sa123456" # Define query logic as required query = "select * from full_types_jdbc" # Parallel sharding reads fields partition_column = "id" # Number of fragments partition_num = 10 } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2/sql } sink { Console {} } ``` ### Fragmented Parallel Read Simple > It is a shard that reads data in parallel fast ``` env { # You can set engine configuration here parallelism = 10 } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** Jdbc { driver = com.microsoft.sqlserver.jdbc.SQLServerDriver url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = SA password = "Y.sa123456" query = "select * from column_type_test.dbo.full_types_jdbc" # Parallel sharding reads fields partition_column = "id" # Number of fragments partition_num = 10 } # If you would like to get more 
information about how to configure seatunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/source/Jdbc } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2/sql } sink { Console {} # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connector-v2/sink/Jdbc } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/StarRocks.md ================================================ import ChangeLog from '../changelog/connector-starrocks.md'; # StarRocks > StarRocks source connector ## Description Read external data source data through StarRocks. The internal implementation of StarRocks source connector is obtains the query plan from the frontend (FE), delivers the query plan as a parameter to BE nodes, and then obtains data results from BE nodes. 
## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [schema projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-------------------------|---------|----------|-------------------| | nodeUrls | list | yes | - | | username | string | yes | - | | password | string | yes | - | | database | string | yes | - | | table | string | no | - | | scan_filter | string | no | - | | schema | config | yes | - | | table_list | array | no | - | | request_tablet_size | int | no | Integer.MAX_VALUE | | scan_connect_timeout_ms | int | no | 30000 | | scan_query_timeout_sec | int | no | 3600 | | scan_keep_alive_min | int | no | 10 | | scan_batch_rows | int | no | 1024 | | scan_mem_limit | long | no | 2147483648 | | max_retries | int | no | 3 | | scan.params.* | string | no | - | ### nodeUrls [list] `StarRocks` cluster address, the format is `["fe_ip:fe_http_port", ...]` ### username [string] `StarRocks` user username ### password [string] `StarRocks` user password ### database [string] The name of StarRocks database ### table [string] The name of StarRocks table ### scan_filter [string] Filter expression of the query, which is transparently transmitted to StarRocks. StarRocks uses this expression to complete source-side data filtering. e.g. ``` "tinyint_1 = 100" ``` ### schema [config] #### fields [Config] The schema of the starRocks that you want to generate. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). e.g. 
``` schema { fields { name = string age = int } } ``` ### table_list [array] The list of tables to be read, you can use this configuration instead of `table` ### request_tablet_size [int] The number of StarRocks Tablets corresponding to an Partition. The smaller this value is set, the more partitions will be generated. This will increase the parallelism on the engine side, but at the same time will cause greater pressure on StarRocks. The following is an example to explain how to use request_tablet_size to controls the generation of partitions ``` the tablet distribution of StarRocks table in cluster as follower be_node_1 tablet[1, 2, 3, 4, 5] be_node_2 tablet[6, 7, 8, 9, 10] be_node_3 tablet[11, 12, 13, 14, 15] 1.If not set request_tablet_size, there will no limit on the number of tablets in a single partition. The partitions will be generated as follows partition[0] read data of tablet[1, 2, 3, 4, 5] from be_node_1 partition[1] read data of tablet[6, 7, 8, 9, 10] from be_node_2 partition[2] read data of tablet[11, 12, 13, 14, 15] from be_node_3 2.if set request_tablet_size=3, the limit on the number of tablets in a single partition is 3. The partitions will be generated as follows partition[0] read data of tablet[1, 2, 3] from be_node_1 partition[1] read data of tablet[4, 5] from be_node_1 partition[2] read data of tablet[6, 7, 8] from be_node_2 partition[3] read data of tablet[9, 10] from be_node_2 partition[4] read data of tablet[11, 12, 13] from be_node_3 partition[5] read data of tablet[14, 15] from be_node_3 ``` ### scan_connect_timeout_ms [int] requests connection timeout sent to StarRocks ### scan_query_timeout_sec [int] Query the timeout time of StarRocks, the default value is 1 hour, -1 means no timeout limit ### scan_keep_alive_min [int] The keep-alive duration of the query task, in minutes. The default value is 10. we recommend that you set this parameter to a value greater than or equal to 5. 
### scan_batch_rows [int] The maximum number of data rows to read from BE at a time. Increasing this value reduces the number of connections established between engine and StarRocks and therefore mitigates overhead caused by network latency. ### scan_mem_limit [long] The maximum memory space allowed for a single query in the BE node, in bytes. The default value is 2147483648 (2 GB). ### max_retries [int] number of retry requests sent to StarRocks ### scan.params. [string] The parameter of the scan data from be ## Example ``` source { StarRocks { nodeUrls = ["starrocks_e2e:8030"] username = root password = "" database = "test" table = "e2e_table_source" scan_batch_rows = 10 max_retries = 3 schema { fields { BIGINT_COL = BIGINT LARGEINT_COL = STRING SMALLINT_COL = SMALLINT TINYINT_COL = TINYINT BOOLEAN_COL = BOOLEAN DECIMAL_COL = "DECIMAL(20, 1)" DOUBLE_COL = DOUBLE FLOAT_COL = FLOAT INT_COL = INT CHAR_COL = STRING VARCHAR_11_COL = STRING STRING_COL = STRING DATETIME_COL = TIMESTAMP DATE_COL = DATE } } scan.params.scanner_thread_pool_thread_num = "3" } } ``` ## Example 2: Multiple tables ``` source { StarRocks { nodeUrls = ["starrocks_e2e:8030"] username = root password = "" database = "test" table_list = [ { table = "e2e_table_source" schema = { fields { BIGINT_COL = BIGINT LARGEINT_COL = STRING SMALLINT_COL = SMALLINT TINYINT_COL = TINYINT BOOLEAN_COL = BOOLEAN DECIMAL_COL = "DECIMAL(20, 1)" DOUBLE_COL = DOUBLE FLOAT_COL = FLOAT INT_COL = INT CHAR_COL = STRING VARCHAR_11_COL = STRING STRING_COL = STRING DATETIME_COL = TIMESTAMP DATE_COL = DATE } } }, { table = "e2e_table_source_2" schema = { fields { BIGINT_COL_2 = BIGINT LARGEINT_COL_2 = STRING SMALLINT_COL_2 = SMALLINT TINYINT_COL_2 = TINYINT BOOLEAN_COL_2 = BOOLEAN DECIMAL_COL_2 = "DECIMAL(20, 1)" DOUBLE_COL_2 = DOUBLE FLOAT_COL_2 = FLOAT INT_COL_2 = INT CHAR_COL_2 = STRING VARCHAR_11_COL_2 = STRING STRING_COL_2 = STRING DATETIME_COL_2 = TIMESTAMP DATE_COL_2 = DATE } } }] scan_batch_rows = 10 max_retries = 3 
scan.params.scanner_thread_pool_thread_num = "3" } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/TDengine.md ================================================ import ChangeLog from '../changelog/connector-tdengine.md'; # TDengine > TDengine source connector ## Description Read external data source data through TDengine. ## Key features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) supports query SQL and can achieve projection effect. - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |--------------|--------|----------|---------------| | url | string | yes | - | | username | string | yes | - | | password | string | yes | - | | database | string | yes | | | stable | string | yes | - | | sub_tables | list | no | - | | lower_bound | long | yes | - | | upper_bound | long | yes | - | | read_columns | list | no | - | ### url [string] the url of the TDengine when you select the TDengine e.g. ``` jdbc:TAOS-RS://localhost:6041/ ``` ### username [string] the username of the TDengine when you select ### password [string] the password of the TDengine when you select ### database [string] the database of the TDengine when you select ### stable [string] the stable of the TDengine when you select ### sub_tables [list] A list of sub_table names. If not specified, all sub-tables will be selected. If specified, only the specified sub-tables will be selected. 
### lower_bound [long] the lower_bound of the migration period ### upper_bound [long] the upper_bound of the migration period ### read_columns [list] A list of column names to read. If not specified, all columns will be selected. When reading from a super table, please make sure to put the TAGS columns at the end of the list. ## Example ### source ```hocon source { TDengine { url : "jdbc:TAOS-RS://localhost:6041/" username : "root" password : "taosdata" database : "power" stable : "meters" sub_tables : ["meter_1","meter_2"] lower_bound : "2018-10-03 14:38:05.000" upper_bound : "2018-10-03 14:38:16.800" plugin_output : "tdengine_result" read_columns : ["ts","voltage","current","power"] } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Tablestore.md ================================================ import ChangeLog from '../changelog/connector-tablestore.md'; # Tablestore > Tablestore source connector ## Description Read data from Alicloud Tablestore, support full and CDC. ## Key features - [ ] [batch](../../introduction/concepts/connector-v2-features.md) - [X] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Options | name | type | required | default value | |-----------------------|--------|----------|---------------| | end_point | string | yes | - | | instance_name | string | yes | - | | access_key_id | string | yes | - | | access_key_secret | string | yes | - | | table | string | yes | - | | primary_keys | array | yes | - | | schema | config | yes | - | ### end_point [string] The endpoint of Tablestore. ### instance_name [string] The instance name of Tablestore.
### access_key_id [string] The access id of Tablestore. ### access_key_secret [string] The access secret of Tablestore. ### table [string] The table name of Tablestore. ### primary_keys [array] The primary key of the table, just add a unique primary key. ### schema [Config] The structure of the data, including field names and field types. For more details, please refer to [Schema Feature](../../introduction/concepts/schema-feature.md). ## Example ```bash env { parallelism = 1 job.mode = "STREAMING" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** Tablestore { end_point = "https://****.cn-zhangjiakou.tablestore.aliyuncs.com" instance_name = "****" access_key_id="***************2Ag5" access_key_secret="***********2Dok" table="test" primary_keys=["id"] schema={ fields { id = string name = string } } } } sink { MongoDB{ uri = "mongodb://localhost:27017" database = "test" collection = "test" primary-key = ["id"] schema = { fields { id = string name = string } } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/TiDB-CDC.md ================================================ import ChangeLog from '../changelog/connector-cdc-tidb.md'; # TiDB CDC > TiDB CDC source connector ## Support Those Engines > SeaTunnel Zeta
    > Flink
    ## Key features - [ ] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description The TiDB CDC connector allows for reading snapshot data and incremental data from TiDB database. This document describes how to set up the TiDB CDC connector to snapshot data and capture streaming event in TiDB database. ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------------|------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------|----------------------------------|----------------------------------------------------------------------| | MySQL |
  • [MySQL](https://dev.mysql.com/doc): 5.5, 5.6, 5.7, 8.0.x
  • [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x
  • | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java/8.0.28 | | tikv-client-java | 3.2.0 | - | - | https://mvnrepository.com/artifact/org.tikv/tikv-client-java/3.2.0 | ## Using Dependency ### Install Jdbc Driver #### For Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) and the [tikv-client-java jar package](https://mvnrepository.com/artifact/org.tikv/tikv-client-java/3.2.0) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. #### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) and the [tikv-client-java jar package](https://mvnrepository.com/artifact/org.tikv/tikv-client-java/3.2.0) has been placed in directory `${SEATUNNEL_HOME}/lib/`. Please download and put Mysql driver and tikv-java-client in `${SEATUNNEL_HOME}/lib/` dir. For example: cp mysql-connector-java-xxx.jar `$SEATUNNEL_HOME/lib/` ## Data Type Mapping | Mysql Data Type | SeaTunnel Data Type | |------------------------------------------------------------------------------------------------|---------------------| | BIT(1)
    TINYINT(1) | BOOLEAN | | TINYINT | TINYINT | | TINYINT UNSIGNED
    SMALLINT | SMALLINT | | SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | | INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | BIGINT | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(p, s)
    DECIMAL(p, s) UNSIGNED
    NUMERIC(p, s)
    NUMERIC(p, s) UNSIGNED | DECIMAL(p,s) | | FLOAT
    FLOAT UNSIGNED | FLOAT | | DOUBLE
    DOUBLE UNSIGNED
    REAL
    REAL UNSIGNED | DOUBLE | | CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    ENUM
    JSON
    ENUM | STRING | | DATE | DATE | | TIME(s) | TIME(s) | | DATETIME
    TIMESTAMP(s) | TIMESTAMP(s) | | BINARY
    VARBINARY
    BIT(p)
    TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    GEOMETRY | BYTES | ## Source Options | Name | Type | Required | Default | Description | |-------------------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: `jdbc:mysql://tidb0:4000/inventory`. | | username | String | Yes | - | Name of the database to use when connecting to the database server. | | password | String | Yes | - | Password to use when connecting to the database server. | | pd-addresses | String | Yes | - | TiKV cluster's PD address | | database-name | String | Yes | - | Database name of the database to monitor. | | table-name | String | Yes | - | Table name of the database to monitor. The table name needs to include the database name. | | startup.mode | Enum | No | INITIAL | Optional startup mode for TiDB CDC consumer, valid enumerations are `initial`, `earliest`, `latest` and `specific`.
    `initial`: Synchronize historical data at startup, and then synchronize incremental data.
    `earliest`: Startup from the earliest offset possible.
    `latest`: Startup from the latest offset.
    `specific`: Startup from user-supplied specific offsets. | | batch-size-per-scan | Int | No | 1000 | Size per scan. | | tikv.grpc.timeout_in_ms | Long | No | - | TiKV GRPC timeout in ms. | | tikv.grpc.scan_timeout_in_ms | Long | No | - | TiKV GRPC scan timeout in ms. | | tikv.batch_get_concurrency | Integer | No | - | TiKV GRPC batch get concurrency | | tikv.batch_scan_concurrency | Integer | No | - | TiKV GRPC batch scan concurrency | ## Task Example ### Simple ``` env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** TiDB-CDC { plugin_output = "products_tidb_cdc" url = "jdbc:mysql://tidb0:4000/inventory" driver = "com.mysql.cj.jdbc.Driver" tikv.grpc.timeout_in_ms = 20000 pd-addresses = "pd0:2379" username = "root" password = "" database-name = "inventory" table-name = "products" } } transform { } sink { jdbc { plugin_input = "products_tidb_cdc" url = "jdbc:mysql://tidb0:4000/inventory" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "" database = "inventory" table = "products_sink" generate_sink_sql = true primary_keys = ["id"] } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Typesense.md ================================================ import ChangeLog from '../changelog/connector-typesense.md'; # Typesense > Typesense Source Connector ## Description Reads data from Typesense. 
## Key Features - [x] [Batch Processing](../../introduction/concepts/connector-v2-features.md) - [ ] [Stream Processing](../../introduction/concepts/connector-v2-features.md) - [ ] [Exactly-Once](../../introduction/concepts/connector-v2-features.md) - [x] [Schema](../../introduction/concepts/connector-v2-features.md) - [x] [Parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [User-Defined Splits Support](../../introduction/concepts/connector-v2-features.md) ## Options | Name | Type | Required | Default | |------------|--------|----------|---------| | hosts | array | yes | - | | collection | string | yes | - | | schema | config | yes | - | | api_key | string | no | - | | query | string | no | - | | batch_size | int | no | 100 | ### hosts [array] The access address of Typesense, for example: `["typesense-01:8108"]`. ### collection [string] The name of the collection to write to, for example: `"seatunnel"`. ### schema [config] The columns to be read from Typesense. For more information, please refer to the [guide](../../introduction/concepts/schema-feature.md#how-to-declare-type-supported). ### api_key [config] The `api_key` for Typesense security authentication. ### batch_size The number of records to query per batch when reading data. ### Common Options For common parameters of Source plugins, please refer to [Source Common Options](../common-options/source-common-options.md). 
## Example ```bash source { Typesense { hosts = ["localhost:8108"] collection = "companies" api_key = "xyz" query = "q=*&filter_by=num_employees:>9000" schema = { fields { company_name_list = array company_name = string num_employees = long country = string id = string c_row = { c_int = int c_string = string c_array_int = array } } } } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Vertica.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Vertica > JDBC Vertica Source Connector ## Description Read external data source data through JDBC. ## Support Those Engines > Spark
    > Flink
    > SeaTunnel Zeta
    ## Using Dependency ### For Spark/Flink Engine > 1. You need to ensure that the [jdbc driver jar package](https://www.vertica.com/download/vertica/client-drivers/) has been placed in directory `${SEATUNNEL_HOME}/plugins/`. ### For SeaTunnel Zeta Engine > 1. You need to ensure that the [jdbc driver jar package](https://www.vertica.com/download/vertica/client-drivers/) has been placed in directory `${SEATUNNEL_HOME}/lib/`. ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [ ] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [x] [support user-defined split](../../introduction/concepts/connector-v2-features.md) > supports query SQL and can achieve projection effect. ## Supported DataSource Info | Datasource | Supported versions | Driver | Url | Maven | |------------|----------------------------------------------------------|-------------------------|---------------------------------------|----------------------------------------------------------------------| | Vertica | Different dependency version has different driver class. | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433/vertica | [Download](https://www.vertica.com/download/vertica/client-drivers/) | ## Data Type Mapping | Vertical Data Type | SeaTunnel Data Type | |-----------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| | BIT | BOOLEAN | | TINYINT
    TINYINT UNSIGNED
    SMALLINT
    SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | | INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | LONG | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | | DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | | DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
    (Gets the designated column's number of digits to right of the decimal point.))) | | FLOAT
    FLOAT UNSIGNED | FLOAT | | DOUBLE
    DOUBLE UNSIGNED | DOUBLE | | CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    JSON | STRING | | DATE | DATE | | TIME | TIME | | DATETIME
    TIMESTAMP | TIMESTAMP | | TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    BINARY
    VARBINARY
    BIT(n) | BYTES | | GEOMETRY
    UNKNOWN | Not supported yet | ## Source Options | Name | Type | Required | Default | Description | |------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:vertica://localhost:5433/vertica | | driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
    if you use Vertica the value is `com.vertica.jdbc.Driver`. | | username | String | No | - | Connection instance user name | | password | String | No | - | Connection instance password | | query | String | Yes | - | Query statement | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | | partition_column | String | No | - | The column name used to partition data for parallel reading. Only a numeric primary key column is supported, and only one column can be configured. | | partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | | partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | | partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | | fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
    the row fetch size used in the query to improve performance by
    reducing the number of database hits required to satisfy the selection criteria.
    Zero means use jdbc default value. | | properties | Map | No | - | Additional connection configuration parameters,when properties and URL have the same parameters, the priority is determined by the
    specific implementation of the driver. For example, in MySQL, properties take precedence over the URL. | | common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](../common-options/source-common-options.md) for details | ### Tips > If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. ## Task Example ### Simple > This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. ``` # Defining the runtime environment env { parallelism = 2 job.mode = "BATCH" } source{ Jdbc { url = "jdbc:vertica://localhost:5433/vertica" driver = "com.vertica.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" query = "select * from type_bin limit 16" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2/sql } sink { Console {} } ``` ### Parallel > Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table ``` source { Jdbc { url = "jdbc:vertica://localhost:5433/vertica" driver = "com.vertica.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" # Define query logic as required query = "select * from type_bin" # Parallel sharding reads fields partition_column = "id" # Number of fragments partition_num = 10 } } ``` ### Parallel Boundary > It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured ``` source { Jdbc { url = "jdbc:vertica://localhost:5433/vertica" driver = 
"com.vertica.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" # Define query logic as required query = "select * from type_bin" partition_column = "id" # Read start boundary partition_lower_bound = 1 # Read end boundary partition_upper_bound = 500 partition_num = 10 } } ``` ## Changelog ================================================ FILE: docs/en/connectors/source/Web3j.md ================================================ import ChangeLog from '../changelog/connector-web3j.md'; # Web3j > Web3j source connector ## Support Those Engines > Spark
    > Flink
    > Seatunnel Zeta
    ## Key Features - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## Description Source connector for web3j. It is used to read data from the blockchain, such as block information, transactions, smart contract events, etc. Currently, it supports reading block height data. ## Source Options | Name | Type | Required | Default | Description | |------|--------|----------|---------|---------------------------------------------------------------------------------------------------------| | url | String | Yes | - | When using Infura as the service provider, the URL is used for communication with the Ethereum network. | ## How to Create a Http Data Synchronization Jobs ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Web3j { url = "https://mainnet.infura.io/v3/xxxxx" } } # Console printing of the read Http data sink { Console { parallelism = 1 } } ``` Then you will get the following data: ```json {"blockNumber":19525949,"timestamp":"2024-03-27T13:28:45.605Z"} ``` ## Changelog ================================================ FILE: docs/en/developer/coding-guide.md ================================================ # Coding Guide This guide documents an overview of the current Apache SeaTunnel modules and best practices on how to submit a high quality pull request to Apache SeaTunnel. 
## Modules Overview | Module Name | Introduction | |----------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------| | seatunnel-api | SeaTunnel connector V2 API module | | seatunnel-common | SeaTunnel common module | | seatunnel-connectors-v2 | SeaTunnel connector V2 module, currently connector V2 is under development and the community will focus on it | | seatunnel-core/seatunnel-spark-starter | SeaTunnel core starter module of connector V2 on Spark engine | | seatunnel-core/seatunnel-flink-starter | SeaTunnel core starter module of connector V2 on Flink engine | | seatunnel-core/seatunnel-starter | SeaTunnel core starter module of connector V2 on SeaTunnel engine | | seatunnel-e2e | SeaTunnel end-to-end test module | | seatunnel-examples | SeaTunnel local examples module, developer can use it to do unit test and integration test | | seatunnel-engine | SeaTunnel engine module, seatunnel-engine is a new computational engine developed by the SeaTunnel Community that focuses on data synchronization. | | seatunnel-formats | SeaTunnel formats module, used to offer the ability of formatting data | | seatunnel-plugin-discovery | SeaTunnel plugin discovery module, used to offer the ability of loading SPI plugins from classpath | | seatunnel-transforms-v2 | SeaTunnel transform V2 module, currently transform V2 is under development and the community will focus on it | | seatunnel-translation | SeaTunnel translation module, used to adapt Connector V2 and other computing engines such as Spark, Flink etc... | ## How To Submit A High Quality Pull Request 1. Create entity classes using annotations in the `lombok` plugin (`@Data` `@Getter` `@Setter` `@NonNull` etc...) to reduce the amount of code. It's a good practice to prioritize the use of lombok plugins in your coding process. 2. 
If you need to use log4j to print logs in a class, preferably use the annotation `@Slf4j` in the `lombok` plugin. 3. SeaTunnel uses issue to track logical issues, including bugs and improvements, and uses Github's pull requests to manage the review and merge of specific code changes. So making a clear issue or pull request helps the community better understand the developer's intent. The best practice of creating issue or pull request is as the following shown: > [purpose] [module name] [sub-module name] Description 1. Pull request purpose includes: `Hotfix`, `Feature`, `Improve`, `Docs`, `WIP`. Note that if your pull request's purpose is `WIP`, then you need to use github's draft pull request 2. Issue purpose includes: `Feature`, `Bug`, `Docs`, `Discuss` 3. Module name: the current pull request or issue involves the name of the module, for example: `Core`, `Connector-V2`, `Connector-V1`, etc. 4. Sub-module name: the current pull request or issue involves the name of the sub-module, for example:`File` `Redis` `Hbase` etc. 5. Description: provide a brief, clear summary of the current pull request and issue's main goals and aim for a title that conveys the core purpose at a glance. Tips:**For more details, you can refer to [Issue Guide](https://seatunnel.apache.org/community/contribution_guide/contribute#issue) and [Pull Request Guide](https://seatunnel.apache.org/community/contribution_guide/contribute#pull-request)** 4. Code segments are never repeated. If a code segment is used multiple times, define it multiple times is not a good option, make it a public segment for other modules to use is a best practice. 5. When throwing an exception, throw it along with a hint message and the exception should be smaller in scope. Throwing overly broad exceptions promotes complex error handling code that is more likely to contain security vulnerabilities. 
For example, if your connector encounters an `IOException` while reading data, a reasonable approach would be the following: ```java try { // read logic } catch (IOException e) { throw new SeaTunnelORCFormatException("This orc file is corrupted, please check it", e); } ``` 6. The Apache project has very strict licensing requirements, so every file in an Apache project should contain a license statement. Check that each new file you add contains the `Apache License Header` before submitting pull request: ```java /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ ``` 7. Apache SeaTunnel uses `Spotless` for code style and formatting checks. You could run the following command and `Spotless` will automatically fix the code style and formatting errors for you: ```shell ./mvnw spotless:apply ``` 8. Before you submit your pull request, make sure the project will compile properly after adding your code, you can use the following commands to package the whole project: ```shell # multi threads compile ./mvnw -T 1C clean package ``` ```shell # single thread compile ./mvnw clean package ``` 9. 
Before submitting a pull request, running full unit tests and integration tests locally helps verify the functionality of your code; the best practice is to use the self-test ability of the `seatunnel-examples` module to ensure that every supported engine runs properly and the results are correct. 10. If you submit a pull request with a feature that requires updated documentation, always remember to update the documentation. 11. A pull request for a connector should include e2e tests to ensure the robustness of the code. The e2e tests should cover the full set of data types, initialize as few docker images as possible, and put the test cases of sink and source together to reduce resource consumption, while using asynchronous features to ensure the stability of the test. A good example can be found at: [MongodbIT.java](https://github.com/apache/seatunnel/blob/dev/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/java/org/apache/seatunnel/e2e/connector/v2/mongodb/MongodbIT.java) 12. The priority of property permission in the class is set to `private`, and mutability is set to `final`, which can be changed reasonably if special circumstances are encountered. 13. Properties in the class and method parameters should prefer the base type (int boolean double float...) over the wrapper type (Integer Boolean Double Float...); this can be changed reasonably if special circumstances are encountered. 14. When developing a sink connector you need to be aware that the sink will be serialized, and if some properties cannot be serialized, encapsulate the properties into classes and use the singleton pattern. 15. If there are multiple `if` process judgments in the code flow, try to simplify the flow to multiple ifs instead of if-else-if. 16. 
Pull request has the characteristic of single responsibility, not allowed to include irrelevant code of the feature in pull request, once this situation deal with their own branch before submitting pull request, otherwise the Apache SeaTunnel community will actively close pull request. 17. Contributors should be responsible for their own pull request. If your pull request contains new features or modifies old features, add test cases or e2e tests to prove the reasonableness and functional integrity of your pull request is a good practice. 18. If you think which part of the community's current code is unreasonable (especially the core `core` module and the `api` module), the function needs to be updated or modified, the first thing to do is to propose a `discuss issue` or `email` with the community to discuss the need to modify this part of the function, if the community agrees to submit pull request again, do not submit the issue and pull request directly without discussion, so the community will directly consider this pull request is useless, and will be closed down. ================================================ FILE: docs/en/developer/contribute-plugin.md ================================================ # Contribute Connector-V2 Plugins If you want to contribute Connector-V2, please click the Connector-V2 Contribution Guide below for reference. It can help you enter development more quickly. [Connector-v2 Contribution Guide](https://github.com/apache/seatunnel/blob/dev/seatunnel-connectors-v2/README.md) ================================================ FILE: docs/en/developer/contribute-transform-v2-guide.md ================================================ # Contribute Transform-V2 Plugins If you want to contribute Transform-V2, please click the Transform-V2 Contribution Guide below for reference. It can help you enter development more quickly. 
[Transform-v2 Contribution Guide](https://github.com/apache/seatunnel/blob/dev/seatunnel-transforms-v2/README.md) ================================================ FILE: docs/en/developer/docs-format-specification.md ================================================ # Docs Format Specification ## Admonitions We have special admonitions syntax by wrapping text with a set of 3 colons, followed by a label denoting its type. When you want to emphasize the content, it is recommended to use admonitions. In use, the following specifications need to be followed: - Tip: mainly used for operational tips and tricks. - Note: used for more details and explanations. - Caution: used for warnings and precautions. You may also specify an optional title. Here are the examples of admonitions syntax: ```Markdown :::tip Tip Some content with tips ::: :::info Note Some content with explanations ::: :::caution Warning Some content with precautions and warnings ::: ``` ================================================ FILE: docs/en/developer/how-to-create-your-connector.md ================================================ # Develop Your Own Connector If you want to develop your own connector for the new SeaTunnel connector API (Connector V2), please check [here](https://github.com/apache/seatunnel/blob/dev/seatunnel-connectors-v2/README.md). 
## Architecture Reference For detailed information on SeaTunnel's API design and engine architecture, see: - [Architecture Overview](../architecture/overview.md) - Overall architecture and design principles - [Source Architecture](../architecture/api-design/source-architecture.md) - Deep dive into Source API design - [Sink Architecture](../architecture/api-design/sink-architecture.md) - Deep dive into Sink API design - [Translation Layer](../architecture/api-design/translation-layer.md) - How connectors work across different engines - [Checkpoint Mechanism](../architecture/fault-tolerance/checkpoint-mechanism.md) - Fault tolerance and state management These documents will help you understand the underlying architecture and design patterns used in SeaTunnel connectors. ================================================ FILE: docs/en/developer/new-license.md ================================================ # How To Add New License ### ASF 3RD PARTY LICENSE POLICY You have to pay attention to the following open-source software protocols which Apache projects support when you intend to add a new feature to the SeaTunnel (or other Apache projects), which functions refers to other open-source software references. [ASF 3RD PARTY LICENSE POLICY](https://apache.org/legal/resolved.html) If the 3rd party software is not present at the above policy, we wouldn't accept your code. ### How to Legally Use 3rd Party Open-source Software In The SeaTunnel Moreover, when we intend to refer a new software ( not limited to 3rd party jar, text, CSS, js, pics, icons, audios etc and modifications based on 3rd party files) to our project, we need to use them legally in addition to the permission of ASF. 
Refer to the following article: * [COMMUNITY-LED DEVELOPMENT "THE APACHE WAY"](https://apache.org/dev/licensing-howto.html) For example, we should contain the NOTICE file (most of open-source project has NOTICE file, generally under root directory) of ZooKeeper in our project when we are using ZooKeeper. As the Apache explains, "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work. We are not going to dive into every 3rd party open-source license policy in here, you may look up them if interested. ### SeaTunnel-License Check Rules In general, we would have our License-check scripts to our project. SeaTunnel-License-Check is provided by [SkyWalking](https://github.com/apache/skywalking) which differ a bit from other open-source projects. All in all, we are trying to make sure avoiding the license issues at the first time. We need to follow the following steps when we need to add new jars or external resources: * Add the name and the version of the jar file in the known-dependencies.txt * Add relevant maven repository address under 'seatunnel-dist/release-docs/LICENSE' directory * Append relevant NOTICE files under 'seatunnel-dist/release-docs/NOTICE' directory and make sure they are no different to the original repository * Add relevant source code protocols under 'seatunnel-dist/release-docs/licenses' directory and the file name should be named as license+filename.txt. e.g.: license-zk.txt * check dependency license fail ``` --- /dev/fd/63 2020-12-03 03:08:57.191579482 +0000 +++ /dev/fd/62 2020-12-03 03:08:57.191579482 +0000 @@ -1,0 +2 @@ +HikariCP-java6-2.3.13.jar @@ -16,0 +18 @@ +c3p0-0.9.5.2.jar @@ -149,0 +152 @@ +mchange-commons-java-0.2.11.jar - commons-lang-2.1.3.jar Error: Process completed with exit code 1. 
``` Generally speaking, the work of adding a jar is often not so easy to end, because it often depends on various other jars, and we also need to add corresponding licenses for these jars. In this case, we will get the error message of check dependency license fail in check. As above, we are missing the license declaration of `HikariCP-java6-2.3.13`, `c3p0`, etc. (`+` means new, `-` means need to delete ), follow the steps to add jar to add ### References * [COMMUNITY-LED DEVELOPMENT "THE APACHE WAY"](https://apache.org/dev/licensing-howto.html) * [ASF 3RD PARTY LICENSE POLICY](https://apache.org/legal/resolved.html) ================================================ FILE: docs/en/developer/setup.md ================================================ # Set Up Develop Environment In this section, we are going to show you how to set up your development environment for SeaTunnel, and then run a simple example in your JetBrains IntelliJ IDEA. > You can develop or test SeaTunnel code in any development environment that you like, but here we use > [JetBrains IDEA](https://www.jetbrains.com/idea/) as an example to teach you to step by step. ## Prepare Before we start talking about how to set up the environment, we need to do some preparation work. Make sure you already have installed the following software: * [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) installed. * [Java](https://www.java.com/en/download/) ( JDK8/JDK11 are supported by now) installed and `JAVA_HOME` set. * [Scala](https://www.scala-lang.org/download/2.11.12.html) (only scala 2.11.12 supported by now) installed. * [JetBrains IDEA](https://www.jetbrains.com/idea/) installed. ## Set Up ### Clone the Source Code First of all, you need to clone the SeaTunnel source code from [GitHub](https://github.com/apache/seatunnel). 
```shell git clone git@github.com:apache/seatunnel.git ``` ### Install Subproject Locally After cloning the source code, you should run the `./mvnw` command to install the subproject to the maven local repository. Otherwise, your code could not start in JetBrains IntelliJ IDEA correctly. ```shell ./mvnw clean install -DskipTests ``` ### Building SeaTunnel From Source After you install the maven, you can use the following command to compile and package. ``` mvn clean package -pl seatunnel-dist -am -Dmaven.test.skip=true ``` ### Building Sub Module If you want to build submodules separately, you can use the following command to compile and package. ```ssh # This is an example of building the redis connector separately mvn clean package -pl seatunnel-connectors-v2/connector-redis -am -DskipTests -T 1C ``` ### Install JetBrains IDEA Scala Plugin Now, you can open your JetBrains IntelliJ IDEA and explore the source code. But before building Scala code in IDEA, you should also install JetBrains IntelliJ IDEA's [Scala Plugin](https://plugins.jetbrains.com/plugin/1347-scala). See [Install Plugins For IDEA](https://www.jetbrains.com/help/idea/managing-plugins.html#install-plugins) if you want to. ### Install JetBrains IDEA Lombok Plugin Before running the following example, you should also install JetBrains IntelliJ IDEA's [Lombok plugin](https://plugins.jetbrains.com/plugin/6317-lombok). See [install plugins for IDEA](https://www.jetbrains.com/help/idea/managing-plugins.html#install-plugins) if you want to. ### Code Style Apache SeaTunnel uses `Spotless` for code style and format checks. You can run the following command and `Spotless` will automatically fix the code style and formatting errors for you: ```shell ./mvnw spotless:apply ``` You could copy the `pre-commit hook` file `/tools/spotless_check/pre-commit.sh` to your `.git/hooks/` directory so that every time you commit your code with `git commit`, `Spotless` will automatically fix things for you. 
## Run Simple Example After all the above things are done, you just finish the environment setup and can run an example we provide to you out of box. All examples are in module `seatunnel-examples`, you could pick one you are interested in, [Running Or Debugging It In IDEA](https://www.jetbrains.com/help/idea/run-debug-configuration.html) as you wish. Here we use `seatunnel-examples/seatunnel-engine-examples/src/main/java/org/apache/seatunnel/example/engine/SeaTunnelEngineLocalExample.java` as an example, when you run it successfully you can see the output as below: ```log 2024-08-10 11:45:32,839 INFO org.apache.seatunnel.core.starter.seatunnel.command.ClientExecuteCommand - *********************************************** Job Statistic Information *********************************************** Start Time : 2024-08-10 11:45:30 End Time : 2024-08-10 11:45:32 Total Time(s) : 2 Total Read Count : 5 Total Write Count : 5 Total Failed Count : 0 *********************************************** ``` ## What's More All our examples use simple source and sink to make it less dependent and easy to run. You can change the example configuration in `resources/examples`. You can change your configuration as below, if you want to use PostgreSQL as the source and sink to console. Please note that when using connectors other than FakeSource and Console, you need to modify the dependencies in the `pom.xml` file of the corresponding submodule of seatunnel-example. 
```conf env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = org.postgresql.Driver url = "jdbc:postgresql://host:port/database" username = postgres password = "123456" query = "select * from test" table_path = "database.test" } } sink { Console {} } ``` ================================================ FILE: docs/en/engines/command/connector-check.md ================================================ # Connector Check Command Usage ## Command Entrypoint ```shell bin/seatunnel-connector.sh ``` ## Options ```text Usage: seatunnel-connector.sh [options] Options: -h, --help Show the usage message -l, --list List all supported plugins(sources, sinks, transforms) (default: false) -o, --option-rule Get option rule of the plugin by the plugin identifier(connector name or transform name) -pt, --plugin-type SeaTunnel plugin type, support [source, sink, transform] ``` ## Example ```shell # List all supported connectors(sources and sinks) and transforms bin/seatunnel-connector.sh -l # List all supported sinks bin/seatunnel-connector.sh -l -pt sink # Get option rule of the connector or transform by the name bin/seatunnel-connector.sh -o Paimon # Get option rule of paimon sink bin/seatunnel-connector.sh -o Paimon -pt sink ``` ================================================ FILE: docs/en/engines/command/usage.mdx ================================================ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; # Command Usage ## Command Entrypoint ```bash bin/start-seatunnel-spark-2-connector-v2.sh ``` ```bash bin/start-seatunnel-spark-3-connector-v2.sh ``` ```bash bin/start-seatunnel-flink-13-connector-v2.sh ``` ```bash bin/start-seatunnel-flink-15-connector-v2.sh ``` ## Options ```bash Usage: start-seatunnel-spark-2-connector-v2.sh [options] Options: --check Whether check config (default: false) -c, --config Config file -e, --deploy-mode Spark deploy mode, support [cluster, client] (default: client) -h, --help Show the usage message -m, --master 
Spark master, support [spark://host:port, mesos://host:port, yarn, k8s://https://host:port, local], default local[*] (default: local[*]) -n, --name SeaTunnel job name (default: SeaTunnel) -i, --variable Variable substitution, such as -i city=beijing, or -i date=20190318 (default: []) ``` ```bash Usage: start-seatunnel-spark-3-connector-v2.sh [options] Options: --check Whether check config (default: false) -c, --config Config file -e, --deploy-mode Spark deploy mode, support [cluster, client] (default: client) -h, --help Show the usage message -m, --master Spark master, support [spark://host:port, mesos://host:port, yarn, k8s://https://host:port, local], default local[*] (default: local[*]) -n, --name SeaTunnel job name (default: SeaTunnel) -i, --variable Variable substitution, such as -i city=beijing, or -i date=20190318 (default: []) ``` ```bash Usage: start-seatunnel-flink-13-connector-v2.sh [options] Options: --check Whether check config (default: false) -c, --config Config file -e, --deploy-mode Flink job deploy mode, support [run, run-application] (default: run) -h, --help Show the usage message --master, --target Flink job submitted target master, support [local, remote, yarn-session, yarn-per-job, kubernetes-session, yarn-application, kubernetes-application] -n, --name SeaTunnel job name (default: SeaTunnel) -i, --variable Variable substitution, such as -i city=beijing, or -i date=20190318 (default: []) ``` ```bash Usage: start-seatunnel-flink-15-connector-v2.sh [options] Options: --check Whether check config (default: false) -c, --config Config file -e, --deploy-mode Flink job deploy mode, support [run, run-application] (default: run) -h, --help Show the usage message --master, --target Flink job submitted target master, support [local, remote, yarn-session, yarn-per-job, kubernetes-session, yarn-application, kubernetes-application] -n, --name SeaTunnel job name (default: SeaTunnel) -i, --variable Variable substitution, such as -i city=beijing, or -i 
date=20190318 (default: []) ``` ## Example ```bash bin/start-seatunnel-spark-2-connector-v2.sh --config config/v2.batch.config.template -m local -e client ``` ```bash bin/start-seatunnel-spark-3-connector-v2.sh --config config/v2.batch.config.template -m local -e client ``` ```bash bin/start-seatunnel-flink-13-connector-v2.sh --config config/v2.batch.config.template ``` ```bash bin/start-seatunnel-flink-15-connector-v2.sh --config config/v2.batch.config.template ``` ================================================ FILE: docs/en/engines/event-listener.md ================================================ # Event Listener ## Introduction The SeaTunnel provides a rich event listening feature that allows you to manage the status at which data is synchronized. This functionality is crucial when you need to listen job running status(`org.apache.seatunnel.api.event`). This document will guide you through the usage of these parameters and how to leverage them effectively. ## Support Those Engines > SeaTunnel Zeta
    > Flink
    > Spark
    ## API The event API is defined in the `org.apache.seatunnel.api.event` package. ### Event Data API - `org.apache.seatunnel.api.event.Event` - The interface for event data. - `org.apache.seatunnel.api.event.EventType` - The enum for event type. #### EventType Enumeration Description The `EventType` enumeration defines all possible event types in the system, mainly including: | Event Type | Description | Associated Event Class | |---------------------------------|---------------------------------|---------------------------------| | `JOB_STATUS` | Job status change event | `JobStateEvent` | | `SCHEMA_CHANGE_UPDATE_COLUMNS` | Table structure update event | `AlterTableColumnsEvent` | | `SCHEMA_CHANGE_ADD_COLUMN` | Table column addition event | `AlterTableAddColumnEvent` | | `SCHEMA_CHANGE_DROP_COLUMN` | Table column deletion event | `AlterTableDropColumnEvent` | | `SCHEMA_CHANGE_MODIFY_COLUMN` | Table column modification event | `AlterTableModifyColumnEvent` | | `READER_OPEN` | Reader open event | `ReaderOpenEvent` | | `READER_CLOSE` | Reader close event | `ReaderCloseEvent` | | `WRITER_OPEN` | Writer open event | `WriterOpenEvent` | | `WRITER_CLOSE` | Writer close event | `WriterCloseEvent` | > Note: Different event types correspond to different event data structures. When customizing an event handler, you need to judge the type through `event.getEventType()` to ensure type-safe conversion. ### Event Listener API You can customize event handler, such as sending events to external systems. - `org.apache.seatunnel.api.event.EventHandler` - The interface for event handler, SPI will automatically load subclass from the classpath. ### Event Collect API - `org.apache.seatunnel.api.source.SourceSplitEnumerator` - Attached event listener API to report events from `SourceSplitEnumerator`. 
```java package org.apache.seatunnel.api.source; public interface SourceSplitEnumerator { interface Context { /** * Get the {@link org.apache.seatunnel.api.event.EventListener} of this enumerator. * * @return */ EventListener getEventListener(); } } ``` - `org.apache.seatunnel.api.source.SourceReader` - Attached event listener API to report events from `SourceReader`. ```java package org.apache.seatunnel.api.source; public interface SourceReader { interface Context { /** * Get the {@link org.apache.seatunnel.api.event.EventListener} of this reader. * * @return */ EventListener getEventListener(); } } ``` - `org.apache.seatunnel.api.sink.SinkWriter` - Attached event listener API to report events from `SinkWriter`. ```java package org.apache.seatunnel.api.sink; public interface SinkWriter { interface Context { /** * Get the {@link org.apache.seatunnel.api.event.EventListener} of this writer. * * @return */ EventListener getEventListener(); } } ``` ## Configuration Listener To use the event listening feature, you need to configure engine config. ### Zeta Engine Example config in your config file(seatunnel.yaml): ``` seatunnel: engine: event-report-http: url: "http://example.com:1024/event/report" headers: Content-Type: application/json ``` ### Flink Engine You can define the implementation class of `org.apache.seatunnel.api.event.EventHandler` interface and add to the classpath to automatically load it through SPI. Support flink version: 1.14.0+ Example: `org.apache.seatunnel.api.event.LoggingEventHandler` ### Spark Engine You can define the implementation class of `org.apache.seatunnel.api.event.EventHandler` interface and add to the classpath to automatically load it through SPI. ## Steps to Implement a Custom Event Handler The following takes `JobStateEvent` as an example to illustrate how to implement a custom event handler. You can extend this method to handle other types of events as needed. ### 1. 
Add Dependencies Introduce the necessary dependencies in the project's `pom.xml`: ```xml <dependency> <groupId>org.apache.seatunnel</groupId> <artifactId>seatunnel-api</artifactId> <version>${seatunnel.version}</version> <scope>provided</scope> </dependency> <dependency> <groupId>org.apache.seatunnel</groupId> <artifactId>seatunnel-engine-common</artifactId> <version>${seatunnel.version}</version> <scope>provided</scope> </dependency> ``` > Note: Replace `${seatunnel.version}` with the actual SeaTunnel version used. ### 2. Implement the Event Handler Create a custom class that implements the `org.apache.seatunnel.api.event.EventHandler` interface, override the `handle` method, and implement business logic for the event types to be processed. **Core Logic**: Filter event types through `event.getEventType()` — since the SeaTunnel engine distributes various types of events, you need to explicitly judge the event type to ensure only target events are processed. ```java import lombok.extern.slf4j.Slf4j; import org.apache.seatunnel.api.event.Event; import org.apache.seatunnel.api.event.EventHandler; import org.apache.seatunnel.api.event.EventType; import org.apache.seatunnel.engine.common.job.JobStatus; import org.apache.seatunnel.engine.common.job.JobStateEvent; import org.apache.seatunnel.api.event.schema.AlterTableAddColumnEvent; import org.apache.seatunnel.api.event.source.ReaderOpenEvent; import org.apache.seatunnel.api.event.sink.WriterCloseEvent; /** * Example of a custom multi-type event handler, including processing logic for multiple events */ @Slf4j public class CustomMultiEventHandler implements EventHandler { @Override public void handle(Event event) { // Process differently based on event type EventType eventType = event.getEventType(); switch (eventType) { case JOB_STATUS: handleJobStateEvent((JobStateEvent) event); break; case SCHEMA_CHANGE_ADD_COLUMN: handleAddColumnEvent((AlterTableAddColumnEvent) event); break; case READER_OPEN: handleReaderOpenEvent((ReaderOpenEvent) event); break; case WRITER_CLOSE: handleWriterCloseEvent((WriterCloseEvent) event); break; // Add processing for other event types as needed default: // Ignore unprocessed event types 
log.debug("Ignoring unprocessed event type: {}", eventType); } } /** * Handle job state events */ private void handleJobStateEvent(JobStateEvent jobEvent) { String jobId = jobEvent.getJobId(); String jobName = jobEvent.getJobName(); JobStatus status = jobEvent.getJobStatus(); long eventTime = jobEvent.getCreatedTime(); switch (status) { case FAILED: log.error("Job failed | jobId: {}, jobName: {}, Time: {}", jobId, jobName, eventTime); // Add failure alert logic sendAlert("Job Failure", "jobId: " + jobId); break; case FINISHED: log.info("Job completed | jobId: {}, jobName: {}, Time: {}", jobId, jobName, eventTime); break; // Handle other statuses... default: log.info("Job status changed | jobId: {}, Status: {}, Time: {}", jobId, status, eventTime); } } /** * Handle table column addition events */ private void handleAddColumnEvent(AlterTableAddColumnEvent event) { log.info("Column added to table | Table Name: {}, Added Columns: {}, Time: {}", event.getTableName(), event.getAddedColumns(), event.getEventTime()); // Handle table structure change logic } /** * Handle reader open events */ private void handleReaderOpenEvent(ReaderOpenEvent event) { log.info("Reader opened | Plugin ID: {}, Parallelism: {}, Time: {}", event.getPluginId(), event.getParallelism(), event.getEventTime()); // Handle reader initialization logic } /** * Handle writer close events */ private void handleWriterCloseEvent(WriterCloseEvent event) { log.info("Writer closed | Plugin ID: {}, Processed Record Count: {}, Time: {}", event.getPluginId(), event.getRecordCount(), event.getEventTime()); // Handle writer resource cleanup logic } /** * Send alert notifications */ private void sendAlert(String title, String content) { // Implement alert logic (e.g., calling HTTP APIs, sending emails, etc.) log.info("[Alert] {}: {}", title, content); } } ``` ### 3. 
Configure SPI Loading To enable the engine to automatically discover and load the custom handler, add an SPI configuration file in the project's resource directory: 1. Create the directory: `src/main/resources/META-INF/services/` 2. Create a new file: `org.apache.seatunnel.api.event.EventHandler` 3. Add the fully qualified class name of the custom handler to the file: ``` com.example.CustomMultiEventHandler ``` ### 4. Deployment and Verification - Place the JAR package containing the custom handler into the SeaTunnel engine's classpath (e.g., the `lib/` directory) - After starting the task, when the corresponding event occurs, the handler will be triggered automatically and execute the corresponding processing logic - Verify whether the handler works properly through log output ### Notes - The handler logic should be as lightweight as possible to avoid blocking the event processing thread - If network calls are required (e.g., sending alerts), it is recommended to implement them in an asynchronous manner to prevent timeouts from affecting the task itself - Different engines may have different levels of support for events; for example, `JobStateEvent` currently only supports the Zeta engine - Event types and event classes are in a one-to-one correspondence; ensure type matching during conversion to avoid `ClassCastException` - You can implement multiple event handlers to process different types of events respectively, or handle multiple event types in a single handler Through the above steps, you can flexibly monitor and process various events in SeaTunnel, and implement custom business logic such as status monitoring, alert notifications, and data statistics. ================================================ FILE: docs/en/engines/flink.md ================================================ # Seatunnel Runs On Flink Flink is a powerful high-performance distributed stream processing engine. 
More information about it you can search for `Apache Flink` ### Set Flink Configuration Information In The Job Begin with `flink.` Example: I set a precise Checkpoint for this job ``` env { parallelism = 1 flink.execution.checkpointing.unaligned.enabled=true } ``` Enumeration types are not currently supported; you need to specify them in the Flink conf file. Only these types of settings are supported for the time being:
    Integer/Boolean/String/Duration ### How To Set Up A Simple Flink Job This is a simple job that runs on Flink. Randomly generated data is printed to the console ``` env { # common parameter parallelism = 1 checkpoint.interval = 5000 # flink special parameter flink.execution.checkpointing.mode = "EXACTLY_ONCE" flink.execution.checkpointing.timeout = 600000 } source { FakeSource { row.num = 16 plugin_output = "fake_table" schema = { fields { c_map = "map<string, string>" c_array = "array<int>" c_string = string c_boolean = boolean c_int = int c_bigint = bigint c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(33, 18)" c_timestamp = timestamp c_row = { c_map = "map<string, string>" c_array = "array<int>" c_string = string c_boolean = boolean c_int = int c_bigint = bigint c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(33, 18)" c_timestamp = timestamp } } } } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2/sql } sink{ Console{} } ``` ### How To Run A Job In A Project After you pull the code to the local, go to the `seatunnel-examples/seatunnel-flink-connector-v2-example` module and find `org.apache.seatunnel.example.flink.v2.SeaTunnelApiExample` to complete the operation of the job. ================================================ FILE: docs/en/engines/overview.md ================================================ --- sidebar_position: 1 --- # Engine Overview SeaTunnel supports multiple execution engines, allowing you to choose the best one for your use case. This document provides a comprehensive comparison to help you make the right choice. 
## Supported Engines | Engine | Description | Recommended For | |--------|-------------|-----------------| | **SeaTunnel Engine (Zeta)** | Native engine built specifically for data integration | New projects, data synchronization | | **Apache Flink** | Distributed stream processing engine | Existing Flink infrastructure | | **Apache Spark** | Distributed batch/stream processing engine | Existing Spark infrastructure | ## Quick Comparison ### Feature Comparison | Feature | SeaTunnel Engine | Flink | Spark | |---------|------------------|-------|-------| | **Batch Processing** | ✅ | ✅ | ✅ | | **Stream Processing** | ✅ | ✅ | ✅ | | **CDC Support** | ✅ | ✅ | ❌ | | **Exactly-Once** | ✅ | ✅ | ✅ | | **Multi-Table Sync** | ✅ | ✅ | ✅ | | **Schema Evolution** | ✅ | ✅ | ❌ | | **REST API** | ✅ | ✅ | ❌ | | **Web UI** | ✅ | ✅ | ✅ | | **Standalone Mode** | ✅ | ✅ | ✅ | | **Cluster Mode** | ✅ | ✅ | ✅ | ### Performance Comparison | Metric | SeaTunnel Engine | Flink | Spark | |--------|------------------|-------|-------| | **Throughput** | ⭐⭐⭐ High | ⭐⭐ Medium | ⭐⭐ Medium | | **Latency** | ⭐⭐⭐ Low | ⭐⭐⭐ Low | ⭐⭐ Medium | | **Resource Usage** | ⭐⭐⭐ Low | ⭐⭐ Medium | ⭐ High | | **Startup Time** | ⭐⭐⭐ Fast | ⭐⭐ Medium | ⭐ Slow | ### Ease of Use | Aspect | SeaTunnel Engine | Flink | Spark | |--------|------------------|-------|-------| | **Installation** | ⭐⭐⭐ Simple | ⭐⭐ Medium | ⭐⭐ Medium | | **Configuration** | ⭐⭐⭐ Simple | ⭐⭐ Medium | ⭐⭐ Medium | | **Dependencies** | ⭐⭐⭐ None | ⭐⭐ Zookeeper (optional) | ⭐ YARN/Mesos | | **Learning Curve** | ⭐⭐⭐ Easy | ⭐⭐ Medium | ⭐⭐ Medium | ## When to Use Each Engine ### SeaTunnel Engine (Zeta) - Recommended **Best for:** - New data integration projects - Data synchronization and CDC scenarios - Users without existing big data infrastructure - Scenarios requiring low resource consumption - Real-time synchronization of many small tables **Advantages:** - No external dependencies (no Zookeeper, HDFS required) - Optimized for data synchronization 
scenarios - Dynamic thread sharing for efficient resource usage - Pipeline-level fault tolerance - Built-in cluster management and HA - JDBC connection multiplexing **Example use cases:** - MySQL to ClickHouse real-time sync - Multi-table CDC synchronization - Database migration projects ### Apache Flink **Best for:** - Organizations with existing Flink infrastructure - Complex stream processing requirements - Scenarios requiring Flink ecosystem integration **Advantages:** - Mature stream processing capabilities - Rich ecosystem and community - Advanced state management - Integration with Flink SQL **Example use cases:** - Integration with existing Flink pipelines - Complex event processing - Scenarios requiring Flink-specific features ### Apache Spark **Best for:** - Organizations with existing Spark infrastructure - Large-scale batch processing - Integration with Spark ecosystem (MLlib, GraphX) **Advantages:** - Mature batch processing capabilities - Rich ecosystem - Integration with Hive, HDFS - Support for YARN, Kubernetes **Example use cases:** - Large-scale ETL jobs - Integration with existing Spark workflows - Batch data warehouse loading ## Decision Flowchart ``` Start │ ▼ Do you have existing Flink/Spark infrastructure? │ ├─ Yes ──► Do you want to reuse it? 
│ │ │ ├─ Yes (Flink) ──► Use Flink Engine │ │ │ ├─ Yes (Spark) ──► Use Spark Engine │ │ │ └─ No ──► Use SeaTunnel Engine │ └─ No ──► Use SeaTunnel Engine (Recommended) ``` ## Configuration Examples ### SeaTunnel Engine ```hocon env { parallelism = 2 job.mode = "STREAMING" checkpoint.interval = 10000 } ``` ### Flink Engine ```hocon env { parallelism = 2 job.mode = "STREAMING" checkpoint.interval = 10000 flink.execution.checkpointing.mode = "EXACTLY_ONCE" flink.execution.checkpointing.timeout = 600000 } ``` ### Spark Engine ```hocon env { parallelism = 2 job.mode = "BATCH" spark.app.name = "SeaTunnel-Job" spark.executor.memory = "2g" spark.executor.instances = "2" } ``` ## Connector Compatibility All SeaTunnel V2 connectors are compatible with all three engines. However, some features may have different behaviors: | Connector Feature | SeaTunnel Engine | Flink | Spark | |-------------------|------------------|-------|-------| | CDC Connectors | ✅ Full support | ✅ Full support | ❌ Not supported | | Exactly-once sink | ✅ Full support | ✅ Full support | ✅ Partial support | | Multi-table read | ✅ Full support | ✅ Full support | ✅ Full support | ## Migration Guide ### From Flink to SeaTunnel Engine 1. Remove Flink-specific configurations (prefixed with `flink.`) 2. Keep common configurations (`parallelism`, `checkpoint.interval`) 3. Test with SeaTunnel Engine ### From Spark to SeaTunnel Engine 1. Remove Spark-specific configurations (prefixed with `spark.`) 2. Keep common configurations (`parallelism`, `job.mode`) 3. 
Test with SeaTunnel Engine ## Summary | Scenario | Recommended Engine | |----------|-------------------| | New project without big data infrastructure | **SeaTunnel Engine** | | CDC and real-time synchronization | **SeaTunnel Engine** | | Existing Flink infrastructure | **Flink** | | Existing Spark infrastructure | **Spark** | | Low resource environment | **SeaTunnel Engine** | | Complex stream processing | **Flink** | | Large-scale batch ETL | **Spark** | ## Next Steps - [SeaTunnel Engine Quick Start](zeta/about.md) - [Flink Engine Guide](flink.md) - [Spark Engine Guide](spark.md) ================================================ FILE: docs/en/engines/spark.md ================================================ # SeaTunnel Runs On Spark Spark is a powerful high-performance distributed calculate processing engine. More information about it you can search for `Apache Spark` ### Set Spark Configuration Information In The Job Example: I set some spark conf for this job ``` env { spark.app.name = "example" spark.sql.catalogImplementation = "hive" spark.executor.memory= "2g" spark.executor.instances = "2" spark.yarn.priority = "100" hive.exec.dynamic.partition.mode = "nonstrict" spark.dynamicAllocation.enabled="false" } ``` ### Command Line Example #### Spark on Yarn Cluster ``` ./bin/start-seatunnel-spark-3-connector-v2.sh --master yarn --deploy-mode cluster --config config/example.conf ``` #### Spark on Yarn Client ``` ./bin/start-seatunnel-spark-3-connector-v2.sh --master yarn --deploy-mode client --config config/example.conf ``` ### How To Set Up A Simple Spark Job This is a simple job that runs on Spark. 
Randomly generated data is printed to the console ``` env { # common parameter parallelism = 1 # spark special parameter spark.app.name = "example" spark.sql.catalogImplementation = "hive" spark.executor.memory= "2g" spark.executor.instances = "1" spark.yarn.priority = "100" hive.exec.dynamic.partition.mode = "nonstrict" spark.dynamicAllocation.enabled="false" } source { FakeSource { schema = { fields { c_map = "map>" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp c_row = { c_map = "map>" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform-v2/sql } sink{ Console{} } ``` ### How To Run A Job In A Project After you pull the code to the local, go to the `seatunnel-examples/seatunnel-spark-connector-v2-example` module and find `org.apache.seatunnel.example.spark.v2.SeaTunnelApiExample` to complete the operation of the job. ================================================ FILE: docs/en/engines/zeta/about.md ================================================ --- sidebar_position: 1 --- # SeaTunnel Engine SeaTunnel Engine is a community-developed data synchronization engine designed for data synchronization scenarios debuts. As the default engine of SeaTunnel, it supports high-throughput, low-latency, and strong-consistent synchronous job operation, which is faster, more stable, more resource-saving, and easy to use. 
The overall design of the SeaTunnel Engine follows the path below: - Faster, SeaTunnel Engine’s execution plan optimizer aims to reduce data network transmission, thereby reducing the loss of overall synchronization performance caused by data serialization and de-serialization, allowing users to complete data synchronization operations faster. At the same time, a speed limit is supported to synchronize data at a reasonable speed. - More stable, SeaTunnel Engine uses Pipeline as the minimum granularity of checkpoint and fault tolerance for data synchronization tasks. The failure of a task will only affect its upstream and downstream tasks, which avoids task failures that cause the entire job to fail or rollback. At the same time, SeaTunnel Engine also supports data cache for scenarios where the source data has a storage time limit. When the cache is enabled, the data read from the source will be automatically cached, then read by the downstream task and written to the target. Under this condition, even if the data cannot be written due to the failure of the target, it will not affect the regular reading of the source, preventing the data from the source is deleted when expired. - Space-saving, SeaTunnel Engine uses Dynamic Thread Sharing technology internally. In the real-time synchronization scenario, for the tables with a large amount but small data sizes per table, SeaTunnel Engine will run these synchronization tasks in shared threads to reduce unnecessary thread creation and save system space. On the reading and data writing side, the design goal of SeaTunnel Engine is to minimize the amount of JDBC connections; in CDC scenarios, SeaTunnel Engine will reuse log reading and parsing resources. - Simple and easy to use, SeaTunnel Engine reduces the dependence on third-party services and can implement cluster management, snapshot storage, and cluster HA functions independently of big data components such as Zookeeper and HDFS. 
This is very useful for users who currently lack a big data platform, or are unwilling to rely on a big data platform for data synchronization. In the future, SeaTunnel Engine will further optimize its functions to support full synchronization and incremental synchronization of offline batch synchronization, real-time synchronization, and CDC. ### Cluster Management - Support standalone operation; - Support cluster operation; - Support autonomous cluster (decentralized), which saves the users from specifying a master node for the SeaTunnel Engine cluster, because it can select a master node by itself during operation, and a new master node will be chosen automatically when the master node fails. - Autonomous Cluster nodes-discovery and nodes with the same cluster_name will automatically form a cluster. ### Core functions - Support running jobs in local mode, and the cluster is automatically destroyed after the job once completed; - Support running jobs in cluster mode (single machine or cluster), submitting jobs to the SeaTunnel Engine service through the SeaTunnel client, and the service continues to run after the job is completed and waits for the next job submission; - Support offline batch synchronization; - Support real-time synchronization; - Batch-stream integration, all SeaTunnel V2 connectors can run in SeaTunnel Engine; - Support distributed snapshot algorithm, and supports two-stage submission with SeaTunnel V2 connector, ensuring that data is executed only once. - Support job invocation at the pipeline level to ensure that it can be started even when resources are limited; - Support fault tolerance for jobs at the Pipeline level. Task failure only affects the pipeline where it is located, and only the task under the Pipeline needs to be rolled back; - Support dynamic thread sharing to synchronize a large number of small data sets in real-time. 
### Quick Start https://seatunnel.apache.org/docs/start-v2/locally/quick-start-seatunnel-engine ### Download & Install [Download & Install](download-seatunnel.md) ================================================ FILE: docs/en/engines/zeta/checkpoint-storage.md ================================================ --- sidebar_position: 7 --- # Checkpoint Storage ## Introduction Checkpoint is a fault-tolerant recovery mechanism. This mechanism ensures that when the program is running, it can recover itself even if it suddenly encounters an exception. ### Checkpoint Storage Checkpoint Storage is a storage mechanism for storing checkpoint data. SeaTunnel Engine supports the following checkpoint storage types: - HDFS (OSS,COS,S3,HDFS,LocalFile) - LocalFile (native) (deprecated: use Hdfs(LocalFile) instead). We use the microkernel design pattern to separate the checkpoint storage module from the engine. This allows users to implement their own checkpoint storage modules. `checkpoint-storage-api` is the checkpoint storage module API, which defines the interface of the checkpoint storage module. If you want to implement your own checkpoint storage module, you need to implement the `CheckpointStorage` and provide the corresponding `CheckpointStorageFactory` implementation. ### Checkpoint Storage Configuration The configuration of the `seatunnel-server` module is in the `seatunnel.yaml` file. ```yaml seatunnel: engine: checkpoint: storage: type: hdfs # plugin name of checkpoint storage, we support hdfs(S3, local, hdfs), localfile (native local file) is the default, but this plugin is deprecated # plugin configuration plugin-config: namespace: # checkpoint storage parent path, the default value is /seatunnel/checkpoint/ K1: V1 # plugin other configuration K2: V2 # plugin other configuration ``` Notice: namespace must end with "/". 
#### OSS Aliyun OSS based hdfs-file you can refer [Hadoop OSS Docs](https://hadoop.apache.org/docs/stable/hadoop-aliyun/tools/hadoop-aliyun/index.html) to config oss. Except when interacting with oss buckets, the oss client needs the credentials needed to interact with buckets. The client supports multiple authentication mechanisms and can be configured as to which mechanisms to use, and their order of use. Custom implementations of org.apache.hadoop.fs.aliyun.oss.AliyunCredentialsProvider may also be used. If you used AliyunCredentialsProvider (can be obtained from the Aliyun Access Key Management), these consist of an access key, a secret key. You can config like this: ```yaml seatunnel: engine: checkpoint: interval: 6000 timeout: 7000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # checkpoint storage parent path, the default value is /seatunnel/checkpoint/ storage.type: oss oss.bucket: your-bucket fs.oss.accessKeyId: your-access-key fs.oss.accessKeySecret: your-secret-key fs.oss.endpoint: endpoint address ``` For additional reading on the Hadoop Credential Provider API, you can see: [Credential Provider API](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CredentialProviderAPI.html). For Aliyun OSS Credential Provider implements, you can see: [Auth Credential Providers](https://github.com/aliyun/aliyun-oss-java-sdk/tree/master/src/main/java/com/aliyun/oss/common/auth) #### COS Tencent COS based hdfs-file you can refer [Hadoop COS Docs](https://hadoop.apache.org/docs/stable/hadoop-cos/cloud-storage/) to config COS. Except when interacting with cos buckets, the cos client needs the credentials needed to interact with buckets. The client supports multiple authentication mechanisms and can be configured as to which mechanisms to use, and their order of use. Custom implementations of com.qcloud.cos.auth.COSCredentialsProvider may also be used. 
If you used SimpleCredentialsProvider (can be obtained from the Tencent Cloud API Key Management), these consist of an access key, a secret key. You can config like this: ```yaml seatunnel: engine: checkpoint: interval: 6000 timeout: 7000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # checkpoint storage parent path, the default value is /seatunnel/checkpoint/ storage.type: cos cos.bucket: cosn://your-bucket fs.cosn.credentials.provider: org.apache.hadoop.fs.cosn.auth.SimpleCredentialsProvider fs.cosn.userinfo.secretId: your-secretId fs.cosn.userinfo.secretKey: your-secretKey fs.cosn.bucket.region: your-region ``` For additional reading on the Hadoop Credential Provider API, you can see: [Credential Provider API](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CredentialProviderAPI.html). For additional COS configuration, you can see: [Tencent Hadoop-COS Docs](https://doc.fincloud.tencent.cn/tcloud/Storage/COS/846365/hadoop) Please add the following jar to the lib directory: - [hadoop-cos-3.4.1.jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-cos/3.4.1) - [cos_api-bundle-5.6.69.jar](https://mvnrepository.com/artifact/com.qcloud/cos_api-bundle/5.6.69) - [hadoop-shaded-guava-1.1.1.jar](https://mvnrepository.com/artifact/org.apache.hadoop.thirdparty/hadoop-shaded-guava/1.1.1) #### S3 S3 based hdfs-file you can refer [hadoop s3 docs](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) to config s3. Except when interacting with public S3 buckets, the S3A client needs the credentials needed to interact with buckets. The client supports multiple authentication mechanisms and can be configured as to which mechanisms to use, and their order of use. Custom implementations of com.amazonaws.auth.AWSCredentialsProvider may also be used. If you used SimpleAWSCredentialsProvider (can be obtained from the Amazon Security Token Service), these consist of an access key, a secret key. 
You can config like this: ```yaml seatunnel: engine: checkpoint: interval: 6000 timeout: 7000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # checkpoint storage parent path, the default value is /seatunnel/checkpoint/ storage.type: s3 s3.bucket: your-bucket fs.s3a.access.key: your-access-key fs.s3a.secret.key: your-secret-key fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider ``` If you used `InstanceProfileCredentialsProvider`, which supports use of instance profile credentials if running in an EC2 VM, you can check [iam-roles-for-amazon-ec2](https://docs.aws.amazon.com/zh_cn/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html). You can config like this: ```yaml seatunnel: engine: checkpoint: interval: 6000 timeout: 7000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # checkpoint storage parent path, the default value is /seatunnel/checkpoint/ storage.type: s3 s3.bucket: your-bucket fs.s3a.endpoint: your-endpoint fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.InstanceProfileCredentialsProvider ``` If you want to use Minio that supports the S3 protocol as checkpoint storage, you should configure it this way: ```yaml seatunnel: engine: checkpoint: interval: 10000 timeout: 60000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # checkpoint storage parent path, the default value is /seatunnel/checkpoint/ storage.type: s3 fs.s3a.access.key: xxxxxxxxx # Access Key of MinIO fs.s3a.secret.key: xxxxxxxxxxxxxxxxxxxxx # Secret Key of MinIO fs.s3a.endpoint: http://127.0.0.1:9000 # Minio HTTP service access address s3.bucket: s3a://test # test is the bucket name which storage the checkpoint file fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider # important: The user of this key needs to have write permission for the bucket, otherwise an exception of 403 will be returned ``` For additional reading on the Hadoop Credential Provider API, you can see: 
[Credential Provider API](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CredentialProviderAPI.html). #### HDFS if you use HDFS, you can config like this: ```yaml seatunnel: engine: checkpoint: storage: type: hdfs max-retained: 3 plugin-config: namespace: # checkpoint storage parent path, the default value is /seatunnel/checkpoint/ storage.type: hdfs fs.defaultFS: hdfs://localhost:9000 // if you used kerberos, you can config like this: kerberosPrincipal: your-kerberos-principal kerberosKeytabFilePath: your-kerberos-keytab // if you need hdfs-site config, you can config like this: hdfs_site_path: /path/to/your/hdfs_site_path ``` if HDFS is in HA mode , you can config like this: ```yaml seatunnel: engine: checkpoint: storage: type: hdfs max-retained: 3 plugin-config: namespace: # checkpoint storage parent path, the default value is /seatunnel/checkpoint/ storage.type: hdfs fs.defaultFS: hdfs://usdp-bing seatunnel.hadoop.dfs.nameservices: usdp-bing seatunnel.hadoop.dfs.ha.namenodes.usdp-bing: nn1,nn2 seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn1: usdp-bing-nn1:8020 seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn2: usdp-bing-nn2:8020 seatunnel.hadoop.dfs.client.failover.proxy.provider.usdp-bing: org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider ``` if HDFS has some other configs in `hdfs-site.xml` or `core-site.xml` , just set HDFS config by using `seatunnel.hadoop.` prefix. #### LocalFile ```yaml seatunnel: engine: checkpoint: interval: 6000 timeout: 7000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # checkpoint storage parent path, the default value is /seatunnel/checkpoint/ storage.type: hdfs fs.defaultFS: file:/// # Ensure that the directory has written permission ``` ### Enable cache When storage:type is hdfs, cache is disabled by default. 
If you want to enable it, set `disable.cache: false` ```yaml seatunnel: engine: checkpoint: interval: 6000 timeout: 7000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # checkpoint storage parent path, the default value is /seatunnel/checkpoint/ storage.type: hdfs disable.cache: false fs.defaultFS: hdfs:/// ``` or ```yaml seatunnel: engine: checkpoint: interval: 6000 timeout: 7000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # checkpoint storage parent path, the default value is /seatunnel/checkpoint/ storage.type: hdfs disable.cache: false fs.defaultFS: file:/// ``` ================================================ FILE: docs/en/engines/zeta/deployment.md ================================================ --- sidebar_position: 3 --- # SeaTunnel Engine(Zeta) Deployment SeaTunnel Engine(Zeta) supports three different deployment modes: local mode, hybrid cluster mode, and separated cluster mode. Each deployment mode has different usage scenarios, advantages, and disadvantages. You should choose a deployment mode according to your needs and environment. **Local mode:** Only used for testing, each task will start an independent process, and the process will exit after the task is completed. **Hybrid cluster mode:** The Master service and Worker service of SeaTunnel Engine are mixed in the same process. All nodes can run jobs and participate in the election to become the master, that is, the master node is also running synchronous tasks simultaneously. In this mode, Imap (saving the state information of the task to provide support for the fault tolerance of the task) data will be distributed among all nodes. **Separated cluster mode(experimental feature):** The Master service and Worker service of SeaTunnel Engine are separated, and each service is a single process. The Master node is only responsible for job scheduling, rest api, task submission, etc., and Imap data is only stored in the Master node. 
The Worker node is only responsible for the execution of the task, does not participate in the election to become the master, and does not store Imap data. **Usage suggestion:** Although [Separated Cluster Mode](separated-cluster-deployment.md) is an experimental feature, it will become the recommended deployment mode in the future. In the hybrid cluster mode, the Master node needs to run tasks synchronously. When the task scale is large, it will affect the stability of the Master node. Once the Master node crashes or the heartbeat times out, it will lead to the switch of the Master node, and the switch of the Master node will cause fault tolerance of all running tasks, which will further increase the load of the cluster. Therefore, we recommend using the separated mode more. [Local Mode Deployment](local-mode-deployment.md) [Hybrid Cluster Mode Deployment](hybrid-cluster-deployment.md) [Separated Cluster Mode Deployment](separated-cluster-deployment.md) ================================================ FILE: docs/en/engines/zeta/download-seatunnel.md ================================================ --- sidebar_position: 2 --- import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; # Download And Make Installation Packages ## Step 1: Preparation Before starting to download SeaTunnel, you need to ensure that you have installed the following software required by SeaTunnel: * Install [Java](https://www.java.com/en/download/) (Java 8 or 11, and other versions higher than Java 8 can theoretically work) and set `JAVA_HOME`. ## Step 2: Download SeaTunnel Go to the [Seatunnel Download Page](https://seatunnel.apache.org/download) to download the latest version of the release version installation package `apache-seatunnel-<version>-bin.tar.gz`. Or you can also download it through the terminal. 
```shell export version="3.0.0" wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-${version}-bin.tar.gz" tar -xzvf "apache-seatunnel-${version}-bin.tar.gz" ``` ## Step 3: Download The Connector Plugin Starting from the 2.2.0-beta version, the binary package no longer provides the connector dependency by default. Therefore, when using it for the first time, you need to execute the following command to install the connector: (Of course, you can also manually download the connector from the [Apache Maven Repository](https://repo.maven.apache.org/maven2/org/apache/seatunnel/), and then move it to the `connectors/seatunnel` directory). ```bash sh bin/install-plugin.sh ``` If you need a specific connector version, taking 3.0.0 as an example, you need to execute the following command. ```bash sh bin/install-plugin.sh 3.0.0 ``` Usually you don't need all the connector plugins, so you can specify the plugins you need by configuring `config/plugin_config`. For example, if you only need the `connector-console` plugin, you can modify the `config/plugin_config` file as follows. ```plugin_config --seatunnel-connectors-- connector-console --end-- ``` If you want the example application to work properly, you need to add the following plugins. ```plugin_config --seatunnel-connectors-- connector-fake connector-console --end-- ``` You can find all supported connectors and the corresponding plugin_config configuration names under `${SEATUNNEL_HOME}/connectors/plugins-mapping.properties`. :::tip Tip If you want to install connector plugins by manually downloading connectors, you only need to download the connector plugins you need and place them in the `${SEATUNNEL_HOME}/connectors/` directory ::: Now you have completed the download of the SeaTunnel installation package and the download of the connector plugin. Next, you can choose different running modes according to your needs to run or deploy SeaTunnel.
If you use the SeaTunnel Engine (Zeta) that comes with SeaTunnel to run tasks, you need to deploy the SeaTunnel Engine service first. Refer to [Deployment Of SeaTunnel Engine (Zeta) Service](deployment.md). ================================================ FILE: docs/en/engines/zeta/engine-jar-storage-mode.md ================================================ --- sidebar_position: 9 --- # Config Engine Jar Storage Mode :::caution warn Please note that this feature is currently in an experimental stage, and there are many areas that still need improvement. Therefore, we recommend exercising caution when using this feature to avoid potential issues and unnecessary risks. We are committed to ongoing efforts to enhance and stabilize this functionality, ensuring a better experience for you. ::: You can enable the optimized job submission process, which is configured in `seatunnel.yaml`. After enabling the optimization of the SeaTunnel job submission process configuration item, users can use the SeaTunnel engine(Zeta) as the execution engine without placing the connector jar packages required for task execution or the third-party jar packages that the connector relies on in each engine `connector` directory. Users only need to place all the jar packages for task execution on the client that submits the job, and the client will automatically upload the jars required for task execution to the Zeta engine. It is necessary to enable this configuration item when submitting jobs in Docker or k8s mode, which can fundamentally solve the problem of large container images caused by the heavy weight of the SeaTunnel Zeta engine. In the image, only the core framework package of the Zeta engine needs to be provided, and then the jar package of the connector and the third-party jar package that the connector relies on can be separately uploaded to the pod for distribution.
After enabling the optimization job submission process configuration item, you do not need to place the following two types of jar packages in the Zeta engine: - COMMON_PLUGIN_JARS - CONNECTOR_PLUGIN_JARS COMMON_PLUGIN_JARS refers to the third-party jar packages that the connector relies on, and CONNECTOR_PLUGIN_JARS refers to the connector jar package. When common jars do not exist in Zeta's `lib`, it can upload the local common jars of the client to the `lib` directory of all engine nodes. This way, even if the user does not place a jar on all nodes in Zeta's `lib`, the task can still be executed normally. However, we do not recommend relying on the configuration item of opening the optimization job submission process to upload the third-party jar package that the connector relies on. If you use Zeta Engine, please add the third-party jar package files that the connector relies on to `$SEATUNNEL_HOME/lib/` directory on each node, such as jdbc drivers. # ConnectorJar Storage Strategy You can configure the storage strategy of the current connector jar package and the third-party jar package that the connector depends on through the configuration file. There are two storage strategies that can be configured, namely shared jar package storage strategy and isolated jar package storage strategy. Two different storage strategies provide a more flexible storage mode for jar files. You can configure the storage strategy to share the same jar package file with multiple execution jobs in the engine. ## Related Configuration | Parameter | Default Value | Description | |-------------------------------------|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------| | connector-jar-storage-enable | false | Whether to enable uploading the connector jar package to the engine. The default enabled state is false.
| | connector-jar-storage-mode | SHARED | Engine-side jar package storage mode selection. There are two optional modes, SHARED and ISOLATED. The default Jar package storage mode is SHARED. | | connector-jar-storage-path | " " | User-defined jar package storage path. | | connector-jar-cleanup-task-interval | 3600s | Engine-side jar package cleaning scheduled task execution interval. | | connector-jar-expiry-time | 600s | Engine-side jar package storage expiration time. | ## IsolatedConnectorJarStorageStrategy Before the job is submitted, the connector jar package will be uploaded to an independent file storage path on the Master node. The connector jar packages of different jobs are in different storage paths, so the connector jar packages of different jobs are isolated from each other. The jar package files required for the execution of a job have no influence on other jobs. When the current job execution ends, the jar package file in the storage path generated based on the JobId will be deleted. Example: ```yaml jar-storage: connector-jar-storage-enable: true connector-jar-storage-mode: ISOLATED connector-jar-storage-path: "" connector-jar-cleanup-task-interval: 3600 connector-jar-expiry-time: 600 ``` Detailed explanation of configuration parameters: - connector-jar-storage-enable: Enable uploading the connector jar package before executing the job. - connector-jar-storage-mode: Connector jar package storage mode, two storage modes are available: shared mode (SHARED) and isolation mode (ISOLATED). - connector-jar-storage-path: The local storage path of the user-defined connector jar package on the Zeta engine. - connector-jar-cleanup-task-interval: Zeta engine connector jar package scheduled cleanup task interval, the default is 3600 seconds. - connector-jar-expiry-time: The expiration time of the connector jar package. The default is 600 seconds.
## SharedConnectorJarStorageStrategy Before the job is submitted, the connector jar package will be uploaded to the Master node. Different jobs can share connector jars on the Master node if they use the same Jar package file. All jar package files are persisted to a shared file storage path, and jar packages that reference the Master node can be shared between different jobs. After the task execution is completed, the SharedConnectorJarStorageStrategy will not immediately delete all jar packages related to the current task execution, but instead has an independent thread responsible for the cleanup work. The configuration in the following configuration file sets the running time of the cleaning work and the survival time of the jar package. Example: ```yaml jar-storage: connector-jar-storage-enable: true connector-jar-storage-mode: SHARED connector-jar-storage-path: "" connector-jar-cleanup-task-interval: 3600 connector-jar-expiry-time: 600 ``` Detailed explanation of configuration parameters: - connector-jar-storage-enable: Enable uploading the connector jar package before executing the job. - connector-jar-storage-mode: Connector jar package storage mode, two storage modes are available: shared mode (SHARED) and isolation mode (ISOLATED). - connector-jar-storage-path: The local storage path of the user-defined connector jar package on the Zeta engine. - connector-jar-cleanup-task-interval: Zeta engine connector jar package scheduled cleanup task interval, the default is 3600 seconds. - connector-jar-expiry-time: The expiration time of the connector jar package. The default is 600 seconds.
================================================ FILE: docs/en/engines/zeta/hybrid-cluster-deployment.md ================================================ --- sidebar_position: 5 --- # Deploy SeaTunnel Engine Hybrid Mode Cluster The Master service and Worker service of SeaTunnel Engine are mixed in the same process, and all nodes can run jobs and participate in the election to become master. The master node is also running synchronous tasks simultaneously. In this mode, the Imap (which saves the status information of the task to provide support for the task's fault tolerance) data will be distributed across all nodes. Usage Recommendation: It is recommended to use the [Separated Cluster Mode](separated-cluster-deployment.md). In the hybrid cluster mode, the Master node needs to run tasks synchronously. When the task scale is large, it will affect the stability of the Master node. Once the Master node crashes or the heartbeat times out, it will cause the Master node to switch, and the Master node switch will cause all running tasks to perform fault tolerance, further increasing the load on the cluster. Therefore, we recommend using the [Separated Cluster Mode](separated-cluster-deployment.md). ## 1. Download [Download And Create The SeaTunnel Installation Package](download-seatunnel.md) ## 2. Configure SEATUNNEL_HOME You can configure `SEATUNNEL_HOME` by adding the `/etc/profile.d/seatunnel.sh` file. The content of `/etc/profile.d/seatunnel.sh` is as follows: ``` export SEATUNNEL_HOME=${seatunnel install path} export PATH=$PATH:$SEATUNNEL_HOME/bin ``` ## 3. Configure The JVM Options For The SeaTunnel Engine The SeaTunnel Engine supports two methods for setting JVM options: 1. Add the JVM options to `$SEATUNNEL_HOME/config/jvm_options`. Modify the JVM parameters in the `$SEATUNNEL_HOME/config/jvm_options` file. 2. Add JVM options when starting the SeaTunnel Engine. For example, `seatunnel-cluster.sh -DJvmOption="-Xms2G -Xmx2G"` ## 4. 
Configure The SeaTunnel Engine The SeaTunnel Engine provides many functions that need to be configured in the `seatunnel.yaml` file. ### 4.1 Backup Count Setting For Data In Imap The SeaTunnel Engine implements cluster management based on [Hazelcast IMDG](https://docs.hazelcast.com/imdg/4.1/). The cluster's status data (job running status, resource status) is stored in the [Hazelcast IMap](https://docs.hazelcast.com/imdg/4.1/data-structures/map). The data stored in the Hazelcast IMap is distributed and stored on all nodes in the cluster. Hazelcast partitions the data stored in the Imap. Each partition can specify the number of backups. Therefore, the SeaTunnel Engine can implement cluster HA without using other services (such as Zookeeper). `backup count` is a parameter that defines the number of synchronous backups. For example, if it is set to 1, the backup of the partition will be placed on one other member. If it is set to 2, it will be placed on two other members. We recommend that the value of `backup count` be `max(1, min(5, N/2))`. `N` is the number of cluster nodes. ```yaml seatunnel: engine: backup-count: 1 # Other configurations ``` ### 4.2 Slot Configuration The number of slots determines the number of task groups that the cluster node can run in parallel. The formula for the number of slots required for a task is N = 2 + P (the parallelism configured by the task). By default, the number of slots in the SeaTunnel Engine is dynamic, that is, there is no limit on the number. We recommend that the number of slots be set to twice the number of CPU cores on the node, it's a default value when `dynamic-slot` is set to false and not set `slot-num`. 
Configuration of dynamic slot number (default): ```yaml seatunnel: engine: slot-service: dynamic-slot: true # Other configurations ``` Configuration of static slot number: ```yaml seatunnel: engine: slot-service: dynamic-slot: false slot-num: 20 ``` ### 4.3 Checkpoint Manager Like Flink, the SeaTunnel Engine supports the Chandy–Lamport algorithm. Therefore, it is possible to achieve data synchronization without data loss and duplication. **interval** The interval between two checkpoints, in milliseconds. If the `checkpoint.interval` parameter is configured in the job configuration file's `env`, the one set in the job configuration file will be used. **timeout** The timeout for checkpoints. If the checkpoint cannot be completed within the timeout, a checkpoint failure will be triggered and the job will fail. If the `checkpoint.timeout` parameter is configured in the job configuration file's `env`, the one set in the job configuration file will be used. **min-pause** The minimum pause (in milliseconds) between consecutive checkpoints. This ensures that checkpoints are not triggered too frequently. Example ```yaml seatunnel: engine: backup-count: 1 print-execution-info-interval: 10 slot-service: dynamic-slot: true checkpoint: interval: 300000 timeout: 10000 min-pause: 5000 ``` **checkpoint storage** Checkpoints are a fault-tolerant recovery mechanism. This mechanism ensures that the program can recover on its own even if an exception occurs suddenly during operation. Checkpoints are triggered at regular intervals. Each time a checkpoint is performed, each task is required to report its own status information (such as which offset was read when reading from Kafka) to the checkpoint thread, which writes it to a distributed storage (or shared storage). 
When a task fails and is automatically fault-tolerant and restored, or when a previously suspended task is restored using the seatunnel.sh -r command, the status information of the corresponding job will be loaded from the checkpoint storage and the job will be restored based on this status information. If the cluster has more than one node, the checkpoint storage must be a distributed storage or shared storage so that the task status information in the storage can be loaded on another node in case of a node failure. For information about checkpoint storage, you can refer to [Checkpoint Storage](checkpoint-storage.md) ### 4.4 Expiration Configuration For Historical Jobs The information of each completed job, such as status, counters, and error logs, is stored in the IMap object. As the number of running jobs increases, the memory usage will increase, and eventually, the memory will overflow. Therefore, you can adjust the `history-job-expire-minutes` parameter to address this issue. The time unit for this parameter is minutes. The default value is 1440 minutes, which is one day. Example ```yaml seatunnel: engine: history-job-expire-minutes: 1440 ``` ### 4.5 Class Loader Cache Mode This configuration primarily addresses the issue of resource leakage caused by constantly creating and attempting to destroy the class loader. If you encounter exceptions related to metaspace overflow, you can try enabling this configuration. To reduce the frequency of class loader creation, after enabling this configuration, SeaTunnel will not attempt to release the corresponding class loader when a job is completed, allowing it to be used by subsequent jobs. This is more effective when the number of Source/Sink connectors used in the running job is not excessive. The default value is true. 
Example ```yaml seatunnel: engine: classloader-cache-mode: true ``` ### 4.6 Job Scheduling Strategy When resources are insufficient, the job scheduling strategy can be configured in the following two modes: 1. `WAIT`: Wait for resources to be available. 2. `REJECT`: Reject the job. This is the default value. Example ```yaml seatunnel: engine: job-schedule-strategy: WAIT ``` When `dynamic-slot: true` is used, the `job-schedule-strategy: WAIT` configuration will become invalid and will be forcibly changed to `job-schedule-strategy: REJECT`, because this parameter is meaningless in dynamic slots. ### 4.7 Coordinator Service The CoordinatorService is responsible for the process of generating each job from a LogicalDag to an ExecutionDag, and then to a PhysicalDag. It ultimately creates the JobMaster for the job to handle scheduling, execution, and state monitoring. **core-thread-num** The core pool size of the cached thread pool that executes SeaTunnel coordinator jobs. **max-thread-num** The maximum number of jobs that can be executed at the same time. Example ```yaml coordinator-service: core-thread-num: 30 max-thread-num: 1000 ``` ### 4.8 Job Metrics Partition Count (This parameter is invalid on the Worker node) A new configuration option JOB_METRICS_PARTITION_COUNT controls the number of partitions used to store running job metrics in Hazelcast IMap. - Default: 1 (single key, backward compatible) - Usage: Increase this value to distribute metrics across multiple partitions and reduce contention when many tasks update metrics concurrently. Example: ```yaml seatunnel: engine: job-metrics-partition-count: 4 ``` This will distribute metrics across 4 partitions instead of using a single key. Increasing the partition count provides significant benefits when the number of tasks exceeds approximately 20,000. As a practical guideline, a partition count of around 1,000–2,000 tends to offer the best balance between reducing lock contention and minimizing overhead.
It is recommended to start with this value and then adjust based on your cluster size and workload characteristics. Note: Increasing the partition count may improve concurrency under heavy contention, but setting it too high can introduce additional overhead in distribution and merging, which can reduce overall performance. The partition count should be configured before starting a job. Changing the partition count after a job has started may result in metric key mismatches, so it is recommended to restart Seatunnel after modifying this option. ## 5. Configure The SeaTunnel Engine Network Service All SeaTunnel Engine network-related configurations are in the `hazelcast.yaml` file. ### 5.1 Cluster Name The SeaTunnel Engine node uses the `cluster-name` to determine if another node is in the same cluster as itself. If the cluster names of the two nodes are different, the SeaTunnel Engine will reject the service request. ### 5.2 Network Based on [Hazelcast](https://docs.hazelcast.com/imdg/4.1/clusters/discovery-mechanisms), a SeaTunnel Engine cluster is a network composed of cluster members running the SeaTunnel Engine server. Cluster members automatically join together to form a cluster. This automatic joining occurs through various discovery mechanisms used by cluster members to detect each other. Please note that once the cluster is formed, communication between cluster members always occurs via TCP/IP, regardless of the discovery mechanism used. The SeaTunnel Engine utilizes the following discovery mechanisms: #### TCP You can configure the SeaTunnel Engine as a full TCP/IP cluster. For detailed configuration information, please refer to the [Discovering Members by TCP section](tcp.md). 
An example `hazelcast.yaml` file is as follows: ```yaml hazelcast: cluster-name: seatunnel network: join: tcp-ip: enabled: true member-list: - hostname1 port: auto-increment: false port: 5801 properties: hazelcast.logging.type: log4j2 ``` TCP is the recommended method for use in a standalone SeaTunnel Engine cluster. Alternatively, Hazelcast provides several other service discovery methods. For more details, please refer to [Hazelcast Network](https://docs.hazelcast.com/imdg/4.1/clusters/setting-up-clusters) ### 5.3 IMap Persistence Configuration In SeaTunnel, we use IMap (a distributed Map that enables the writing and reading of data across nodes and processes. For more information, please refer to [hazelcast map](https://docs.hazelcast.com/imdg/4.2/data-structures/map)) to store the status of each job and its tasks, allowing us to recover tasks and achieve task fault tolerance in the event of a node failure. By default, the information in Imap is only stored in memory. We can set the replica count for Imap data. For more details, please refer to (4.1 Backup count setting for data in Imap). If the replica count is set to 2, it means that each data will be stored in two different nodes simultaneously. In the event of a node failure, the data in Imap will be automatically replenished to the set replica count on other nodes. However, when all nodes are stopped, the data in Imap will be lost. When the cluster nodes are restarted, all previously running tasks will be marked as failed, and users will need to manually resume them using the seatunnel.sh -r command. To address this issue, we can persist the data in Imap to an external storage such as HDFS or OSS. This way, even if all nodes are stopped, the data in Imap will not be lost. When the cluster nodes are restarted, all previously running tasks will be automatically restored. The following describes how to use the MapStore persistence configuration.
For more details, please refer to [hazelcast map](https://docs.hazelcast.com/imdg/4.2/data-structures/map) **type** The type of IMap persistence, currently only supporting `hdfs`. **namespace** It is used to distinguish the storage location of different business data, such as the name of an OSS bucket. **clusterName** This parameter is mainly used for cluster isolation, allowing you to distinguish between different clusters, such as cluster1 and cluster2, and can also be used to distinguish different business data. **fs.defaultFS** We use the hdfs api to read and write files, so providing the hdfs configuration is required for using this storage. If using HDFS, you can configure it as follows: ```yaml map: engine*: map-store: enabled: true initial-mode: EAGER factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory properties: type: hdfs namespace: /tmp/seatunnel/imap clusterName: seatunnel-cluster storage.type: hdfs fs.defaultFS: hdfs://localhost:9000 ``` If there is no HDFS and the cluster has only one node, you can configure it to use local files as follows: ```yaml map: engine*: map-store: enabled: true initial-mode: EAGER factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory properties: type: hdfs namespace: /tmp/seatunnel/imap clusterName: seatunnel-cluster storage.type: hdfs fs.defaultFS: file:/// ``` If using OSS, you can configure it as follows: ```yaml map: engine*: map-store: enabled: true initial-mode: EAGER factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory properties: type: hdfs namespace: /tmp/seatunnel/imap clusterName: seatunnel-cluster storage.type: oss block.size: block size(bytes) oss.bucket: oss://bucket name/ fs.oss.accessKeyId: OSS access key id fs.oss.accessKeySecret: OSS access key secret fs.oss.endpoint: OSS endpoint ``` Notice: When using OSS, make sure that the following jars are in the lib directory. 
``` aliyun-sdk-oss-3.13.2.jar hadoop-aliyun-3.3.6.jar jdom2-2.0.6.jar netty-buffer-4.1.89.Final.jar netty-common-4.1.89.Final.jar seatunnel-hadoop3-3.1.4-uber.jar ``` It is possible to utilize S3 for IMAP storage. The S3 configuration properties follow the Hadoop S3A filesystem (Native S3) standard. Specifically, we utilize the fs.s3a.access.key and fs.s3a.secret.key properties to ensure compatibility with existing Hadoop-based ecosystems. If you would like to use S3 compatible storage such as Minio, you can configure it like this: ```yaml map: engine*: map-store: enabled: true initial-mode: EAGER factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory properties: type: hdfs namespace: /seatunnel/engine clusterName: seatunnel storage.type: s3 s3.bucket: s3a://your-bucket fs.defaultFS: s3a://your-bucket fs.s3a.endpoint: http://your-minio-endpoint:port fs.s3a.path.style.access: true fs.s3a.access.key: YOUR_ACCESS_KEY fs.s3a.secret.key: YOUR_SECRET_KEY fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider ``` Notice: When using S3, make sure that the following jars are in the lib directory. ``` seatunnel-hadoop3-3.1.4-uber.jar seatunnel-hadoop-aws.jar ``` ## 6. Configure The SeaTunnel Engine Client All SeaTunnel Engine client configurations are in the `hazelcast-client.yaml`. ### 6.1 cluster-name The client must have the same `cluster-name` as the SeaTunnel Engine. Otherwise, the SeaTunnel Engine will reject the client's request. ### 6.2 network **cluster-members** You need to add the addresses of all SeaTunnel Engine server nodes here. ```yaml hazelcast-client: cluster-name: seatunnel properties: hazelcast.logging.type: log4j2 network: cluster-members: - hostname1:5801 ``` ## 7. Start The SeaTunnel Engine Server Node It can be started with the `-d` parameter through the daemon. 
```shell mkdir -p $SEATUNNEL_HOME/logs ./bin/seatunnel-cluster.sh -d ``` The logs will be written to `$SEATUNNEL_HOME/logs/seatunnel-engine-server.log` ## 8. Submit And Manage Jobs ### 8.1 Submit Jobs With The SeaTunnel Engine Client #### Install The SeaTunnel Engine Client You only need to copy the `$SEATUNNEL_HOME` directory on the SeaTunnel Engine node to the client node and configure `SEATUNNEL_HOME` in the same way as the SeaTunnel Engine server node. #### Submitting And Managing Jobs Now that the cluster is deployed, you can complete the submission and management of jobs through the following tutorials: [Submit And Manage Jobs](user-command.md) ### 8.2 Submit Jobs With The REST API The SeaTunnel Engine provides a REST API for submitting and managing jobs. For more information, please refer to [REST API V2](rest-api-v2.md) ================================================ FILE: docs/en/engines/zeta/local-mode-deployment.md ================================================ --- sidebar_position: 4 --- # Run Jobs In Local Mode In local mode, each task will start a separate process, and the process will exit when the task is completed. There are the following limitations in this mode: 1. Pausing and resuming tasks are not supported. 2. Viewing the task list is not supported. 3. Jobs cannot be cancelled via commands, only by killing the process. However, each task is controlled by a separate process, and there will be no mutual impact between tasks. It is suitable for scenarios with strong requirements for task stability. ## Deploying SeaTunnel Engine In Local Mode In local mode, there is no need to deploy a SeaTunnel Engine cluster. You only need to use the following command to submit jobs. The system will start the SeaTunnel Engine (Zeta) service in the process that submitted the job to run the submitted job, and the process will exit after the job is completed. 
In this mode, you only need to copy the downloaded and created installation package to the server where you need to run it. If you need to adjust the JVM parameters for job execution, you can modify the `$SEATUNNEL_HOME/config/jvm_client_options` file. ## Submitting Jobs ```shell $SEATUNNEL_HOME/bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template -m local ``` ### Configure The JVM Options For Local Mode Local Mode supports two methods for setting JVM options: 1. Add the JVM options to `$SEATUNNEL_HOME/config/jvm_client_options`. Modify the JVM parameters in the `$SEATUNNEL_HOME/config/jvm_client_options` file. Please note that the JVM parameters in this file will be applied to all jobs submitted using `seatunnel.sh`, including Local Mode and Cluster Mode. 2. Add JVM options when starting the Local Mode. For example, `$SEATUNNEL_HOME/bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template -m local -DJvmOption="-Xms2G -Xmx2G"` ## Job Operations Jobs submitted in local mode will run in the process that submitted the job, and the process will exit when the job is completed. If you want to abort the job, you only need to exit the process that submitted the job. The job's runtime logs will be output to the standard output of the process that submitted the job. Other operation and maintenance operations are not supported. ================================================ FILE: docs/en/engines/zeta/logging.md ================================================ --- sidebar_position: 14 --- # Logging All SeaTunnel Engine processes create a log text file that contains messages for various events happening in that process. These logs provide deep insights into the inner workings of SeaTunnel Engine, and can be used to detect problems (in the form of WARN/ERROR messages) and can help in debugging them. The logging in SeaTunnel Engine uses the SLF4J logging interface. 
This allows you to use any logging framework that supports SLF4J, without having to modify the SeaTunnel Engine source code. By default, Log4j 2 is used as the underlying logging framework. ## Structured logging SeaTunnel Engine adds the following fields to MDC of most of the relevant log messages (experimental feature): - Job ID - key: ST-JID - format: string This is most useful in environments with structured logging and allows you to quickly filter the relevant logs. The MDC is propagated by slf4j to the logging backend which usually adds it to the log records automatically (e.g. in log4j json layout). Alternatively, it can be configured explicitly - log4j pattern layout might look like this: ```properties [%X{ST-JID}] %c{0} %m%n. ``` ## Configuring Log4j2 Log4j 2 is controlled using property files. The SeaTunnel Engine distribution ships with the following log4j properties files in the `config` directory, which are used automatically if Log4j 2 is enabled: - `log4j2_client.properties`: used by the command line client (e.g., `seatunnel.sh`) - `log4j2.properties`: used for SeaTunnel Engine server processes (e.g., `seatunnel-cluster.sh`) By default, log files are output to the `logs` directory. Log4j periodically scans this file for changes and adjusts the logging behavior if necessary. By default this check happens every 60 seconds and is controlled by the monitorInterval setting in the Log4j properties files. ### Configure to output separate log files for jobs To output separate log files for each job, you can update the following configuration in the `log4j2.properties` file: ```properties ... rootLogger.appenderRef.file.ref = routingAppender ... appender.file.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n ... ``` This configuration generates separate log files for each job, for example: ``` job-xxx1.log job-xxx2.log job-xxx3.log ... 
``` ### Configuring output mixed logs *This is the default configuration mode.* To output all job logs into the SeaTunnel Engine system log file, you can update the following configuration in the `log4j2.properties` file: ```properties ... rootLogger.appenderRef.file.ref = fileAppender ... appender.file.layout.pattern = [%X{ST-JID}] %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n ... ``` ### Compatibility with Log4j1/Logback SeaTunnel Engine automatically integrates Log framework bridge, allowing existing applications that work against Log4j1/Logback classes to continue working. ### Query Logs via REST API SeaTunnel provides an API for querying logs. **Usage examples:** - Retrieve logs for all nodes with `jobId` of `733584788375666689`: `http://localhost:8080/logs/733584788375666689` - Retrieve the log list for all nodes: `http://localhost:8080/logs` - Retrieve the log list for all nodes in JSON format: `http://localhost:8080/logs?format=json` - Retrieve log file content: `http://localhost:8080/logs/job-898380162133917698.log` For more details, please refer to the [REST-API](rest-api-v2.md). ## SeaTunnel Log Configuration ### Scheduled deletion of old logs SeaTunnel supports scheduled deletion of old log files to prevent disk space exhaustion. You can add the following configuration in the `seatunnel.yaml` file: ```yaml seatunnel: engine: history-job-expire-minutes: 1440 telemetry: logs: scheduled-deletion-enable: true ``` - `history-job-expire-minutes`: Sets the retention time for historical job data and logs (in minutes). The system will automatically clear expired job information and log files after the specified period. - `scheduled-deletion-enable`: Enable scheduled cleanup, with default value of `true`. The system will automatically delete relevant log files when the job expiration time, as defined by `history-job-expire-minutes`, is reached.
If this feature is disabled, logs will remain permanently on disk, requiring manual management, which may affect disk space usage. It is recommended to configure this setting based on specific needs. ## Best practices for developers You can create an SLF4J logger by calling `org.slf4j.LoggerFactory#getLogger` with the Class of your class as an argument. Of course, you can also use the `lombok` annotation `@Slf4j` to achieve the same effect. ```java import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class TestConnector { private static final Logger LOG = LoggerFactory.getLogger(TestConnector.class); public static void main(String[] args) { LOG.info("Hello world!"); } } ``` In order to benefit most from SLF4J, it is recommended to use its placeholder mechanism. Using placeholders allows avoiding unnecessary string constructions in case that the logging level is set so high that the message would not be logged. The syntax of placeholders is the following: ```java LOG.info("This message contains {} placeholders. {}", 1, "key1"); ``` Placeholders can also be used in conjunction with exceptions which shall be logged. ```java try { // some code } catch (Exception e) { LOG.error("An {} occurred", "error", e); } ``` ================================================ FILE: docs/en/engines/zeta/resource-isolation.md ================================================ --- sidebar_position: 9 --- # Resource Isolation SeaTunnel lets you add a `tag` to each worker node. When you submit a job, you can use `tag_filter` to select the nodes on which you want the job to run. ## Configuration 1. 
Update the config in `hazelcast.yaml`: ```yaml hazelcast: cluster-name: seatunnel network: rest-api: enabled: true endpoint-groups: CLUSTER_WRITE: enabled: true DATA: enabled: true join: tcp-ip: enabled: true member-list: - localhost port: auto-increment: false port: 5801 properties: hazelcast.invocation.max.retry.count: 20 hazelcast.tcp.join.port.try.count: 30 hazelcast.logging.type: log4j2 hazelcast.operation.generic.thread.count: 50 member-attributes: group: type: string value: platform team: type: string value: team1 ``` In this config, the tags are specified through `member-attributes`; the node has the tags `group=platform, team=team1`. 2. Add `tag_filter` to your job config ```hocon env { parallelism = 1 job.mode = "BATCH" tag_filter { group = "platform" team = "team1" } } source { FakeSource { plugin_output = "fake" parallelism = 1 schema = { fields { name = "string" } } } } transform { } sink { console { plugin_input="fake" } } ``` **Notice:** - If `tag_filter` is not set in the job config, a node is chosen at random from all active nodes. - When you add multiple tags in `tag_filter`, every key must exist on the node and every value must match. If no node matches, you will get a `NoEnoughResourceException`. ![img.png](../../../images/resource-isolation.png) 3. Update running node tags via the REST API (optional). For more information, please refer to [Update the tags of running node](rest-api-v2.md) ================================================ FILE: docs/en/engines/zeta/rest-api-v1.md ================================================ # RESTful API V1 :::caution warn It is recommended to use the v2 version of the Rest API. The v1 version is deprecated and will be removed in the future. The v1 version is already disabled by default. If you want to use the v1 version, you need to enable it in the `hazelcast.yaml` file. ::: SeaTunnel has a monitoring API that can be used to query status and statistics of running jobs, as well as recently completed jobs. 
The monitoring API is a RESTful API that accepts HTTP requests and responds with JSON data. ## Overview The monitoring API is backed by a web server that runs as part of the node, each node member can provide RESTful api capability. By default, the server disables the RESTful API V1, and it can be enabled by setting the `rest-api.enabled` configuration in the `hazelcast.yaml` file. This server listens at port 5801, which can be configured in hazelcast.yaml like : ```yaml network: rest-api: enabled: true endpoint-groups: CLUSTER_WRITE: enabled: true DATA: enabled: true join: tcp-ip: enabled: true member-list: - localhost port: auto-increment: true port-count: 100 port: 5801 ``` ## API reference ### Returns an overview over the Zeta engine cluster.
    GET /hazelcast/rest/maps/overview?tag1=value1&tag2=value2 (Returns an overview over the Zeta engine cluster.) #### Parameters > | name | type | data type | description | > |----------|----------|-----------|------------------------------------------------------------------------------------------------------| > | tag_name | optional | string | the tags filter, you can add tag filter to get those matched worker count, and slot on those workers | #### Responses ```json { "projectVersion":"2.3.10-SNAPSHOT", "gitCommitAbbrev":"DeadD0d0", "totalSlot":"0", "unassignedSlot":"0", "works":"1", "runningJobs":"0", "finishedJobs":"0", "failedJobs":"0", "cancelledJobs":"0" } ``` **Notes:** - If you use `dynamic-slot`, `totalSlot` and `unassignedSlot` are always `0`. When a fixed slot number is configured, the correct total and unassigned slot numbers are returned. - If the URL contains a tag filter, `works`, `totalSlot` and `unassignedSlot` reflect only the matched workers, but the job-related metrics always reflect cluster-level information.
    ------------------------------------------------------------------------------------------ ### Returns thread dump information for the current node.
    GET /hazelcast/rest/maps/thread-dump (Returns thread dump information for the current node.) #### Parameters #### Responses ```json [ { "threadName": "", "threadId": 0, "threadState": "", "stackTrace": "" } ] ```
    ------------------------------------------------------------------------------------------ ### Returns An Overview And State Of All Jobs
    GET /hazelcast/rest/maps/running-jobs (Returns an overview over all jobs and their current state.) #### Parameters #### Responses ```json [ { "jobId": "", "jobName": "", "jobStatus": "", "envOptions": { }, "createTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "pluginJarsUrls": [ ], "isStartWithSavePoint": false, "metrics": { "sourceReceivedCount": "", "sinkWriteCount": "" } } ] ```
    ------------------------------------------------------------------------------------------ ### Return Details Of A Job
    GET /hazelcast/rest/maps/job-info/:jobId (Return details of a job. ) #### Parameters > | name | type | data type | description | > |-------|----------|-----------|-------------| > | jobId | required | long | job id | #### Responses ```json { "jobId": "", "jobName": "", "jobStatus": "", "createTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "metrics": { "sourceReceivedCount": "", "sinkWriteCount": "" }, "finishedTime": "", "errorMsg": null, "envOptions": { }, "pluginJarsUrls": [ ], "isStartWithSavePoint": false } ``` `jobId`, `jobName`, `jobStatus`, `createTime`, `jobDag`, `metrics` are always returned. `envOptions`, `pluginJarsUrls`, `isStartWithSavePoint` are returned when the job is running. `finishedTime`, `errorMsg` are returned when the job is finished. #### Metrics field description | Field | Description | | --- | --- | | SourceReceivedCount | Total rows received from sources | | SourceReceivedQPS | Source receive rate (rows/s) | | SourceReceivedBytes | Total bytes received from sources | | SourceReceivedBytesPerSeconds | Source receive rate (bytes/s) | | SinkWriteCount | Sink write attempts (rows) | | SinkWriteQPS | Sink write attempt rate (rows/s) | | SinkWriteBytes | Sink write attempts (bytes) | | SinkWriteBytesPerSeconds | Sink write attempt rate (bytes/s) | | SinkCommittedCount | Sink committed rows after checkpoint succeeds | | SinkCommittedQPS | Sink committed rate (rows/s) | | SinkCommittedBytes | Sink committed bytes after checkpoint succeeds | | SinkCommittedBytesPerSeconds | Sink committed rate (bytes/s) | | TableSourceReceived* | Per-table source metrics, key format `TableSourceReceivedXXX#<tablePath>` | | TableSinkWrite* | Per-table sink write attempts, key format `TableSinkWriteXXX#<tablePath>` | | TableSinkCommitted* | Per-table sink committed metrics, key format `TableSinkCommittedXXX#<tablePath>` | When we can't get the job info, the response will be: ```json { "jobId" : "" } ``` ------------------------------------------------------------------------------------------ ### Return Details Of A Job This API has been deprecated, please use /hazelcast/rest/maps/job-info/:jobId instead
    GET /hazelcast/rest/maps/running-job/:jobId (Return details of a job. ) #### Parameters > | name | type | data type | description | > |-------|----------|-----------|-------------| > | jobId | required | long | job id | #### Responses ```json { "jobId": "", "jobName": "", "jobStatus": "", "createTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "metrics": { "SourceReceivedCount": "", "SourceReceivedQPS": "", "SourceReceivedBytes": "", "SourceReceivedBytesPerSeconds": "", "SinkWriteCount": "", "SinkWriteQPS": "", "SinkWriteBytes": "", "SinkWriteBytesPerSeconds": "", "SinkCommittedCount": "", "SinkCommittedQPS": "", "SinkCommittedBytes": "", "SinkCommittedBytesPerSeconds": "", "TableSourceReceivedCount": {}, "TableSourceReceivedBytes": {}, "TableSourceReceivedBytesPerSeconds": {}, "TableSourceReceivedQPS": {}, "TableSinkWriteCount": {}, "TableSinkWriteQPS": {}, "TableSinkWriteBytes": {}, "TableSinkWriteBytesPerSeconds": {}, "TableSinkCommittedCount": {}, "TableSinkCommittedQPS": {}, "TableSinkCommittedBytes": {}, "TableSinkCommittedBytesPerSeconds": {} }, "finishedTime": "", "errorMsg": null, "envOptions": { }, "pluginJarsUrls": [ ], "isStartWithSavePoint": false } ``` `jobId`, `jobName`, `jobStatus`, `createTime`, `jobDag`, `metrics` always be returned. `envOptions`, `pluginJarsUrls`, `isStartWithSavePoint` will return when job is running. `finishedTime`, `errorMsg` will return when job is finished. When we can't get the job info, the response will be: ```json { "jobId" : "" } ```
    ------------------------------------------------------------------------------------------ ### Return All Finished Jobs Info
    GET /hazelcast/rest/maps/finished-jobs/:state (Return all finished Jobs Info.) #### Parameters > | name | type | data type | description | > |-------|----------|-----------|-----------------------------------------------------------------------------------| > | state | optional | string | finished job status. `FINISHED`,`CANCELED`,`FAILED`,`SAVEPOINT_DONE`,`UNKNOWABLE` | #### Responses ```json [ { "jobId": "", "jobName": "", "jobStatus": "", "errorMsg": null, "createTime": "", "finishTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "metrics": "" } ] ```
    ------------------------------------------------------------------------------------------ ### Returns System Monitoring Information
    GET /hazelcast/rest/maps/system-monitoring-information (Returns system monitoring information.) #### Parameters #### Responses ```json [ { "isMaster": "true", "host": "localhost", "port": "5801", "processors":"8", "physical.memory.total":"16.0G", "physical.memory.free":"16.3M", "swap.space.total":"0", "swap.space.free":"0", "heap.memory.used":"135.7M", "heap.memory.free":"440.8M", "heap.memory.total":"576.5M", "heap.memory.max":"3.6G", "heap.memory.used/total":"23.54%", "heap.memory.used/max":"3.73%", "minor.gc.count":"6", "minor.gc.time":"110ms", "major.gc.count":"2", "major.gc.time":"73ms", "load.process":"24.78%", "load.system":"60.00%", "load.systemAverage":"2.07", "thread.count":"117", "thread.peakCount":"118", "cluster.timeDiff":"0", "event.q.size":"0", "executor.q.async.size":"0", "executor.q.client.size":"0", "executor.q.client.query.size":"0", "executor.q.client.blocking.size":"0", "executor.q.query.size":"0", "executor.q.scheduled.size":"0", "executor.q.io.size":"0", "executor.q.system.size":"0", "executor.q.operations.size":"0", "executor.q.priorityOperation.size":"0", "operations.completed.count":"10", "executor.q.mapLoad.size":"0", "executor.q.mapLoadAllKeys.size":"0", "executor.q.cluster.size":"0", "executor.q.response.size":"0", "operations.running.count":"0", "operations.pending.invocations.percentage":"0.00%", "operations.pending.invocations.count":"0", "proxy.count":"8", "clientEndpoint.count":"0", "connection.active.count":"2", "client.connection.count":"0", "connection.count":"0" } ] ```
    ------------------------------------------------------------------------------------------ ### Submit A Job
    POST /hazelcast/rest/maps/submit-job (Returns jobId and jobName if job submitted successfully.) #### Parameters > | name | type | data type | description | > |----------------------|----------|-----------|-----------------------------------| > | jobId | optional | string | job id | > | jobName | optional | string | job name | > | isStartWithSavePoint | optional | string | if job is started with save point | #### Body ```json { "env": { "job.mode": "batch" }, "source": [ { "plugin_name": "FakeSource", "plugin_output": "fake", "row.num": 100, "schema": { "fields": { "name": "string", "age": "int", "card": "int" } } } ], "transform": [ ], "sink": [ { "plugin_name": "Console", "plugin_input": ["fake"] } ] } ``` #### Responses ```json { "jobId": 733584788375666689, "jobName": "rest_api_test" } ```
    ------------------------------------------------------------------------------------------ ### Batch Submit Jobs
    POST /hazelcast/rest/maps/submit-jobs (Returns jobId and jobName if the job is successfully submitted.) #### Parameters (add in the `params` field in the request body) > | Parameter Name | Required | Type | Description | > |----------------------|--------------|---------|---------------------------------------| > | jobId | optional | string | job id | > | jobName | optional | string | job name | > | isStartWithSavePoint | optional | string | if the job is started with save point | #### Request Body ```json [ { "params":{ "jobId":"123456", "jobName":"SeaTunnel-01" }, "env": { "job.mode": "batch" }, "source": [ { "plugin_name": "FakeSource", "plugin_output": "fake", "row.num": 1000, "schema": { "fields": { "name": "string", "age": "int", "card": "int" } } } ], "transform": [ ], "sink": [ { "plugin_name": "Console", "plugin_input": ["fake"] } ] }, { "params":{ "jobId":"1234567", "jobName":"SeaTunnel-02" }, "env": { "job.mode": "batch" }, "source": [ { "plugin_name": "FakeSource", "plugin_output": "fake", "row.num": 1000, "schema": { "fields": { "name": "string", "age": "int", "card": "int" } } } ], "transform": [ ], "sink": [ { "plugin_name": "Console", "plugin_input": ["fake"] } ] } ] ``` #### Response ```json [ { "jobId": "123456", "jobName": "SeaTunnel-01" },{ "jobId": "1234567", "jobName": "SeaTunnel-02" } ] ```
    ------------------------------------------------------------------------------------------ ### Stop A Job
    POST /hazelcast/rest/maps/stop-job (Returns jobId if job stopped successfully.) #### Parameters > | name | required | data type | description | > |---------------------|----------|-----------|------------------------------------------------------------------| > | jobId | yes | long | job id | > | isStopWithSavePoint | no | boolean | If the job is stopped with a savepoint. | > | force | no | boolean | If true, the job is force-stopped (ignores isStopWithSavePoint). | #### Body ```json { "jobId": 733584788375666689, "isStopWithSavePoint": false, "force": false } ``` #### Responses ```json { "jobId": 733584788375666689 } ``` **Notes:** - If the job status is `DOING_SAVEPOINT` and the savepoint does not complete successfully, a forced stop (When the `force` option is enabled) will set the job status to `CANCELED`. - A forced stop may leave checkpoint data incomplete or in an inconsistent state. It should be used only for exceptional or abnormal situations.
    ------------------------------------------------------------------------------------------ ### Batch Stop Jobs
    POST /hazelcast/rest/maps/stop-jobs (Returns jobId if the job is successfully stopped.) #### Request Body ```json [ { "jobId": 881432421482889220, "isStopWithSavePoint": false, "force": false }, { "jobId": 881432456517910529, "isStopWithSavePoint": false, "force": false } ] ``` #### Response ```json [ { "jobId": 881432421482889220 }, { "jobId": 881432456517910529 } ] ```
    ------------------------------------------------------------------------------------------ ### Encrypt Config
    POST /hazelcast/rest/maps/encrypt-config (Returns the encrypted config if config is encrypted successfully.) For more information about customize encryption, please refer to the documentation [config-encryption-decryption](../../introduction/concepts/config-encryption-decryption.md). #### Body ```json { "env": { "parallelism": 1, "shade.identifier":"base64" }, "source": [ { "plugin_name": "MySQL-CDC", "schema" : { "fields": { "name": "string", "age": "int" } }, "plugin_output": "fake", "parallelism": 1, "hostname": "127.0.0.1", "username": "seatunnel", "password": "seatunnel_password", "table-name": "inventory_vwyw0n" } ], "transform": [ ], "sink": [ { "plugin_name": "Clickhouse", "host": "localhost:8123", "database": "default", "table": "fake_all", "username": "seatunnel", "password": "seatunnel_password" } ] } ``` #### Responses ```json { "env": { "parallelism": 1, "shade.identifier": "base64" }, "source": [ { "plugin_name": "MySQL-CDC", "schema": { "fields": { "name": "string", "age": "int" } }, "plugin_output": "fake", "parallelism": 1, "hostname": "127.0.0.1", "username": "c2VhdHVubmVs", "password": "c2VhdHVubmVsX3Bhc3N3b3Jk", "table-name": "inventory_vwyw0n" } ], "transform": [], "sink": [ { "plugin_name": "Clickhouse", "host": "localhost:8123", "database": "default", "table": "fake_all", "username": "c2VhdHVubmVs", "password": "c2VhdHVubmVsX3Bhc3N3b3Jk" } ] } ```
    ------------------------------------------------------------------------------------------ ### Update the tags of running node
    POST /hazelcast/rest/maps/update-tags (Because the update can only target a specific node, the current node's `ip:port` must be used for the update. If the update is successful, a success message is returned.) #### update node tags ##### Body If the request parameter is a `Map` object, it indicates that the tags of the current node need to be updated ```json { "tag1": "dev_1", "tag2": "dev_2" } ``` ##### Responses ```json { "status": "success", "message": "update node tags done." } ``` #### remove node tags ##### Body If the parameter is an empty `Map` object, it means that the tags of the current node need to be cleared ```json {} ``` ##### Responses ```json { "status": "success", "message": "update node tags done." } ``` #### Request parameter exception - If the parameter body is empty ##### Responses ```json { "status": "fail", "message": "Request body is empty." } ``` - If the parameter is not a `Map` object ##### Responses ```json { "status": "fail", "message": "Invalid JSON format in request body." } ```
    ------------------------------------------------------------------------------------------ ### Get All Node Log Content
    GET /hazelcast/rest/maps/logs/:jobId (Returns a list of logs.) #### Request Parameters #### Parameters (Add in the `params` field of the request body) > | Parameter Name | Required | Type | Description | > |----------------------|------------|---------|---------------------------------| > | jobId | optional | string | job id | When `jobId` is empty, it returns log information for all nodes; otherwise, it returns the log list of the specified `jobId` across all nodes. #### Response Returns a list of logs and content from the requested nodes. #### Get All Log Files List If you'd like to view the log list first, you can use a `GET` request to retrieve the log list: `http://localhost:5801/hazelcast/rest/maps/logs?format=json` ```json [ { "node": "localhost:5801", "logLink": "http://localhost:5801/hazelcast/rest/maps/logs/job-899485770241277953.log", "logName": "job-899485770241277953.log" }, { "node": "localhost:5801", "logLink": "http://localhost:5801/hazelcast/rest/maps/logs/job-899470314109468673.log", "logName": "job-899470314109468673.log" } ] ``` The supported formats are `json` and `html`, with `html` as the default. #### Examples Retrieve logs for all nodes with the `jobId` of `733584788375666689`: `http://localhost:5801/hazelcast/rest/maps/logs/733584788375666689` Retrieve the log list for all nodes: `http://localhost:5801/hazelcast/rest/maps/logs` Retrieve the log list for all nodes in JSON format: `http://localhost:5801/hazelcast/rest/maps/logs?format=json` Retrieve log file content: `http://localhost:5801/hazelcast/rest/maps/logs/job-898380162133917698.log`
    ### Get Log Content from a Single Node
    GET /hazelcast/rest/maps/log (Returns a list of logs.) #### Response Returns a list of logs from the requested node. #### Examples To get a list of logs from the current node: `http://localhost:5801/hazelcast/rest/maps/log` To get the content of a log file: `http://localhost:5801/hazelcast/rest/maps/log/job-898380162133917698.log`
    ================================================ FILE: docs/en/engines/zeta/rest-api-v2.md ================================================ # RESTful API V2 SeaTunnel has a monitoring API that can be used to query status and statistics of running jobs, as well as recently completed jobs. The monitoring API is a RESTful API that accepts HTTP requests and responds with JSON data. ## Overview The v2 version of the API is served by Jetty and follows the same interface specification as the v1 version. You can specify the port and context-path by modifying the configuration items in `seatunnel.yaml`. You can configure `enable-dynamic-port` to enable dynamic ports (the port is incremented starting from `port`); it is enabled by default. If `enable-dynamic-port` is `true`, an unused port in the range from `port` to `port` + `port-range` is used; the default range is 100. ```yaml seatunnel: engine: http: enable-http: true port: 8080 enable-dynamic-port: true port-range: 100 ``` Context-path can also be configured as follows: ```yaml seatunnel: engine: http: enable-http: true port: 8080 context-path: /seatunnel ``` ## Enable HTTPS Please refer to [security](security.md). ## API reference ### Returns an overview over the Zeta engine cluster.
    GET /overview?tag1=value1&tag2=value2 (Returns an overview over the Zeta engine cluster.) #### Parameters > | name | type | data type | description | > |----------|----------|-----------|------------------------------------------------------------------------------------------------------| > | tag_name | optional | string | the tags filter, you can add tag filter to get those matched worker count, and slot on those workers | #### Responses ```json { "projectVersion":"2.3.10-SNAPSHOT", "gitCommitAbbrev":"DeadD0d0", "totalSlot":"0", "unassignedSlot":"0", "works":"1", "runningJobs":"0", "pendingJobs":"0", "finishedJobs":"0", "failedJobs":"0", "cancelledJobs":"0" } ``` **Notes:** - If you use `dynamic-slot`, `totalSlot` and `unassignedSlot` are always `0`. When a fixed slot number is configured, the correct total and unassigned slot numbers are returned. - If the URL contains a tag filter, `works`, `totalSlot` and `unassignedSlot` reflect only the matched workers, but the job-related metrics always reflect cluster-level information.
    ------------------------------------------------------------------------------------------ ### Query An Overview And State Of Running Jobs
    GET /running-jobs?page=1&rows=10 (Query an overview over running jobs and their current state.) #### Parameters > | name | type | data type | description | > |-------|----------|-----------|-----------------------------------------------------------------------------------| > | page | optional | int | page number. | > | rows | optional | int | page size. | #### Responses ```json [ { "jobId": "", "jobName": "", "jobStatus": "", "envOptions": { }, "createTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "pluginJarsUrls": [ ], "isStartWithSavePoint": false, "metrics": { "sourceReceivedCount": "", "sinkWriteCount": "" } } ] ```
    ------------------------------------------------------------------------------------------ ### Returns Diagnostic Information For Pending Jobs
    GET /pending-jobs?jobId=123&limit=10 (Inspect the pending queue, slot usage and blocking reasons.) #### Parameters > | name | type | data type | description | > |----------|----------|-----------|-----------------------------------------------------------------------------| > | jobId | optional | long | If set, only returns the diagnostics for the specified job. When both `jobId` and `limit` are provided, `jobId` takes precedence and `limit` is ignored. | > | limit | optional | integer | Limits the number of jobs returned. This parameter is ignored when `jobId` is provided. | > | pretty | optional | boolean | When `true`, pretty-print JSON and format timestamp fields. | #### Responses ```json { "queueSummary": { "size": 2, "scheduleStrategy": "WAIT", "oldestEnqueueTimestamp": 1717500000000, "newestEnqueueTimestamp": 1717500005000, "lackingTaskGroups": 6 }, "clusterSnapshot": { "totalSlots": 8, "freeSlots": 1, "assignedSlots": 7, "workerCount": 2, "workers": [ { "address": "10.0.0.8:5801", "tags": { "zone": "az1" }, "totalSlots": 4, "freeSlots": 0, "dynamicSlot": false, "cpuUsage": 0.83, "memUsage": 0.64, "runningJobIds": [ 1001, 1002 ] } ] }, "pendingJobs": [ { "jobId": 1003, "jobName": "cdc_mysql_to_es", "pendingSourceState": "SUBMIT", "jobStatus": "PENDING", "enqueueTimestamp": 1717500000000, "checkTime": 1717500005000, "waitDurationMs": 5000, "checkCount": 3, "totalTaskGroups": 16, "allocatedTaskGroups": 10, "lackingTaskGroups": 6, "failureReason": "REQUEST_FAILED", "failureMessage": "NoEnoughResourceException: can't apply resource request", "tagFilter": {}, "blockingJobIds": [ 1001 ], "pipelines": [ { "pipelineId": 1, "pipelineName": "Job job-name, Pipeline: [(1/2)]", "totalTaskGroups": 8, "allocatedTaskGroups": 5, "lackingTaskGroups": 3, "taskGroupDiagnostics": [ { "taskGroupLocation": { "jobId": 1003, "pipelineId": 1, "taskGroupId": 1 }, "taskFullName": "Source[0]", "allocated": false, "failureReason": "REQUEST_FAILED", "failureMessage": 
"NoEnoughResourceException: slot not enough" } ] } ], "lackingTaskGroupDiagnostics": [ { "taskGroupLocation": { "jobId": 1003, "pipelineId": 1, "taskGroupId": 1 }, "taskFullName": "Source[0]", "allocated": false, "failureReason": "REQUEST_FAILED", "failureMessage": "NoEnoughResourceException: slot not enough" } ] } ] } ``` When `pretty=true`, the endpoint returns a pretty-printed JSON response and formats `oldestEnqueueTimestamp`, `newestEnqueueTimestamp`, `enqueueTimestamp`, and `checkTime` as `yyyy-MM-dd HH:mm:ss`. This endpoint helps troubleshoot why jobs stay in `PENDING` by showing the pending queue order, aggregated resource view, and per task-group slot request failures (tag mismatch, worker busy, resource exhausted, etc.). **Pending Jobs Response Fields** - **queueSummary** – overview of the entire pending queue. - `size`: number of jobs currently pending. - `scheduleStrategy`: strategy in use (e.g. `WAIT`, `FAIL_FAST`) that dictates what happens when resources are insufficient. - `oldestEnqueueTimestamp` / `newestEnqueueTimestamp`: timestamps (ms) of the oldest/latest job in the queue. - `lackingTaskGroups`: total TaskGroup count still waiting for slots. **Note**: This value reflects only the jobs included in the current response (i.e., the subset limited by the `limit` parameter or filtered by `jobId`), not the entire pending queue. To view the complete statistics for all pending jobs, call this API without the `limit` parameter. - **clusterSnapshot** – cluster resource snapshot (can be filtered by tags). - `totalSlots` / `assignedSlots` / `freeSlots`: total, allocated and remaining slots in the filtered view. - `workerCount`: number of workers that match the tag filters. - `workers[]`: per-worker details: - `address`: host:port of the worker. - `tags`: worker-level tags. - `totalSlots` / `freeSlots`: slot capacity and available slot count on that worker. - `dynamicSlot`: whether the worker uses dynamic slot allocation. 
- `cpuUsage` / `memUsage`: sampled system load (only present when `slot-allocate-strategy` is `SYSTEM_LOAD`). - `runningJobIds[]`: jobs currently occupying slots on that worker (helps identify blockers). - **pendingJobs[]** – diagnostics for each pending job. - `jobId` / `jobName`: identifiers. - `pendingSourceState`: whether the job comes from a new submission (`SUBMIT`) or master switch restore (`RESTORE`). - `jobStatus`: status recorded in the physical plan (typically `PENDING`). - `enqueueTimestamp`: when the job entered the pending queue. - `checkTime`: timestamp of the latest diagnostic snapshot. - `waitDurationMs`: `checkTime - enqueueTimestamp`. - `checkCount`: how many times the scheduler has checked this job. - `totalTaskGroups` / `allocatedTaskGroups` / `lackingTaskGroups`: TaskGroup totals vs. assigned vs. lacking. - `failureReason` / `failureMessage`: classified cause (e.g. `RESOURCE_NOT_ENOUGH`, `REQUEST_FAILED`) plus raw message. - `tagFilter`: worker tag requirements declared by the job (if any). - `blockingJobIds[]`: other jobs that currently occupy the required slots. - `pipelines[]`: per-pipeline breakdown. - `pipelineId` / `pipelineName`. - `totalTaskGroups` / `allocatedTaskGroups` / `lackingTaskGroups`. - `taskGroupDiagnostics[]` (per TaskGroup slot request state): - `taskGroupLocation` (`jobId`, `pipelineId`, `taskGroupId`). - `taskFullName`: human-readable name (source/sink, etc.). - `allocated`: whether the slot request succeeded. - `failureReason` / `failureMessage`: task-level cause when allocation failed. - `lackingTaskGroupDiagnostics[]`: flattened list of `allocated=false` TaskGroups for quick review.
    ------------------------------------------------------------------------------------------ ### Return Details Of A Job
    GET /job-info/:jobId (Return details of a job. ) #### Parameters > | name | type | data type | description | > |-------|----------|-----------|-------------| > | jobId | required | long | job id | #### Responses ```json { "jobId": "", "jobName": "", "jobStatus": "", "createTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "metrics": { "IntermediateQueueSize": "", "SourceReceivedCount": "", "SourceReceivedQPS": "", "SourceReceivedBytes": "", "SourceReceivedBytesPerSeconds": "", "SinkWriteCount": "", "SinkWriteQPS": "", "SinkWriteBytes": "", "SinkWriteBytesPerSeconds": "", "SinkCommittedCount": "", "SinkCommittedQPS": "", "SinkCommittedBytes": "", "SinkCommittedBytesPerSeconds": "", "TableSourceReceivedCount": {}, "TableSourceReceivedBytes": {}, "TableSourceReceivedBytesPerSeconds": {}, "TableSourceReceivedQPS": {}, "TableSinkWriteCount": {}, "TableSinkWriteQPS": {}, "TableSinkWriteBytes": {}, "TableSinkWriteBytesPerSeconds": {}, "TableSinkCommittedCount": {}, "TableSinkCommittedQPS": {}, "TableSinkCommittedBytes": {}, "TableSinkCommittedBytesPerSeconds": {} }, "finishedTime": "", "errorMsg": null, "envOptions": { }, "pluginJarsUrls": [ ], "isStartWithSavePoint": false } ``` `jobId`, `jobName`, `jobStatus`, `createTime`, `jobDag`, `metrics` always be returned. `envOptions`, `pluginJarsUrls`, `isStartWithSavePoint` will return when job is running. `finishedTime`, `errorMsg` will return when job is finished. 
#### Metrics field description | Field | Description | | --- | --- | | IntermediateQueueSize | Size of intermediate queue between operators | | SourceReceivedCount | Total rows received from sources | | SourceReceivedQPS | Source receive rate (rows/s) | | SourceReceivedBytes | Total bytes received from sources | | SourceReceivedBytesPerSeconds | Source receive rate (bytes/s) | | SinkWriteCount | Sink write attempts (rows) | | SinkWriteQPS | Sink write attempt rate (rows/s) | | SinkWriteBytes | Sink write attempts (bytes) | | SinkWriteBytesPerSeconds | Sink write attempt rate (bytes/s) | | SinkCommittedCount | Sink committed rows after checkpoint succeeds | | SinkCommittedQPS | Sink committed rate (rows/s) | | SinkCommittedBytes | Sink committed bytes after checkpoint succeeds | | SinkCommittedBytesPerSeconds | Sink committed rate (bytes/s) | | TableSourceReceived* | Per-table source metrics, key format `TableSourceReceivedXXX#<tablePath>` | | TableSinkWrite* | Per-table sink write attempts, key format `TableSinkWriteXXX#<tablePath>` | | TableSinkCommitted* | Per-table sink committed metrics, key format `TableSinkCommittedXXX#<tablePath>` | When we can't get the job info, the response will be: ```json { "jobId" : "" } ``` ------------------------------------------------------------------------------------------ ### Return Details Of A Job This API has been deprecated, please use /job-info/:jobId instead
    GET /running-job/:jobId (Return details of a job. ) #### Parameters > | name | type | data type | description | > |-------|----------|-----------|-------------| > | jobId | required | long | job id | #### Responses ```json { "jobId": "", "jobName": "", "jobStatus": "", "createTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "metrics": { "IntermediateQueueSize": "", "SourceReceivedCount": "", "SourceReceivedQPS": "", "SourceReceivedBytes": "", "SourceReceivedBytesPerSeconds": "", "SinkWriteCount": "", "SinkWriteQPS": "", "SinkWriteBytes": "", "SinkWriteBytesPerSeconds": "", "TableSourceReceivedCount": {}, "TableSourceReceivedBytes": {}, "TableSourceReceivedBytesPerSeconds": {}, "TableSourceReceivedQPS": {}, "TableSinkWriteCount": {}, "TableSinkWriteQPS": {}, "TableSinkWriteBytes": {}, "TableSinkWriteBytesPerSeconds": {} }, "finishedTime": "", "errorMsg": null, "envOptions": { }, "pluginJarsUrls": [ ], "isStartWithSavePoint": false } ``` `jobId`, `jobName`, `jobStatus`, `createTime`, `jobDag`, `metrics` are always returned. `envOptions`, `pluginJarsUrls`, `isStartWithSavePoint` are returned when the job is running. `finishedTime`, `errorMsg` are returned when the job is finished. When we can't get the job info, the response will be: ```json { "jobId" : "" } ```
    ------------------------------------------------------------------------------------------ ### Query Finished Jobs Info
    GET /finished-jobs/:state?page=1&rows=10 (Query finished Jobs Info.) #### Parameters > | name | type | data type | description | > |-------|----------|-----------|-----------------------------------------------------------------------------------| > | state | optional | string | finished job status. `FINISHED`,`CANCELED`,`FAILED`,`SAVEPOINT_DONE`,`UNKNOWABLE` | > | page | optional | int | page number. | > | rows | optional | int | page size. | #### Responses ```json [ { "jobId": "", "jobName": "", "jobStatus": "", "errorMsg": null, "createTime": "", "finishTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "metrics": "" } ] ```
    ------------------------------------------------------------------------------------------ ### Returns System Monitoring Information
    GET /system-monitoring-information (Returns system monitoring information.) #### Parameters #### Responses ```json [ { "processors":"8", "physical.memory.total":"16.0G", "physical.memory.free":"16.3M", "swap.space.total":"0", "swap.space.free":"0", "heap.memory.used":"135.7M", "heap.memory.free":"440.8M", "heap.memory.total":"576.5M", "heap.memory.max":"3.6G", "heap.memory.used/total":"23.54%", "heap.memory.used/max":"3.73%", "minor.gc.count":"6", "minor.gc.time":"110ms", "major.gc.count":"2", "major.gc.time":"73ms", "load.process":"24.78%", "load.system":"60.00%", "load.systemAverage":"2.07", "thread.count":"117", "thread.peakCount":"118", "cluster.timeDiff":"0", "event.q.size":"0", "executor.q.async.size":"0", "executor.q.client.size":"0", "executor.q.client.query.size":"0", "executor.q.client.blocking.size":"0", "executor.q.query.size":"0", "executor.q.scheduled.size":"0", "executor.q.io.size":"0", "executor.q.system.size":"0", "executor.q.operations.size":"0", "executor.q.priorityOperation.size":"0", "operations.completed.count":"10", "executor.q.mapLoad.size":"0", "executor.q.mapLoadAllKeys.size":"0", "executor.q.cluster.size":"0", "executor.q.response.size":"0", "operations.running.count":"0", "operations.pending.invocations.percentage":"0.00%", "operations.pending.invocations.count":"0", "proxy.count":"8", "clientEndpoint.count":"0", "connection.active.count":"2", "client.connection.count":"0", "connection.count":"0" } ] ```
    ------------------------------------------------------------------------------------------ ### Submit A Job
    POST /submit-job (Returns jobId and jobName if job submitted successfully.) #### Parameters > | name | type | data type | description | > |----------------------|----------|-----------|----------------------------------------------------------| > | jobId | optional | string | job id | > | jobName | optional | string | job name | > | isStartWithSavePoint | optional | string | if job is started with save point | > | format | optional | string | config format, support json, hocon and sql, default json | #### Body You can choose json, hocon or sql to pass request body. The json format example: ``` json { "env": { "job.mode": "batch" }, "source": [ { "plugin_name": "FakeSource", "plugin_output": "fake", "row.num": 100, "schema": { "fields": { "name": "string", "age": "int", "card": "int" } } } ], "transform": [ ], "sink": [ { "plugin_name": "Console", "plugin_input": ["fake"] } ] } ``` The hocon format example: ``` hocon env { job.mode = "batch" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { name = "string" age = "int" card = "int" } } } } transform { } sink { Console { plugin_input = "fake" } } ``` The SQL format example: ```sql /* config env { parallelism = 2 job.mode = "BATCH" } */ CREATE TABLE fake_source ( id INT, name STRING, age INT ) WITH ( 'connector' = 'FakeSource', 'rows' = '[ { fields = [1, "Alice", 25], kind = INSERT }, { fields = [2, "Bob", 30], kind = INSERT } ]', 'schema' = '{ fields { id = "int", name = "string", age = "int" } }', 'type' = 'source' ); CREATE TABLE console_sink ( id INT, name STRING, age INT ) WITH ( 'connector' = 'Console', 'type' = 'sink' ); INSERT INTO console_sink SELECT * FROM fake_source; ``` #### Responses ```json { "jobId": 733584788375666689, "jobName": "rest_api_test" } ```
    ------------------------------------------------------------------------------------------ ### Submit A Job By Upload Config File
    POST /submit-job/upload (Returns jobId and jobName if job submitted successfully.) #### Parameters > | name | type | data type | description | > |----------------------|----------|-----------|-----------------------------------| > | jobId | optional | string | job id | > | jobName | optional | string | job name | > | isStartWithSavePoint | optional | string | if job is started with save point | #### Request Body The name of the uploaded file key is config_file, and supports the following formats: - `.json` files: parsed in JSON format - `.conf` or `.config` files: parsed in HOCON format - `.sql` files: parsed in SQL format, supports CREATE TABLE and INSERT INTO syntax curl Example : ```bash # Upload HOCON config file curl --location 'http://127.0.0.1:8080/submit-job/upload' --form 'config_file=@"/temp/fake_to_console.conf"' # Upload SQL config file curl --location 'http://127.0.0.1:8080/submit-job/upload' --form 'config_file=@"/temp/job.sql"' ``` #### Responses ```json { "jobId": 733584788375666689, "jobName": "SeaTunnel_Job" } ```
    ------------------------------------------------------------------------------------------ ### Batch Submit Jobs
    POST /submit-jobs (Returns jobId and jobName if the job is successfully submitted.) #### Parameters (add in the `params` field in the request body) > | Parameter Name | Required | Type | Description | > |----------------------|--------------|---------|---------------------------------------| > | jobId | optional | string | job id | > | jobName | optional | string | job name | > | isStartWithSavePoint | optional | string | if the job is started with save point | #### Request Body ```json [ { "params":{ "jobId":"123456", "jobName":"SeaTunnel-01" }, "env": { "job.mode": "batch" }, "source": [ { "plugin_name": "FakeSource", "plugin_output": "fake", "row.num": 1000, "schema": { "fields": { "name": "string", "age": "int", "card": "int" } } } ], "transform": [ ], "sink": [ { "plugin_name": "Console", "plugin_input": ["fake"] } ] }, { "params":{ "jobId":"1234567", "jobName":"SeaTunnel-02" }, "env": { "job.mode": "batch" }, "source": [ { "plugin_name": "FakeSource", "plugin_output": "fake", "row.num": 1000, "schema": { "fields": { "name": "string", "age": "int", "card": "int" } } } ], "transform": [ ], "sink": [ { "plugin_name": "Console", "plugin_input": ["fake"] } ] } ] ``` #### Response ```json [ { "jobId": "123456", "jobName": "SeaTunnel-01" },{ "jobId": "1234567", "jobName": "SeaTunnel-02" } ] ```
    ------------------------------------------------------------------------------------------ ### Stop A Job
    POST /stop-job (Returns jobId if job stopped successfully.) #### Parameters > | name | required | data type | description | > |---------------------|----------|-----------|------------------------------------------------------------------| > | jobId | yes | long | job id | > | isStopWithSavePoint | no | boolean | If the job is stopped with a savepoint. | > | force | no | boolean | If true, the job is force-stopped (ignores isStopWithSavePoint). | #### Body ```json { "jobId": 733584788375666689, "isStopWithSavePoint": false, "force": false } ``` #### Responses ```json { "jobId": 733584788375666689 } ``` **Notes:** - If the job status is `DOING_SAVEPOINT` and the savepoint does not complete successfully, a forced stop (when the `force` option is enabled) will set the job status to `CANCELED`. - A forced stop may leave checkpoint data incomplete or in an inconsistent state. It should be used only for exceptional or abnormal situations.
    ------------------------------------------------------------------------------------------ ### Batch Stop Jobs
    POST /stop-jobs (Returns jobId if the job is successfully stopped.) #### Request Body ```json [ { "jobId": 881432421482889220, "isStopWithSavePoint": false, "force": false }, { "jobId": 881432456517910529, "isStopWithSavePoint": false, "force": false } ] ``` #### Response ```json [ { "jobId": 881432421482889220 }, { "jobId": 881432456517910529 } ] ```
    ------------------------------------------------------------------------------------------ ### Encrypt Config
    POST /encrypt-config (Returns the encrypted config if config is encrypted successfully.) For more information about customize encryption, please refer to the documentation [config-encryption-decryption](../../introduction/concepts/config-encryption-decryption.md). #### Body ```json { "env": { "parallelism": 1, "shade.identifier":"base64" }, "source": [ { "plugin_name": "MySQL-CDC", "schema" : { "fields": { "name": "string", "age": "int" } }, "plugin_output": "fake", "parallelism": 1, "hostname": "127.0.0.1", "username": "seatunnel", "password": "seatunnel_password", "table-name": "inventory_vwyw0n" } ], "transform": [ ], "sink": [ { "plugin_name": "Clickhouse", "host": "localhost:8123", "database": "default", "table": "fake_all", "username": "seatunnel", "password": "seatunnel_password" } ] } ``` #### Responses ```json { "env": { "parallelism": 1, "shade.identifier": "base64" }, "source": [ { "plugin_name": "MySQL-CDC", "schema": { "fields": { "name": "string", "age": "int" } }, "plugin_output": "fake", "parallelism": 1, "hostname": "127.0.0.1", "username": "c2VhdHVubmVs", "password": "c2VhdHVubmVsX3Bhc3N3b3Jk", "table-name": "inventory_vwyw0n" } ], "transform": [], "sink": [ { "plugin_name": "Clickhouse", "host": "localhost:8123", "database": "default", "table": "fake_all", "username": "c2VhdHVubmVs", "password": "c2VhdHVubmVsX3Bhc3N3b3Jk" } ] } ```
    ------------------------------------------------------------------------------------------ ### Update the tags of running node
    POST /update-tags (Because the update can only target a specific node, the current node's `ip:port` must be used for the update. If the update is successful, a success message is returned.) #### update node tags ##### Body If the request parameter is a `Map` object, it indicates that the tags of the current node need to be updated ```json { "tag1": "dev_1", "tag2": "dev_2" } ``` ##### Responses ```json { "status": "success", "message": "update node tags done." } ``` #### remove node tags ##### Body If the parameter is an empty `Map` object, it means that the tags of the current node need to be cleared ```json {} ``` ##### Responses ```json { "status": "success", "message": "update node tags done." } ``` #### Request parameter exception - If the parameter body is empty ##### Responses ```json { "status": "fail", "message": "Request body is empty." } ``` - If the parameter is not a `Map` object ##### Responses ```json { "status": "fail", "message": "Invalid JSON format in request body." } ```
    ------------------------------------------------------------------------------------------ ### Get Logs from All Nodes
    GET /logs/:jobId (Returns a list of logs.) #### Request Parameters #### Parameters (to be added in the `params` field of the request body) > | Parameter Name | Required | Type | Description | > |-----------------------|--------------|---------|------------------------------------| > | jobId | optional | string | job id | If `jobId` is empty, the request will return logs from all nodes. Otherwise, it will return the list of logs for the specified `jobId` from all nodes. #### Response Returns a list of logs from the requested nodes along with their content. #### Return List of All Log Files If you want to view the log list first, you can retrieve it via a `GET` request: `http://localhost:8080/logs?format=json` ```json [ { "node": "localhost:8080", "logLink": "http://localhost:8080/logs/job-899485770241277953.log", "logName": "job-899485770241277953.log" }, { "node": "localhost:8080", "logLink": "http://localhost:8080/logs/job-899470314109468673.log", "logName": "job-899470314109468673.log" } ] ``` Supported formats are `json` and `html`, with `html` as the default. #### Examples Retrieve logs for `jobId` `733584788375666689` across all nodes: `http://localhost:8080/logs/733584788375666689` Retrieve the list of logs from all nodes: `http://localhost:8080/logs` Retrieve the list of logs in JSON format: `http://localhost:8080/logs?format=json` Retrieve the content of a specific log file: `http://localhost:8080/logs/job-898380162133917698.log`
    ### Get Log Content from a Single Node
    GET /log (Returns a list of logs.) #### Response Returns a list of logs from the requested node. #### Examples To get a list of logs from the current node: `http://localhost:5801/log` To get the content of a log file: `http://localhost:5801/log/job-898380162133917698.log`
    ### Get Node Metrics
    GET /metrics GET /openmetrics To get the metrics, you need to enable `Telemetry` first, or you will get an empty response. More information about `Telemetry` can be found in the [Telemetry](telemetry.md) documentation.
    ### Get Job Checkpoint Overview
    GET /jobs/checkpoints/:jobId (Return checkpoint overview of every pipeline). #### Path Parameter - `jobId`: required job identifier. #### Response Example ```json { "jobId": "1234567890", "updatedAt": 1720000000123, "pipelines": [ { "pipelineId": 1, "counts": { "triggered": 10, "completed": 8, "failed": 1, "inProgress": 1, "restored": 2 }, "latestCompleted": { "checkpointId": 9, "checkpointType": "CHECKPOINT_TYPE", "status": "COMPLETED", "triggerTimestamp": 1720000000000, "completedTimestamp": 1720000000450, "durationMillis": 450, "stateSize": 128934 }, "latestFailed": { "checkpointId": 8, "checkpointType": "CHECKPOINT_TYPE", "status": "FAILED", "triggerTimestamp": 1719999995000, "failureReason": "CHECKPOINT_EXPIRED" }, "latestSavepoint": null, "inProgress": [ { "checkpointId": 10, "checkpointType": "CHECKPOINT_TYPE", "triggerTimestamp": 1720000005000, "acknowledged": 2, "total": 4 } ], "history": [ { "pipelineId": 1, "checkpoint": { "checkpointId": 9, "checkpointType": "CHECKPOINT_TYPE", "status": "COMPLETED", "triggerTimestamp": 1720000000000, "completedTimestamp": 1720000000450, "durationMillis": 450, "stateSize": 128934 } } ] } ] } ```
    #### Field Description | Field | Description | | --- | --- | | `jobId` | Job ID. | | `updatedAt` | Latest snapshot timestamp (millisecond). | | `pipelines` | List of pipeline statistics. | | `pipelines[].pipelineId` | Pipeline ID. | | `pipelines[].counts.triggered/completed/failed/inProgress/restored` | Checkpoint statistics:
    - `triggered`: total triggered checkpoints.
    - `completed`: total successful checkpoints.
    - `failed`: total failed checkpoints.
    - `inProgress`: checkpoints currently running.
    - `restored`: number of restore (including savepoint) attempts. | | `pipelines[].latestCompleted/latestFailed/latestSavepoint` | Metadata of the latest completed/failed/savepoint checkpoints (see table below for field definitions). | | `pipelines[].inProgress` | Ongoing checkpoints with details:
    - `checkpointId`: ID of the running checkpoint.
    - `checkpointType`: type (`CHECKPOINT_TYPE`, savepoint, etc.).
    - `triggerTimestamp`: when it was triggered (ms).
    - `acknowledged`: number of subtasks that have ACKed.
    - `total`: total subtasks requiring ACK. | | `pipelines[].history` | Ring-buffer history (default 32 entries) ordered latest-first; each entry contains `pipelineId` plus checkpoint metadata. | Checkpoint metadata fields: | Field | Description | | --- | --- | | `checkpointId` | Checkpoint identifier. | | `checkpointType` | Checkpoint type. | | `status` | `COMPLETED`, `FAILED`, or `CANCELED`. | | `triggerTimestamp` | Trigger time in milliseconds. | | `completedTimestamp` | Completion time (only for success). | | `durationMillis` | Duration in milliseconds. | | `stateSize` | State size in bytes. | | `failureReason` | Failure/cancel reason, optional. | ### Get Job Checkpoint History
    GET /jobs/checkpoints/history/:jobId (Return checkpoint history records.) #### Query Parameters | Name | Description | | --- | --- | | `jobId` | Required job ID (path). | | `pipelineId` | Optional pipeline filter. | | `limit` | Optional limit (default 20). | | `status` | Optional status filter: `COMPLETED`, `FAILED`, `CANCELED`. | #### Response Example ```json [ { "pipelineId": 1, "checkpoint": { "checkpointId": 9, "checkpointType": "CHECKPOINT_TYPE", "status": "COMPLETED", "triggerTimestamp": 1720000000000, "completedTimestamp": 1720000000450, "durationMillis": 450, "stateSize": 128934 } }, { "pipelineId": 1, "checkpoint": { "checkpointId": 8, "checkpointType": "CHECKPOINT_TYPE", "status": "FAILED", "triggerTimestamp": 1719999995000, "failureReason": "CHECKPOINT_EXPIRED" } } ] ```
    #### Field Description | Field | Description | | --- | --- | | `pipelineId` | ID of the pipeline to which the record belongs. | | `checkpoint` | Checkpoint metadata described above. | ================================================ FILE: docs/en/engines/zeta/security.md ================================================ # Security ## Basic Authentication You can secure your Web UI by enabling basic authentication. This will require users to enter a username and password when accessing the web interface. | Parameter Name | Required | Description | |----------------|----------|-------------| | `enable-basic-auth` | No | Whether to enable basic authentication, default is `false` | | `basic-auth-username` | No | The username for basic authentication, default is `admin` | | `basic-auth-password` | No | The password for basic authentication, default is `admin` | ```yaml seatunnel: engine: http: enable-http: true port: 8080 enable-basic-auth: true basic-auth-username: "your_username" basic-auth-password: "your_password" ``` ## HTTPS Configuration You can secure your REST-API-V2 service by enabling HTTPS. Both HTTP and HTTPS can be enabled simultaneously, or only one of them can be enabled. 
| Parameter Name | Required | Description | |----------------|----------|-------------| | `enable-http` | No | Whether to enable HTTP service, default is `true` | | `port` | No | HTTP service port, default is `8080` | | `enable-https` | No | Whether to enable HTTPS service, default is `false` | | `https-port` | No | HTTPS service port, default is `8443` | | `key-store-path` | Required when `enable-https` is `true` | Path to the KeyStore file, used to store the server's private key and certificate | | `key-store-password` | Required when `enable-https` is `true` | KeyStore password | | `key-manager-password` | Required when `enable-https` is `true` | KeyManager password, usually the same as the KeyStore password | | `trust-store-path` | No | Path to the TrustStore file, used to verify client certificates | | `trust-store-password` | No | TrustStore password | **Note**: When `trust-store-path` and `trust-store-password` are not empty, mutual SSL authentication (client authentication) will be enabled, requiring the client to provide a valid certificate. 
```yaml seatunnel: engine: http: enable-http: true port: 8080 enable-https: true https-port: 8443 key-store-path: "${YOUR_KEY_STORE_PATH}" key-store-password: "${YOUR_KEY_STORE_PASSWORD}" key-manager-password: "${YOUR_KEY_MANAGER_PASSWORD}" # Optional: Mutual authentication trust-store-path: "${YOUR_TRUST_STORE_PATH}" trust-store-password: "${YOUR_TRUST_STORE_PASSWORD}" ``` ### Example of Generating Keys ```shell #!/bin/bash # Define the project root directory PROJECT_DIR="/Users/mac/IdeaProjects/data" # Define passwords SERVER_KEYSTORE_PASSWORD="server_keystore_password" SERVER_KEY_PASSWORD="server_keystore_password" CLIENT_KEYSTORE_PASSWORD="client_keystore_password" CLIENT_KEY_PASSWORD="client_keystore_password" SERVER_TRUSTSTORE_PASSWORD="server_truststore_password" CLIENT_TRUSTSTORE_PASSWORD="client_truststore_password" # Generate server keystore keytool -genkeypair \ -alias server \ -keyalg RSA \ -keysize 2048 \ -validity 365 \ -keystore "$PROJECT_DIR/server_keystore.jks" \ -storepass "$SERVER_KEYSTORE_PASSWORD" \ -keypass "$SERVER_KEY_PASSWORD" \ -dname "CN=localhost,OU=IT,O=MyCompany,L=Shanghai,ST=Shanghai,C=CN" # Export server certificate keytool -exportcert \ -alias server \ -keystore "$PROJECT_DIR/server_keystore.jks" \ -storepass "$SERVER_KEYSTORE_PASSWORD" \ -file "$PROJECT_DIR/server.crt" # Generate client keystore keytool -genkeypair \ -alias client \ -keyalg RSA \ -keysize 2048 \ -validity 365 \ -keystore "$PROJECT_DIR/client_keystore.jks" \ -storepass "$CLIENT_KEYSTORE_PASSWORD" \ -keypass "$CLIENT_KEY_PASSWORD" \ -dname "CN=client,OU=IT,O=MyCompany,L=Shanghai,ST=Shanghai,C=CN" # Export client certificate keytool -exportcert \ -alias client \ -keystore "$PROJECT_DIR/client_keystore.jks" \ -storepass "$CLIENT_KEYSTORE_PASSWORD" \ -file "$PROJECT_DIR/client.crt" # Create server truststore and import client certificate keytool -importcert \ -alias client \ -file "$PROJECT_DIR/client.crt" \ -keystore "$PROJECT_DIR/server_truststore.jks" \ -storepass 
"$SERVER_TRUSTSTORE_PASSWORD" \ -noprompt # Create client truststore and import server certificate keytool -importcert \ -alias server \ -file "$PROJECT_DIR/server.crt" \ -keystore "$PROJECT_DIR/client_truststore.jks" \ -storepass "$CLIENT_TRUSTSTORE_PASSWORD" \ -noprompt ``` ================================================ FILE: docs/en/engines/zeta/separated-cluster-deployment.md ================================================ --- sidebar_position: 6 --- # Deploy SeaTunnel Engine In Separated Cluster Mode The Master service and Worker service of SeaTunnel Engine are separated, and each service is a separate process. The Master node is only responsible for job scheduling, RESTful API, task submission, etc., and the Imap data is only stored on the Master node. The Worker node is only responsible for the execution of tasks and does not participate in the election to become the master nor stores Imap data. Among all the Master nodes, only one Master node works at the same time, and the other Master nodes are in the standby state. When the current Master node fails or the heartbeat times out, a new Master Active node will be elected from the other Master nodes. This is the most recommended usage method. In this mode, the load on the Master will be very low, and the Master has more resources for job scheduling, task fault tolerance index monitoring, and providing RESTful API services, etc., and will have higher stability. At the same time, the Worker node does not store Imap data. All Imap data is stored on the Master node. Even if the Worker node has a high load or crashes, it will not cause the Imap data to be redistributed. ## 1. Download [Download And Make SeaTunnel Installation Package](download-seatunnel.md) ## 2. Configure SEATUNNEL_HOME You can configure `SEATUNNEL_HOME` by adding the `/etc/profile.d/seatunnel.sh` file. 
The content of `/etc/profile.d/seatunnel.sh` is as follows: ``` export SEATUNNEL_HOME=${seatunnel install path} export PATH=$PATH:$SEATUNNEL_HOME/bin ``` ## 3. Configure JVM Options For Master Nodes The JVM parameters of the Master node are configured in the `$SEATUNNEL_HOME/config/jvm_master_options` file. ```shell # JVM Heap -Xms2g -Xmx2g # JVM Dump -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-server # Metaspace -XX:MaxMetaspaceSize=2g # G1GC -XX:+UseG1GC ``` The JVM parameters of the Worker node are configured in the `$SEATUNNEL_HOME/config/jvm_worker_options` file. ```shell # JVM Heap -Xms2g -Xmx2g # JVM Dump -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-server # Metaspace -XX:MaxMetaspaceSize=2g # G1GC -XX:+UseG1GC ``` ## 4. Configure SeaTunnel Engine SeaTunnel Engine provides many functions and needs to be configured in `seatunnel.yaml`. ### 4.1 Setting the backup number of data in Imap (this parameter is not effective on the Worker node) SeaTunnel Engine implements cluster management based on [Hazelcast IMDG](https://docs.hazelcast.com/imdg/4.1/). The status data of the cluster (job running status, resource status) is stored in [Hazelcast IMap](https://docs.hazelcast.com/imdg/4.1/data-structures/map). The data stored in Hazelcast IMap will be distributed and stored on all nodes of the cluster. Hazelcast partitions the data stored in Imap. Each partition can specify the number of backups. Therefore, SeaTunnel Engine can achieve cluster HA without using other services (such as zookeeper). The `backup count` is a parameter that defines the number of synchronous backups. For example, if it is set to 1, the backup of the partition will be placed on one other member. If it is set to 2, it will be placed on two other members. We recommend that the value of `backup-count` be `max(1, min(5, N/2))`. `N` is the number of cluster nodes. 
```yaml seatunnel: engine: backup-count: 1 # other configurations ``` :::tip Since in the separated cluster mode, the Worker node does not store Imap data, the `backup-count` configuration of the Worker node is not effective. If the Master and Worker processes are started on the same machine, the Master and Worker will share the `seatunnel.yaml` configuration file. At this time, the Worker node service will ignore the `backup-count` configuration. ::: ### 4.2 Slot configuration (this parameter is not effective on the Master node) The number of Slots determines the number of task groups that can be run in parallel on the cluster node. The number of Slots required by a task is formulated as N = 2 + P (parallelism configured by the task). By default, the number of Slots of SeaTunnel Engine is dynamic, that is, there is no limit on the number. We recommend that the number of slots be set to twice the number of CPU cores on the node; this is also the default value when `dynamic-slot` is set to false and `slot-num` is not set. The configuration of dynamic slot number (default) is as follows: ```yaml seatunnel: engine: slot-service: dynamic-slot: true # other configurations ``` The configuration of static slot number is as follows: ```yaml seatunnel: engine: slot-service: dynamic-slot: false slot-num: 20 ``` :::tip Since the Master node does not run tasks in the separated cluster mode, the Master service will not start the Slot service, and the `slot-service` configuration of the Master node is not effective. If the Master and Worker processes are started on the same machine, the Master and Worker will share the `seatunnel.yaml` configuration file. At this time, the Master node service will ignore the `slot-service` configuration. ::: ### 4.3 Checkpoint Manager (This parameter is invalid on the Worker node) Just like Flink, the SeaTunnel Engine supports the Chandy–Lamport algorithm. Therefore, data synchronization without data loss and duplication can be achieved. 
**interval** The interval between two checkpoints, in milliseconds. If the `checkpoint.interval` parameter is configured in the `env` of the job configuration file, it will be subject to the setting in the job configuration file. **timeout** The timeout time of the checkpoint. If the checkpoint cannot be completed within the timeout time, it will trigger a checkpoint failure and the job fails. If the `checkpoint.timeout` parameter is configured in the `env` of the job configuration file, it will be subject to the setting in the job configuration file. **min-pause** The minimum pause (in milliseconds) between consecutive checkpoints. This ensures that checkpoints are not triggered too frequently. Example ```yaml seatunnel: engine: backup-count: 1 print-execution-info-interval: 10 slot-service: dynamic-slot: true checkpoint: interval: 300000 timeout: 10000 min-pause: 5000 ``` **checkpoint storage** The checkpoint is a fault-tolerant recovery mechanism. This mechanism ensures that when the program is running, even if it suddenly encounters an exception, it can recover by itself. The checkpoints are triggered regularly, and when each checkpoint is performed, each Task will be required to report its own state information (such as which offset has been read when reading Kafka) to the checkpoint thread, which writes it into a distributed storage (or shared storage). When the task fails and then automatically recovers from fault tolerance, or when recovering a previously paused task through the seatunnel.sh -r instruction, the state information of the corresponding job will be loaded from the checkpoint storage, and the job will be recovered based on this state information. If the number of nodes in the cluster is greater than 1, the checkpoint storage must be a distributed storage or a shared storage, so as to ensure that the task state information stored in it can still be loaded on another node after any node fails. 
:::tip The checkpoint configuration is only read by the Master service, and the Worker service will not read the checkpoint configuration. If the Master and Worker processes are started on the same machine, the Master and Worker will share the `seatunnel.yaml` configuration file, and at this time the Worker node service will ignore the `checkpoint` configuration. ::: For information about checkpoint storage, you can view [checkpoint storage](checkpoint-storage.md). ### 4.4 History Job Expiry Configuration The information of each completed job, such as status, counters, and error logs, is stored in an IMap object. As the number of running jobs increases, the memory will increase, and eventually the memory will overflow. Therefore, you can adjust the `history-job-expire-minutes` parameter to solve this problem. The time unit of this parameter is minutes. The default value is 1440 minutes, that is, one day. Example ```yaml seatunnel: engine: history-job-expire-minutes: 1440 ``` ### 4.5 Class Loader Cache Mode This configuration mainly solves the problem of resource leakage caused by continuously creating and attempting to destroy class loaders. If you encounter an exception related to metaspace space overflow, you can try to enable this configuration. In order to reduce the frequency of creating class loaders, after enabling this configuration, SeaTunnel will not try to release the corresponding class loader when the job is completed, so that it can be used by subsequent jobs, that is to say, when not too many types of Source/Sink connector are used in the running job, it is more effective. The default value is true. Example ```yaml seatunnel: engine: classloader-cache-mode: true ``` ### 4.6 Persistence Configuration of IMap (This parameter is invalid on the Worker node) :::tip Since in the separated cluster mode, only the Master node stores IMap data and the Worker node does not store IMap data, the Worker service will not read this parameter item. 
::: In SeaTunnel, we use IMap (a distributed Map that can implement the writing and reading of data across nodes and processes. For detailed information, please refer to [hazelcast map](https://docs.hazelcast.com/imdg/4.2/data-structures/map)) to store the state of each job and its tasks, so that after the node where a task is located fails, the task's previous state information can be obtained on other nodes, thereby recovering the task and realizing the fault tolerance of the task. By default, the information of IMap is only stored in the memory, and we can set the number of replicas of IMap data. For details, refer to (4.1 Setting the number of backups of data in IMap). If the number of replicas is 2, it means that each piece of data will be simultaneously stored on 2 different nodes. Once a node fails, the data in IMap will be automatically replenished to the set number of replicas on other nodes. But when all nodes are stopped, the data in IMap will be lost. When the cluster nodes are started again, all previously running tasks will be marked as failed and need to be recovered manually by the user through the seatunnel.sh -r instruction. To solve this problem, we can persist the data in IMap to an external storage such as HDFS, OSS, etc. In this way, even if all nodes are stopped, the data in IMap will not be lost, and when the cluster nodes are started again, all previously running tasks will be automatically recovered. The following describes how to use the MapStore persistence configuration. For detailed information, please refer to [hazelcast map](https://docs.hazelcast.com/imdg/4.2/data-structures/map) **type** The type of IMap persistence, currently only supports `hdfs`. **namespace** It is used to distinguish the data storage locations of different businesses, such as the OSS bucket name. **clusterName** This parameter is mainly used for cluster isolation.
We can use it to distinguish different clusters, such as cluster1, cluster2, which is also used to distinguish different businesses. **fs.defaultFS** We use the hdfs api to read and write files, so providing the hdfs configuration is required for using this storage. If you use HDFS, you can configure it like this: ```yaml map: engine*: map-store: enabled: true initial-mode: EAGER factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory properties: type: hdfs namespace: /tmp/seatunnel/imap clusterName: seatunnel-cluster storage.type: hdfs fs.defaultFS: hdfs://localhost:9000 ``` If there is no HDFS and your cluster has only one node, you can configure it like this to use local files: ```yaml map: engine*: map-store: enabled: true initial-mode: EAGER factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory properties: type: hdfs namespace: /tmp/seatunnel/imap clusterName: seatunnel-cluster storage.type: hdfs fs.defaultFS: file:/// ``` If you use OSS, you can configure it like this: ```yaml map: engine*: map-store: enabled: true initial-mode: EAGER factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory properties: type: hdfs namespace: /tmp/seatunnel/imap clusterName: seatunnel-cluster storage.type: oss block.size: block size(bytes) oss.bucket: oss://bucket name/ fs.oss.accessKeyId: OSS access key id fs.oss.accessKeySecret: OSS access key secret fs.oss.endpoint: OSS endpoint ``` Notice: When using OSS, make sure that the following jars are in the lib directory. ``` aliyun-sdk-oss-3.13.2.jar hadoop-aliyun-3.3.6.jar jdom2-2.0.6.jar netty-buffer-4.1.89.Final.jar netty-common-4.1.89.Final.jar seatunnel-hadoop3-3.1.4-uber.jar ``` It is possible to utilize S3 for IMAP storage. The S3 configuration properties follow the Hadoop S3A filesystem (Native S3) standard. 
Specifically, we utilize the `fs.s3a.access.key` and `fs.s3a.secret.key` properties to ensure compatibility with existing Hadoop-based ecosystems. If you would like to use S3-compatible storage such as MinIO, you can configure it like this: ```yaml map: engine*: map-store: enabled: true initial-mode: EAGER factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory properties: type: hdfs namespace: /seatunnel/engine clusterName: seatunnel storage.type: s3 s3.bucket: s3a://your-bucket fs.defaultFS: s3a://your-bucket fs.s3a.endpoint: http://your-minio-endpoint:port fs.s3a.path.style.access: true fs.s3a.access.key: YOUR_ACCESS_KEY fs.s3a.secret.key: YOUR_SECRET_KEY fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider ``` Notice: When using S3, make sure that the following jars are in the lib directory. ``` seatunnel-hadoop3-3.1.4-uber.jar seatunnel-hadoop-aws.jar ``` ### 4.7 Job Scheduling Strategy When resources are insufficient, the job scheduling strategy can be configured in the following two modes: 1. `WAIT`: Wait for resources to be available. 2. `REJECT`: Reject the job, default value. Example ```yaml seatunnel: engine: job-schedule-strategy: WAIT ``` When `dynamic-slot: true` is used, the `job-schedule-strategy: WAIT` configuration will become invalid and will be forcibly changed to `job-schedule-strategy: REJECT`, because this parameter is meaningless in dynamic slots. ### 4.8 Coordinator Service CoordinatorService is responsible for the process of generating each job from a LogicalDag to an ExecutionDag, and then to a PhysicalDag. It ultimately creates the JobMaster for the job to handle scheduling, execution, and state monitoring.
**core-thread-num** The core pool size (corePoolSize) of the cached thread pool used by the SeaTunnel coordinator's job executor. **max-thread-num** The maximum number of jobs that can be executed at the same time. Example ```yaml coordinator-service: core-thread-num: 30 max-thread-num: 1000 ``` ### 4.9 Job Metrics Partition Count (This parameter is invalid on the Worker node) A new configuration option, `job-metrics-partition-count` (JOB_METRICS_PARTITION_COUNT), controls the number of partitions used to store running job metrics in Hazelcast IMap. - Default: 1 (single key, backward compatible) - Usage: Increase this value to distribute metrics across multiple partitions and reduce contention when many tasks update metrics concurrently. Example: ```yaml seatunnel: engine: job-metrics-partition-count: 4 ``` This will distribute metrics across 4 partitions instead of using a single key. Increasing the partition count provides significant benefits when the number of tasks exceeds approximately 20,000. As a practical guideline, a partition count of around 1,000–2,000 tends to offer the best balance between reducing lock contention and minimizing overhead. It is recommended to start with this value and then adjust based on your cluster size and workload characteristics. Note: Increasing the partition count may improve concurrency under heavy contention, but setting it too high can introduce additional overhead in distribution and merging, which can reduce overall performance. The partition count should be configured before starting a job. Changing the partition count after a job has started may result in metric key mismatches, so it is recommended to restart SeaTunnel after modifying this option. ## 5. Configuring SeaTunnel Engine Network Services All network-related configurations of the SeaTunnel Engine are in the `hazelcast-master.yaml` and `hazelcast-worker.yaml` files. ### 5.1 cluster-name SeaTunnel Engine nodes use the `cluster-name` to determine whether another node is in the same cluster as themselves.
If the cluster names between two nodes are different, the SeaTunnel Engine will reject service requests. ### 5.2 network Based on [Hazelcast](https://docs.hazelcast.com/imdg/4.1/clusters/discovery-mechanisms), a SeaTunnel Engine cluster is a network composed of cluster members running the SeaTunnel Engine server. Cluster members automatically join together to form a cluster. This automatic joining is through the various discovery mechanisms used by cluster members to discover each other. Please note that after the cluster is formed, the communication between cluster members is always through TCP/IP regardless of the discovery mechanism used. The SeaTunnel Engine uses the following discovery mechanisms. #### tcp-ip You can configure the SeaTunnel Engine as a complete TCP/IP cluster. For configuration details, please refer to the [Discovering Members by TCP section](tcp.md). In the separated cluster mode, the Master and Worker services use different ports. Master node network configuration `hazelcast-master.yaml` ```yaml hazelcast: cluster-name: seatunnel network: rest-api: enabled: true endpoint-groups: CLUSTER_WRITE: enabled: true DATA: enabled: true join: tcp-ip: enabled: true member-list: - master-node-1:5801 - master-node-2:5801 - worker-node-1:5802 - worker-node-2:5802 port: auto-increment: false port: 5801 properties: hazelcast.heartbeat.failuredetector.type: phi-accrual hazelcast.heartbeat.interval.seconds: 2 hazelcast.max.no.heartbeat.seconds: 180 hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10 hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200 hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100 ``` Worker node network configuration `hazelcast-worker.yaml` ```yaml hazelcast: cluster-name: seatunnel network: join: tcp-ip: enabled: true member-list: - master-node-1:5801 - master-node-2:5801 - worker-node-1:5802 - worker-node-2:5802 port: auto-increment: false port: 5802 properties: 
hazelcast.heartbeat.failuredetector.type: phi-accrual hazelcast.heartbeat.interval.seconds: 2 hazelcast.max.no.heartbeat.seconds: 180 hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10 hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200 hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100 ``` TCP is the way we recommend to use in a standalone SeaTunnel Engine cluster. On the other hand, Hazelcast provides some other service discovery methods. For details, please refer to [hazelcast network](https://docs.hazelcast.com/imdg/4.1/clusters/setting-up-clusters). ## 6. Starting the SeaTunnel Engine Master Node It can be started using the `-d` parameter through the daemon. ```shell mkdir -p $SEATUNNEL_HOME/logs ./bin/seatunnel-cluster.sh -d -r master ``` The logs will be written to `$SEATUNNEL_HOME/logs/seatunnel-engine-master.log`. ## 7. Starting The SeaTunnel Engine Worker Node It can be started using the `-d` parameter through the daemon. ```shell mkdir -p $SEATUNNEL_HOME/logs ./bin/seatunnel-cluster.sh -d -r worker ``` The logs will be written to `$SEATUNNEL_HOME/logs/seatunnel-engine-worker.log`. ## 8. Submit And Manage Jobs ### 8.1 Submit Jobs With The SeaTunnel Engine Client #### Installing The SeaTunnel Engine Client ##### Setting the `SEATUNNEL_HOME` the same as the server You can configure the `SEATUNNEL_HOME` by adding the `/etc/profile.d/seatunnel.sh` file. The content of `/etc/profile.d/seatunnel.sh` is as follows: ``` export SEATUNNEL_HOME=${seatunnel install path} export PATH=$PATH:$SEATUNNEL_HOME/bin ``` ##### Configuring The SeaTunnel Engine Client All configurations of the SeaTunnel Engine client are in the `hazelcast-client.yaml`. **cluster-name** The client must have the same `cluster-name` as the SeaTunnel Engine. Otherwise, the SeaTunnel Engine will reject the client's request. **network** All addresses of the SeaTunnel Engine Master nodes need to be added here. 
```yaml hazelcast-client: cluster-name: seatunnel properties: hazelcast.logging.type: log4j2 network: cluster-members: - master-node-1:5801 - master-node-2:5801 ``` #### Submitting And Managing Jobs Now that the cluster has been deployed, you can complete the job submission and management through the following tutorial: [Submitting And Managing Jobs](user-command.md). ### 8.2 Submit Jobs With The REST API The SeaTunnel Engine provides a REST API for submitting and managing jobs. For more information, please refer to [REST API V2](rest-api-v2.md) ================================================ FILE: docs/en/engines/zeta/slot-allocation-strategy.md ================================================ # Slot Allocation Strategy Slot allocation strategy is an important part of SeaTunnel Engine, which determines how SeaTunnel Engine allocates tasks to different slots. The slot allocation strategy is a configurable component, and users can configure the slot allocation strategy according to their needs. **Configuration method:** Set the parameter `slot-allocation-strategy`, optional values are `RANDOM`, `SYSTEM_LOAD`, `SLOT_RATIO`. Example: ```yaml seatunnel: engine: slot-service: slot-allocation-strategy: RANDOM ``` ## RANDOM (default value) The random allocation strategy is the default slot allocation strategy of SeaTunnel Engine, which randomly allocates tasks to different slots. ## SYSTEM_LOAD The system load strategy allocates slots based on the system load, dynamically adjusting the slot allocation according to the system load. ### 1. **Design of time weight** Time weight reflects the impact of time on scheduling priority: - Recent data is given higher weight, and historical data gradually decays. 
- Using the distribution $4, 2, 2, 1, 1$ and normalizing it, the time weight for each statistic is: $$ \text{Time weight ratio} = \frac{\text{Current weight}}{10} $$ > When the cluster is just started and there are less than 5 data points, normalization is done separately, and the calculation formula will be dynamically adjusted, which will not be elaborated here. ### 2. **Resource utilization calculation** Evaluate the idle rate of CPU and memory resources comprehensively according to the weight: $$ \text{Resource idle rate} = \frac{(1 - \text{CPU utilization}) \cdot \text{CPU weight} + (1 - \text{Memory utilization}) \cdot \text{Memory weight}}{\text{CPU weight} + \text{Memory weight}} $$ - $(1 - \text{CPU utilization})$ and $(1 - \text{Memory utilization})$ in the formula are idle rates. - The weights of CPU and memory can be adjusted according to specific needs (e.g., $0.6$ and $0.4$), flexibly adapting to different scenarios. ### 3. **Time decay and scheduling priority formula** After introducing time weight decay, the formula for calculating scheduling priority is: $$ \text{Comprehensive resource idle rate} = \sum_{i=1}^{5} \left( \frac{(1 - \text{CPU utilization}_i) \cdot \text{CPU weight} + (1 - \text{Memory utilization}_i) \cdot \text{Memory weight}}{\text{CPU weight} + \text{Memory weight}} \cdot \text{Time weight}_i \right) $$ ### 4. 
**Dynamic adjustment of resource idle rate for slot allocation** When allocating multiple slots, considering the real-time update and dynamic simulation of resource status (because the resource load of the same task will not change quickly): - **Resource ratio used by each slot** = (1 - Comprehensive resource idle rate) ÷ Number of allocated slots - Update the idle rate of the corresponding node after allocating the slot: $$ \text{Idle rate after slot allocation} = \text{Comprehensive resource idle rate} - \text{Resource ratio used by each slot} $$ - By default, a single slot uses 10% of resources (it is not known how much resources a slot occupies when it is first started, so it is set to 10% by default. The reason for not setting it too low is to prevent allocating too many resources and causing the node to be overloaded. The next time monitoring information is captured, it will be relatively accurate). This method makes scheduling more in line with the actual resource usage. ### 5. **Introduction of balance factor** Only dynamically adjusting the resource idle rate through slot allocation may also have errors. We introduce a balance factor based on the number of slots to measure the current load status of the node and avoid over-concentration of scheduling resource allocation: > This number can be counted in real-time to optimize the scheduling priority indicator. $$ \text{BalanceFactor}_i = 1 - \frac{S_{\text{used},i}}{S_{\text{total},i}} $$ - $S_{\text{used},i}$: Number of slots allocated to node $i$. - $S_{\text{total},i}$: Total number of slots of node $i$. Adjust the scheduling priority through the balance factor: $$ W_i = \alpha \cdot \text{Idle rate after slot allocation}_i + \beta \cdot \text{BalanceFactor}_i $$ **Parameter meaning**: - $\alpha$: Weight focusing on resource utilization: 0.7 - $\beta$: Weight of the balance factor to prevent single-point overload: 0.3 ### 6. 
**Dynamic adjustment logic** - Collect CPU and memory utilization regularly, maintaining the most recent 5 statistics. - Dynamically update weights for the same task, gradually decaying old data. - Dynamic balance based on slot usage. > Explanation: > For example, if we have two nodes and need to allocate 10 slots, A has 10 idle slots, and B has 20 idle slots. After calculating the weights of the 10 slots through steps 4 and 5, the weights of node A are higher than those of node B. > Then we still think that node A should allocate resources. This may be because the slot configuration of node B in the cluster is not optimal (the slot configuration of the worker node is too small). ## SLOT_RATIO The slot ratio strategy schedules based on the slot usage rate, with higher priority given to slots with lower usage rates. **Calculation logic**: 1. Get the total number of slots of the worker. 2. Get the number of unallocated slots. 3. Usage rate = (Total number of slots - Number of unallocated slots) / Total number of slots. ================================================ FILE: docs/en/engines/zeta/tcp.md ================================================ --- sidebar_position: 10 --- # TCP Network If multicast is not the preferred way of discovery for your environment, then you can configure SeaTunnel Engine to be a full TCP/IP cluster. When you configure SeaTunnel Engine to discover members by TCP/IP, you must list all or a subset of the members' host names and/or IP addresses as cluster members. You do not have to list all of these cluster members, but at least one of the listed members has to be active in the cluster when a new member joins. To configure your Hazelcast to be a full TCP/IP cluster, set the following configuration elements. See the tcp-ip element section for the full descriptions of the TCP/IP discovery configuration elements. - Set the enabled attribute of the tcp-ip element to true. - Provide your member elements within the tcp-ip element. 
The following is an example declarative configuration. ```yaml hazelcast: network: join: tcp-ip: enabled: true member-list: - machine1 - machine2 - machine3:5799 - 192.168.1.0-7 - 192.168.1.21 ``` As shown above, you can provide IP addresses or host names for member elements. You can also give a range of IP addresses, such as `192.168.1.0-7`. Instead of providing members line-by-line as shown above, you also have the option to use the members element and write comma-separated IP addresses, as shown below. `192.168.1.0-7,192.168.1.21` If you do not provide ports for the members, Hazelcast automatically tries the ports `5701`, `5702` and so on. ================================================ FILE: docs/en/engines/zeta/telemetry.md ================================================ --- sidebar_position: 14 --- # Telemetry Integrating `Metrics` through `Prometheus-exports` makes it possible to connect seamlessly to monitoring platforms such as Prometheus and Grafana, improving the monitoring and alerting capabilities of the SeaTunnel cluster. You can configure telemetry in the `seatunnel.yaml` file. The following is an example declarative configuration. ```yaml seatunnel: engine: telemetry: metric: enabled: true # Whether to enable metrics export ``` ## Metrics The [metric text of prometheus](./telemetry/metrics.txt) is obtained from `http://{instanceHost}:5801/hazelcast/rest/instance/metrics`. The [metric text of openMetrics](./telemetry/openmetrics.txt) is obtained from `http://{instanceHost}:5801/hazelcast/rest/instance/openmetrics`. Available metrics include the following categories. Note: All metrics have the same label name `cluster`, whose value is the configured `hazelcast.cluster-name`.
### Node Metrics | MetricName | Type | Labels | DESCRIPTION | |-------------------------------------------|-------|------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------| | cluster_info | Gauge | **hazelcastVersion**, the version of hazelcast. **master**, seatunnel master address. | Cluster info | | cluster_time | Gauge | **hazelcastVersion**, the version of hazelcast. | Cluster time | | node_count | Gauge | - | Cluster node total count | | node_state | Gauge | **address**, server instance address,for example: "127.0.0.1:5801" | Whether is up of seatunnel node | | hazelcast_executor_executedCount | Gauge | **type**, the type of executor, including: "async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | The hazelcast executor executedCount of seatunnel cluster node | | hazelcast_executor_isShutdown | Gauge | **type**, the type of executor, including: "async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | The hazelcast executor isShutdown of seatunnel cluster node | | hazelcast_executor_isTerminated | Gauge | **type**, the type of executor, including: "async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | The hazelcast executor isTerminated of seatunnel cluster node | | hazelcast_executor_maxPoolSize | Gauge | **type**, the type of executor, including: "async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | The hazelcast executor maxPoolSize of seatunnel cluster node | | hazelcast_executor_poolSize | Gauge | **type**, the type of executor, including: "async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | The hazelcast executor poolSize of seatunnel cluster node | | hazelcast_executor_queueRemainingCapacity | Gauge | **type**, the type of executor, 
including: "async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | The hazelcast executor queueRemainingCapacity of seatunnel cluster node | | hazelcast_executor_queueSize | Gauge | **type**, the type of executor, including: "async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | The hazelcast executor queueSize of seatunnel cluster node | | hazelcast_partition_partitionCount | Gauge | - | The partitionCount of seatunnel cluster node | | hazelcast_partition_activePartition | Gauge | - | The activePartition of seatunnel cluster node | | hazelcast_partition_isClusterSafe | Gauge | - | Whether is cluster safe of partition | | hazelcast_partition_isLocalMemberSafe | Gauge | - | Whether is local member safe of partition | ### Thread Pool Status | MetricName | Type | Labels | DESCRIPTION | |-------------------------------------|---------|--------------------------------------------------------------------|--------------------------------------------------------------------------------| | job_thread_pool_activeCount | Gauge | **address**, server instance address,for example: "127.0.0.1:5801" | The activeCount of seatunnel coordinator job's executor cached thread pool | | job_thread_pool_corePoolSize | Gauge | **address**, server instance address,for example: "127.0.0.1:5801" | The corePoolSize of seatunnel coordinator job's executor cached thread pool | | job_thread_pool_maximumPoolSize | Gauge | **address**, server instance address,for example: "127.0.0.1:5801" | The maximumPoolSize of seatunnel coordinator job's executor cached thread pool | | job_thread_pool_poolSize | Gauge | **address**, server instance address,for example: "127.0.0.1:5801" | The poolSize of seatunnel coordinator job's executor cached thread pool | | job_thread_pool_queueTaskCount | Gauge | **address**, server instance address,for example: "127.0.0.1:5801" | The queueTaskCount of seatunnel coordinator job's executor cached thread 
pool | | job_thread_pool_completedTask_total | Counter | **address**, server instance address,for example: "127.0.0.1:5801" | The completedTask of seatunnel coordinator job's executor cached thread pool | | job_thread_pool_task_total | Counter | **address**, server instance address,for example: "127.0.0.1:5801" | The taskCount of seatunnel coordinator job's executor cached thread pool | | job_thread_pool_rejection_total | Counter | **address**, server instance address,for example: "127.0.0.1:5801" | The rejectionCount of seatunnel coordinator job's executor cached thread pool | | ### Job info detail | MetricName | Type | Labels | DESCRIPTION | |------------|-------|-----------------------------------------------------------------------------------------------------------------------------|-------------------------------------| | job_count | Gauge | **type**, the type of job, including: "canceled" "cancelling" "created" "failed" "failing" "finished" "running" "scheduled" | All job counts of seatunnel cluster | ### JVM Metrics | MetricName | Type | Labels | DESCRIPTION | |--------------------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------| | jvm_threads_current | Gauge | - | Current thread count of a JVM | | jvm_threads_daemon | Gauge | - | Daemon thread count of a JVM | | jvm_threads_peak | Gauge | - | Peak thread count of a JVM | | jvm_threads_started_total | Counter | - | Started thread count of a JVM | | jvm_threads_deadlocked | Gauge | - | Cycles of JVM-threads that are in deadlock waiting to acquire object monitors or ownable synchronizers | | jvm_threads_deadlocked_monitor | Gauge | - | Cycles of JVM-threads that are in deadlock waiting to acquire object monitors | | jvm_threads_state | Gauge | **state**, the state of 
jvm thread, including: "NEW" "TERMINATED" "RUNNABLE" "BLOCKED" "WAITING" "TIMED_WAITING" "UNKNOWN" | Current count of threads by state | | jvm_classes_currently_loaded | Gauge | - | The number of classes that are currently loaded in the JVM | | jvm_classes_loaded_total | Counter | - | The total number of classes that have been loaded since the JVM has started execution | | jvm_classes_unloaded_total | Counter | - | The total number of classes that have been unloaded since the JVM has started execution | | jvm_memory_pool_allocated_bytes_total | Counter | **pool**,including: "Code Cache" "PS Eden Space" "PS Old Gen" "PS Survivor Space" "Compressed Class Space" "Metaspace" | Total bytes allocated in a given JVM memory pool. Only updated after GC, not continuously | | jvm_gc_collection_seconds_count | Summary | **gc**,including: "PS Scavenge" "PS MarkSweep" | Time spent in a given JVM garbage collector in seconds | | jvm_gc_collection_seconds_sum | Summary | **gc**,including: "PS Scavenge" "PS MarkSweep" | Time spent in a given JVM garbage collector in seconds | | jvm_info | Gauge | **runtime**, for example: "Java(TM) SE Runtime Environment". **vendor**, for example: "Oracle Corporation".
**version** ,for example: "1.8.0_212-b10" | VM version info | | process_cpu_seconds_total | Counter | - | Total user and system CPU time spent in seconds | | process_start_time_seconds | Gauge | - | Start time of the process since unix epoch in seconds | | process_open_fds | Gauge | - | Number of open file descriptors | | process_max_fds | Gauge | - | Maximum number of open file descriptors | | jvm_memory_objects_pending_finalization | Gauge | - | The number of objects waiting in the finalizer queue | | jvm_memory_bytes_used | Gauge | **area**, including: "heap" "nonheap" | Used bytes of a given JVM memory area | | jvm_memory_bytes_committed | Gauge | **area**, including: "heap" "nonheap" | Committed (bytes) of a given JVM memory area | | jvm_memory_bytes_max | Gauge | **area**, including: "heap" "nonheap" | Max (bytes) of a given JVM memory area | | jvm_memory_bytes_init | Gauge | **area**, including: "heap" "nonheap" | Initial bytes of a given JVM memory area | | jvm_memory_pool_bytes_used | Gauge | **pool**, including: "Code Cache" "PS Eden Space" "PS Old Gen" "PS Survivor Space" "Compressed Class Space" "Metaspace" | Used bytes of a given JVM memory pool | | jvm_memory_pool_bytes_committed | Gauge | **pool**, including: "Code Cache" "PS Eden Space" "PS Old Gen" "PS Survivor Space" "Compressed Class Space" "Metaspace" | Committed bytes of a given JVM memory pool | | jvm_memory_pool_bytes_max | Gauge | **pool**, including: "Code Cache" "PS Eden Space" "PS Old Gen" "PS Survivor Space" "Compressed Class Space" "Metaspace" | Max bytes of a given JVM memory pool | | jvm_memory_pool_bytes_init | Gauge | **pool**, including: "Code Cache" "PS Eden Space" "PS Old Gen" "PS Survivor Space" "Compressed Class Space" "Metaspace" | Initial bytes of a given JVM memory pool | | jvm_memory_pool_allocated_bytes_created | Gauge | **pool**, including: "Code Cache" "PS Eden Space" "PS Old Gen" "PS Survivor Space" "Compressed Class Space" "Metaspace" | Total bytes allocated in a given JVM memory
pool. Only updated after GC, not continuously | | jvm_memory_pool_collection_used_bytes | Gauge | **pool**, including: "PS Eden Space" "PS Old Gen" "PS Survivor Space" | Used bytes after last collection of a given JVM memory pool | | jvm_memory_pool_collection_committed_bytes | Gauge | **pool**, including: "PS Eden Space" "PS Old Gen" "PS Survivor Space" | Committed bytes after last collection of a given JVM memory pool | | jvm_memory_pool_collection_max_bytes | Gauge | **pool**, including: "PS Eden Space" "PS Old Gen" "PS Survivor Space" | Max bytes after last collection of a given JVM memory pool | | jvm_memory_pool_collection_init_bytes | Gauge | **pool**, including: "PS Eden Space" "PS Old Gen" "PS Survivor Space" | Initial bytes after last collection of a given JVM memory pool | | jvm_buffer_pool_used_bytes | Gauge | **pool**, including: "direct" "mapped" | Used bytes of a given JVM buffer pool | | jvm_buffer_pool_capacity_bytes | Gauge | **pool**, including: "direct" "mapped" | Bytes capacity of a given JVM buffer pool | | jvm_buffer_pool_used_buffers | Gauge | **pool**, including: "direct" "mapped" | Used buffers of a given JVM buffer pool | ## Cluster Monitoring By Prometheus & Grafana ### Install Prometheus For a guide on how to set up Prometheus server go to the [Installation](https://prometheus.io/docs/prometheus/latest/installation) ### Configuration Prometheus Add seatunnel instance metric exports into `/etc/prometheus/prometheus.yaml`. For example: ```yaml global: # How frequently to scrape targets from this job. scrape_interval: 15s scrape_configs: # The job name assigned to scraped metrics by default. - job_name: 'seatunnel' scrape_interval: 5s # Metrics export path metrics_path: /hazelcast/rest/instance/metrics # List of labeled statically configured targets for this job. static_configs: # The targets specified by the static config. - targets: [ 'localhost:5801' ] # Labels assigned to all metrics scraped from the targets.
# labels: [:] ``` ### Install Grafana For a guide on how to set up Grafana server go to the [Installation](https://grafana.com/docs/grafana/latest/setup-grafana/installation) ### Monitoring Dashboard - Add Prometheus DataSource on Grafana. - Import `Seatunnel Cluster` monitoring dashboard by [Dashboard JSON](./telemetry/grafana-dashboard.json) into Grafana. The [effect image](../../images/grafana.png) of the dashboard ================================================ FILE: docs/en/engines/zeta/tuning-guide.md ================================================ --- sidebar_position: 15 --- # Tuning Guide This article introduces the tuning methods of SeaTunnel Engine to help users optimize the performance and stability of SeaTunnel Engine according to their actual needs. Before reading this guide, please note that the recommendations here are summarized from real-world usage by most users and may not be suitable for all scenarios. You can adjust them according to your actual situation. SeaTunnel Engine is a data integration engine running on the [JVM](https://en.wikipedia.org/wiki/Java_virtual_machine), so JVM tuning is also applicable to SeaTunnel Engine and will not be repeated here. ## Cluster Slow Response or Hang ### JVM If the SeaTunnel Engine cluster responds slowly or hangs, it may be due to insufficient JVM heap memory. You can troubleshoot as follows: #### Insufficient Heap Memory ##### Troubleshooting Process 1. Check JVM heap memory usage in real time Use the `jmap` command to check JVM heap memory usage, where `<pid>` is the PID of the SeaTunnel Engine process. ```bash jmap -heap <pid> ``` Example output: ```shell Attaching to process ID 2111950, please wait... Debugger attached successfully. Server compiler detected. JVM version is 25.192-b12 using thread-local object allocation.
Garbage-First (G1) GC with 13 thread(s) Heap Configuration: MinHeapFreeRatio = 40 MaxHeapFreeRatio = 70 MaxHeapSize = 17179869184 (16384.0MB) NewSize = 1363144 (1.2999954223632812MB) MaxNewSize = 10301210624 (9824.0MB) OldSize = 5452592 (5.1999969482421875MB) NewRatio = 2 SurvivorRatio = 8 MetaspaceSize = 21807104 (20.796875MB) CompressedClassSpaceSize = 1073741824 (1024.0MB) MaxMetaspaceSize = 2147483648 (2048.0MB) G1HeapRegionSize = 8388608 (8.0MB) Heap Usage: G1 Heap: regions = 2048 capacity = 17179869184 (16384.0MB) used = 2997548048 (2858.684585571289MB) free = 14182321136 (13525.315414428711MB) 17.448026034981012% used G1 Young Generation: Eden Space: regions = 348 capacity = 10737418240 (10240.0MB) used = 2919235584 (2784.0MB) free = 7818182656 (7456.0MB) 27.1875% used Survivor Space: regions = 10 capacity = 83886080 (80.0MB) used = 83886080 (80.0MB) free = 0 (0.0MB) 100.0% used G1 Old Generation: regions = 0 capacity = 6358564864 (6064.0MB) used = 0 (0.0MB) free = 6358564864 (6064.0MB) 0.0% used ``` Pay attention to the usage of G1 Old Generation. If the usage rate of Old Generation is close to 100%, it may be caused by insufficient heap memory. 2. Check the logs The system will periodically output health monitoring logs. Check the SeaTunnel Engine logs to see if there are frequent Full GCs or long GC pauses, which may be caused by insufficient heap memory. 
Example log: ```log [] 2025-07-04 16:42:54,818 INFO [c.h.i.d.HealthMonitor ] [hz.main.HealthMonitor] - [127.0.0.1]:5801 [seatunnel] [5.1] processors=16, physical.memory.total=31.1G, physical.memory.free=9.7G, swap.space.total=0, swap.space.free=0, heap.memory.used=198.7M, heap.memory.free=15.8G, heap.memory.total=16.0G, heap.memory.max=16.0G, heap.memory.used/total=1.21%, heap.memory.used/max=1.21%, minor.gc.count=2, minor.gc.time=44ms, major.gc.count=0, major.gc.time=0ms, load.process=0.00%, load.system=66.67%, load.systemAverage=5.66, thread.count=118, thread.peakCount=118, cluster.timeDiff=0, event.q.size=0, executor.q.async.size=0, executor.q.client.size=0, executor.q.client.query.size=0, executor.q.client.blocking.size=0, executor.q.query.size=0, executor.q.scheduled.size=0, executor.q.io.size=0, executor.q.system.size=0, executor.q.operations.size=0, executor.q.priorityOperation.size=0, operations.completed.count=13, executor.q.mapLoad.size=0, executor.q.mapLoadAllKeys.size=0, executor.q.cluster.size=0, executor.q.response.size=0, operations.running.count=0, operations.pending.invocations.percentage=0.00%, operations.pending.invocations.count=0, proxy.count=9, clientEndpoint.count=0, connection.active.count=0, client.connection.count=0, connection.count=0 ``` Focus on: - `heap.memory.used/max`: Heap memory usage rate. If it is close to 100%, it may be due to insufficient heap memory. - `major.gc.count` and `major.gc.time`: If Full GC is frequent, it may be caused by insufficient heap memory. You can judge whether there are frequent Full GCs or long GC pauses by continuously checking the logs. ##### Solutions Reduce memory usage at the same time by lowering task concurrency and the number of tasks. If you do need more memory, please refer to [Deployment](deployment.md) for configuring SeaTunnel Engine JVM options to increase memory. ##### Unlimited Memory Usage 1. 
Generate a memory snapshot Sometimes, even with a fixed number of tasks, memory usage keeps increasing, which may be caused by a memory leak in the task. Please dump the corresponding memory snapshot information, where `<pid>` is the PID of the SeaTunnel Engine process. ```shell jmap -dump:live,format=b,file=heap.hprof <pid> ``` Then use tools such as [Eclipse Memory Analyzer](https://www.eclipse.org/mat/) to analyze the memory snapshot and find the cause of the memory leak. If you are not doing secondary development on SeaTunnel or its connectors, you can also create an issue and attach the memory snapshot, and we will help you analyze it. 2. Print object occupancy ranking Sometimes, generating a memory snapshot may fail due to JVM hang. In this case, you can try to print the object occupancy ranking to check memory usage. ```shell jmap -histo:live <pid> | head -n 100 ``` Similarly, you can analyze the output to find the cause of the memory leak. If you are not doing secondary development on SeaTunnel or its connectors, you can also create an issue and attach the object occupancy information, and we will help you analyze it. #### High CPU Usage High CPU usage is also a common cause of cluster node hangs, but it is less likely than high memory usage. You can troubleshoot as follows: ##### Troubleshooting Process 1. Check CPU usage - Use the `top` or `htop` command to check the CPU usage of the SeaTunnel Engine process. - If the CPU usage is close to 100%, it may be due to insufficient CPU resources. If there are multiple cores, consider the usage of all cores. ##### Solutions If CPU usage is too high, you can try the following solutions: - Reduce task concurrency and the number of tasks to reduce CPU resource usage. - Increase the number of cluster nodes to share the CPU resource load. ### Hazelcast Hazelcast-related configuration is also an important factor affecting the performance of SeaTunnel Engine. You can modify the configuration parameters in the `hazelcast.yaml` series of files. Please refer to [Deployment](deployment.md).
Here are some common tuning parameters: - `hazelcast.operation.generic.thread.count`: This parameter controls the number of generic operation threads in Hazelcast. SeaTunnel Engine uses these threads to execute RPC requests. You can adjust this parameter according to your actual situation to improve the performance of Hazelcast RPC. If you frequently see logs like the following and the CPU usage is not very high, try increasing this parameter: ```log 2024-09-03 06:15:45,807 WARN [.s.i.o.s.SlowOperationDetector] [hz.main.SlowOperationDetectorThread] - [seatunnel-worker-1]:5802 [seatunnel] [5.1] Slow operation detected: ``` ================================================ FILE: docs/en/engines/zeta/user-command.md ================================================ --- sidebar_position: 13 --- # Client Command Line Tool The SeaTunnel Engine provides a command line tool for managing the jobs of the SeaTunnel Engine. You can use the command line tool to submit, stop, pause, resume, delete jobs, view job status and monitoring metrics, etc. You can obtain the help information of the command line tool through the following command: ```shell sh bin/seatunnel.sh -h ``` The output is as follows: ``` Usage: seatunnel.sh [options] Options: --async Run the job asynchronously. When the job is submitted, the client will exit (default: false). -can, --cancel, --cancel-job Cancel the job(s) by JobId. -f, --force-cancel, --force-cancel-job Force Cancel job(s) by JobId. --check Whether to check the config (default: false). -cj, --close, --close-job Close the client and the task will also be closed (default: true). -cn, --cluster The name of the cluster. -c, --config Config file. --decrypt Decrypt the config file. When both --decrypt and --encrypt are specified, only --encrypt will take effect (default: false). -m, --master, -e, --deploy-mode SeaTunnel job submit master, support [local, cluster] (default: cluster). --encrypt Encrypt the config file.
When both --decrypt and --encrypt are specified, only --encrypt will take effect (default: false). --get_running_job_metrics Get metrics for running jobs (default: false). -h, --help Show the usage message. -j, --job-id Get the job status by JobId. -l, --list List the job status (default: false). --metrics Get the job metrics by JobId. -n, --name The SeaTunnel job name (default: SeaTunnel). -r, --restore, --restore-job Restore with savepoint by jobId. -s, --savepoint, --savepoint-job Savepoint the job by jobId. -i, --variable Variable substitution, such as -i city=beijing, or -i date=20190318. We use ',' as a separator. When inside "", ',' are treated as normal characters instead of delimiters. (default: []). ``` ## Submitting Jobs ```shell sh bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template ``` The **--async** parameter allows the job to run in the background. When the job is submitted, the client will exit. ```shell sh bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template --async ``` The **-n** or **--name** parameter can specify the name of the job. ```shell sh bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template --async -n myjob ``` ## Viewing The Job List ```shell sh bin/seatunnel.sh -l ``` This command will output the list of all jobs in the current cluster (including completed historical jobs and running jobs). ## Viewing The Job Status ```shell sh bin/seatunnel.sh -j <jobId> ``` This command will output the status information of the specified job. ## Getting The Monitoring Information Of Running Jobs ```shell sh bin/seatunnel.sh --get_running_job_metrics ``` This command will output the monitoring information of running jobs. ## Getting the Monitoring Information of a Specified Job The `--metrics` parameter gets the monitoring information of a specified job. ```shell sh bin/seatunnel.sh --metrics <jobId> ``` ## Pausing Jobs ```shell sh bin/seatunnel.sh -s <jobId> ``` This command will pause the specified job.
Note that only jobs with checkpoints enabled support pausing jobs (real-time synchronization jobs have checkpoints enabled by default, and batch jobs do not have checkpoints enabled by default and need to configure checkpoint.interval in `env` to enable checkpoints). A job is paused at split granularity, the smallest unit of work. That is, after pausing a job, it will wait for the currently running split to finish running and then pause. After the task is resumed, it will continue to run from the paused split. ## Resuming Jobs ```shell sh bin/seatunnel.sh -r <jobId> -c $SEATUNNEL_HOME/config/v2.batch.config.template ``` This command will resume the specified job. Note that only jobs with checkpoints enabled support resuming jobs (real-time synchronization jobs have checkpoints enabled by default, and batch jobs do not have checkpoints enabled by default and need to configure checkpoint.interval in `env` to enable checkpoints). Resuming a job requires the jobId and the configuration file of the job. Both failed jobs and jobs paused by seatunnel.sh -s <jobId> can be resumed by this command. ## Canceling Jobs ```shell sh bin/seatunnel.sh -can <jobId1> [<jobId2> <jobId3> ...] ``` This command will cancel the specified job. After canceling the job, the job will be stopped and its status will become `CANCELED`. This command supports batch cancellation, so multiple jobs can be canceled at one time. All breakpoint information of the canceled job will be deleted, and the job cannot be resumed by seatunnel.sh -r <jobId>. ## Force Canceling Jobs ```shell sh bin/seatunnel.sh -f <jobId1> [<jobId2> <jobId3> ...] ``` This command forcefully cancels the specified job(s). After cancellation, the job will be stopped and its status will be set to `CANCELED`. This command supports batch operations and allows multiple jobs to be force-canceled at once. All breakpoint information of the canceled job will be deleted, and the job cannot be resumed by seatunnel.sh -r <jobId>.
**Notes:** - If the job status is `DOING_SAVEPOINT` and the savepoint does not complete successfully, a forced stop (When the `force` option is enabled) will set the job status to `CANCELED`. - A forced stop may leave checkpoint data incomplete or in an inconsistent state. It should be used only for exceptional or abnormal situations. ## Configure The JVM Options We can configure the JVM options for the SeaTunnel Engine client in the following ways: 1. Add the JVM options to `$SEATUNNEL_HOME/config/jvm_client_options`. Modify the JVM parameters in the `$SEATUNNEL_HOME/config/jvm_client_options` file. Please note that the JVM parameters in this file will be applied to all jobs submitted using `seatunnel.sh`, including Local Mode and Cluster Mode. 2. Add JVM options when submitting jobs. For example, `sh bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template -DJvmOption="-Xms2G -Xmx2G"` # Server Command Line Tool SeaTunnel Engine provides server management commands for starting, stopping, and managing SeaTunnel Engine cluster nodes. ```shell sh bin/seatunnel-cluster.sh -h ``` Server commands support the following parameters: ```shell Usage: seatunnel-cluster.sh [options] Options: -cn, --cluster The name of cluster. -d, --daemon The cluster daemon mode. -r, --role The cluster node role, support [master, worker, master_and_worker] (default: master_and_worker). -m, --member Show cluster members information. -h, --help Show the usage message. 
``` ## Start cluster You can start a cluster node with the following commands: ```shell # Start in foreground sh bin/seatunnel-cluster.sh # Start in daemon mode sh bin/seatunnel-cluster.sh -d ``` ## Show cluster members information You can view cluster members information using the following command: ```shell sh bin/seatunnel-cluster.sh -m -cn my_cluster ``` This command will output detailed information about all members in the cluster, including: - **Member ID**: Unique identifier for each cluster member - **Address**: IP address and port of the member - **Role**: Member role (ACTIVE MASTER, MASTER, or WORKER) - **Version**: Hazelcast version running on the member **Example output:** ``` Member ID Address Role Version a1b2c3d4-e5f6-7890-abcd-ef1234567890 192.168.1.100:5701 ACTIVE MASTER 5.3.0 b2c3d4e5-f6g7-8901-bcde-f23456789012 192.168.1.101:5701 MASTER 5.3.0 c3d4e5f6-g7h8-9012-cdef-345678901234 192.168.1.102:5701 WORKER 5.3.0 ``` **Note**: You must specify the cluster name with the `-cn` parameter. The cluster must be running for this command to work. ## Stop cluster SeaTunnel provides a dedicated stop script to shut down cluster nodes: ```shell sh bin/stop-seatunnel-cluster.sh -h ``` The stop command supports the following parameters: ```shell Usage: stop-seatunnel-cluster.sh [options] Options: -cn, --cluster The name of the cluster to shut down (default: seatunnel_default_cluster) -h, --help Show the usage message ``` ### Stop default cluster ```shell # Stop the default cluster (seatunnel_default_cluster) sh bin/stop-seatunnel-cluster.sh ``` ### Stop specified cluster ```shell # Stop a cluster with specified name sh bin/stop-seatunnel-cluster.sh -cn my_cluster ``` ================================================ FILE: docs/en/engines/zeta/web-ui.md ================================================ # Web UI ## Access Before accessing the Web UI, we need to enable the HTTP REST API.
First, you need to configure it in the `seatunnel.yaml` configuration file: ``` seatunnel: engine: http: enable-http: true port: 8080 ``` Then visit `http://ip:8080/#/overview` ## Overview The Web UI of Apache SeaTunnel offers a user-friendly interface for monitoring and managing SeaTunnel jobs. Through the Web UI, users can view real-time information on currently running jobs, finished jobs, and the status of worker and master nodes within the cluster. The main functional modules include Jobs, Workers, and Master, each providing detailed status information and operational options to help users efficiently manage and optimize their data processing workflows. ![overview.png](../../../images/ui/overview.png) ## Jobs ### Running Jobs The "Running Jobs" section lists all SeaTunnel jobs that are currently in execution. Users can view basic information for each job, including Job ID, submission time, status, execution time, and more. By clicking on a specific job, users can access detailed information such as task distribution, resource utilization, and log outputs, allowing for real-time monitoring of job progress and timely handling of potential issues. ![running.png](../../../images/ui/running.png) ![detail.png](../../../images/ui/detail.png) ### Finished Jobs The "Finished Jobs" section displays all SeaTunnel jobs that have either successfully completed or failed. This section provides execution results, completion times, durations, and failure reasons (if any) for each job. Users can review past job records through this module to analyze job performance, troubleshoot issues, or rerun specific jobs as needed. ![finished.png](../../../images/ui/finished.png) ## Workers ### Workers Information The "Workers" section displays detailed information about all worker nodes in the cluster, including each worker's address, running status, CPU and memory usage, number of tasks being executed, and more.
Through this module, users can monitor the health of each worker node, promptly identify and address resource bottlenecks or node failures, ensuring the stable operation of the SeaTunnel cluster. ![workers.png](../../../images/ui/workers.png) ## Master ### Master Information The "Master" section provides the status and configuration information of the master node in the SeaTunnel cluster. Users can view the master's address, running status, job scheduling responsibilities, and overall resource allocation within the cluster. This module helps users gain a comprehensive understanding of the cluster's core management components, facilitating cluster configuration optimization and troubleshooting. ![master.png](../../../images/ui/master.png) ================================================ FILE: docs/en/faq.md ================================================ # FAQ ## What data sources and destinations does SeaTunnel support? SeaTunnel supports various data sources and destinations. You can find a detailed list on the following list: - Supported data sources (Source): [Source List](https://seatunnel.apache.org/docs/connectors/source) - Supported data destinations (Sink): [Sink List](https://seatunnel.apache.org/docs/connectors/sink) ## Does SeaTunnel support batch and streaming processing? SeaTunnel supports both batch and streaming processing modes. You can select the appropriate mode based on your specific business scenarios and needs. Batch processing is suitable for scheduled data integration tasks, while streaming processing is ideal for real-time integration and Change Data Capture (CDC). ## Is it necessary to install engines like Spark or Flink when using SeaTunnel? Spark and Flink are not mandatory. SeaTunnel supports Zeta, Spark, and Flink as integration engines, allowing you to choose one based on your needs. The community highly recommends Zeta, a new generation high-performance integration engine specifically designed for integration scenarios. 
Zeta is affectionately called "Ultraman Zeta" by community users! The community offers extensive support for Zeta, making it the most feature-rich option. ## What data transformation functions does SeaTunnel provide? SeaTunnel supports multiple data transformation functions, including field mapping, data filtering, data format conversion, and more. You can implement data transformations through the `transform` module in the configuration file. For more details, refer to the SeaTunnel [Transform Documentation](https://seatunnel.apache.org/docs/transforms). ## Can SeaTunnel support custom data cleansing rules? Yes, SeaTunnel supports custom data cleansing rules. You can configure custom rules in the `transform` module, such as cleaning up dirty data, removing invalid records, or converting fields. ## Does SeaTunnel support real-time incremental integration? SeaTunnel supports incremental data integration. For example, the CDC connector allows real-time capture of data changes, which is ideal for scenarios requiring real-time data integration. ## What CDC data sources are currently supported by SeaTunnel? SeaTunnel currently supports MongoDB CDC, MySQL CDC, OpenGauss CDC, Oracle CDC, PostgreSQL CDC, SQL Server CDC, TiDB CDC, and more. For more details, refer to the [Source List](https://seatunnel.apache.org/docs/connectors/source). ## How do I enable permissions required for SeaTunnel CDC integration? Please refer to the official SeaTunnel documentation for the necessary steps to enable permissions for each connector’s CDC functionality. ## Does SeaTunnel support CDC from MySQL replicas? How are logs pulled? Yes, SeaTunnel supports CDC from MySQL replicas by subscribing to binlog logs, which are then parsed on the SeaTunnel server. ## Does SeaTunnel support CDC integration for tables without primary keys? SeaTunnel does not support CDC integration for tables without primary keys. 
The reason is that if two identical records exist in the upstream and one is deleted or modified, the downstream cannot determine which record to delete or modify, leading to potential issues. Primary keys are essential to ensure data uniqueness. ## Does SeaTunnel support automatic table creation? Before starting an integration task, you can select different handling schemes for existing table structures on the target side, controlled via the `schema_save_mode` parameter. Available options include: - **`RECREATE_SCHEMA`**: Creates the table if it does not exist; if the table exists, it is deleted and recreated. - **`CREATE_SCHEMA_WHEN_NOT_EXIST`**: Creates the table if it does not exist; skips creation if the table already exists. - **`ERROR_WHEN_SCHEMA_NOT_EXIST`**: Throws an error if the table does not exist. - **`IGNORE`**: Ignores table handling. Many connectors currently support automatic table creation. Refer to the specific connector documentation, such as [Jdbc sink](https://seatunnel.apache.org/docs/connectors/sink/Jdbc/#schema_save_mode-enum), for more information. ## Does SeaTunnel support handling existing data before starting a data integration task? Yes, you can specify different processing schemes for existing data on the target side before starting an integration task, controlled via the `data_save_mode` parameter. Available options include: - **`DROP_DATA`**: Retains the database structure but deletes the data. - **`APPEND_DATA`**: Retains both the database structure and data. - **`CUSTOM_PROCESSING`**: User-defined processing. - **`ERROR_WHEN_DATA_EXISTS`**: Throws an error if data already exists. Many connectors support handling existing data; please refer to the respective connector documentation, such as [Jdbc sink](https://seatunnel.apache.org/docs/connectors/sink/Jdbc#data_save_mode-enum). ## Does SeaTunnel support exactly-once consistency? 
SeaTunnel supports exactly-once consistency for some data sources, such as MySQL and PostgreSQL, ensuring data consistency during integration. Note that exactly-once consistency depends on the capabilities of the underlying database. ## Can SeaTunnel execute scheduled tasks? You can use Linux cron jobs to achieve periodic data integration, or leverage scheduling tools like Apache DolphinScheduler or Apache Airflow to manage complex scheduled tasks. ## I encountered an issue with SeaTunnel that I cannot resolve. What should I do? If you encounter issues with SeaTunnel, here are a few ways to get help: 1. Search the [Issue List](https://github.com/apache/seatunnel/issues) or [Mailing List](https://lists.apache.org/list.html?dev@seatunnel.apache.org) to see if someone else has faced a similar issue. 2. If you cannot find an answer, reach out to the community through [these methods](https://github.com/apache/seatunnel#contact-us). ## How do I declare variables? Would you like to declare a variable in SeaTunnel's configuration and dynamically replace it at runtime? This feature is commonly used in both scheduled and ad-hoc offline processing to replace time, date, or other variables. Here's an example: Define the variable in the configuration. For example, in an SQL transformation (the value in any "key = value" pair in the configuration file can be replaced with variables): ```plaintext ... transform { Sql { query = "select * from dual where city ='${city}' and dt = '${date}'" } } ... ``` To start SeaTunnel in Zeta Local mode with variables: ```bash $SEATUNNEL_HOME/bin/seatunnel.sh \ -c $SEATUNNEL_HOME/config/your_app.conf \ -m local[2] \ -i city=Singapore \ -i date=20231110 ``` Use the `-i` or `--variable` parameter with `key=value` to specify the variable's value, where `key` matches the variable name in the configuration. 
For details, see: [SeaTunnel Variable Configuration](https://seatunnel.apache.org/docs/introduction/concepts/config) ## How can I write multi-line text in the configuration file? If the text is long and needs to be wrapped, you can use triple quotes to indicate the beginning and end: ```plaintext var = """ Apache SeaTunnel is a next-generation high-performance, distributed, massive data integration tool. """ ``` ## How do I perform variable substitution in multi-line text? Performing variable substitution in multi-line text can be tricky because variables cannot be enclosed within triple quotes: ```plaintext var = """ your string 1 """${your_var}""" your string 2""" ``` For more details, see: [lightbend/config#456](https://github.com/lightbend/config/issues/456). ## Where should I start if I want to learn SeaTunnel source code? SeaTunnel features a highly abstracted and well-structured architecture, making it an excellent choice for learning big data architecture. You can start by exploring and debugging the `seatunnel-examples` module: `SeaTunnelEngineLocalExample.java`. For more details, refer to the [SeaTunnel Contribution Guide](https://seatunnel.apache.org/docs/developer/setup). ## Do I need to understand all of SeaTunnel’s source code if I want to develop my own source, sink, or transform? No, you only need to focus on the interfaces for source, sink, and transform. If you want to develop your own connector (Connector V2) for the SeaTunnel API, refer to the **[Connector Development Guide](https://github.com/apache/seatunnel/blob/dev/seatunnel-connectors-v2/README.md)**. 
================================================ FILE: docs/en/getting-started/docker/docker.md ================================================ --- sidebar_position: 3 --- # Set Up With Docker ## Set Up With Docker In Local Mode ### Zeta Engine #### Download ```shell docker pull apache/seatunnel: ``` How to submit job in local mode ```shell # Run fake source to console sink docker run --rm -it apache/seatunnel: ./bin/seatunnel.sh -m local -c config/v2.batch.config.template # Run job with custom config file docker run --rm -it -v //:/config apache/seatunnel: ./bin/seatunnel.sh -m local -c /config/fake_to_console.conf # Example # If you config file is in /tmp/job/fake_to_console.conf docker run --rm -it -v /tmp/job/:/config apache/seatunnel: ./bin/seatunnel.sh -m local -c /config/fake_to_console.conf # Set JVM options when running docker run --rm -it -v /tmp/job/:/config apache/seatunnel: ./bin/seatunnel.sh -DJvmOption="-Xms4G -Xmx4G" -m local -c /config/fake_to_console.conf ``` #### Build Image By Yourself Build from source code. The way of downloading the source code is the same as the way of downloading the binary package. 
You can download the source code from the [download page](https://seatunnel.apache.org/download/) or clone the source code from the [GitHub repository](https://github.com/apache/seatunnel/releases). ##### Build With One Command ```shell cd seatunnel # Use the preconfigured Maven profile sh ./mvnw -B clean install -Dmaven.test.skip=true -Dmaven.javadoc.skip=true -Dlicense.skipAddThirdParty=true -D"docker.build.skip"=false -D"docker.verify.skip"=false -D"docker.push.skip"=true -D"docker.tag"=3.0.0 -Dmaven.deploy.skip -D"skip.spotless"=true --no-snapshot-updates -Pdocker,seatunnel # Check the docker image docker images | grep apache/seatunnel ``` ##### Build Step By Step ```shell # Build binary package from source code sh ./mvnw clean package -DskipTests -Dskip.spotless=true # Build docker image cd seatunnel-dist docker build -f src/main/docker/Dockerfile --build-arg VERSION=3.0.0 -t apache/seatunnel:3.0.0 . # If you build from dev branch, you should add SNAPSHOT suffix to the version docker build -f src/main/docker/Dockerfile --build-arg VERSION=3.0.0-SNAPSHOT -t apache/seatunnel:3.0.0-SNAPSHOT .
# Check the docker image docker images | grep apache/seatunnel ``` The Dockerfile is like this: ```dockerfile FROM openjdk:8 ARG VERSION # Build from Source Code And Copy it into image COPY ./target/apache-seatunnel-${VERSION}-bin.tar.gz /opt/ # Download From Internet # Please Note this file only include fake/console connector, You'll need to download the other connectors manually # wget -P /opt https://dlcdn.apache.org/seatunnel/${VERSION}/apache-seatunnel-${VERSION}-bin.tar.gz RUN cd /opt && \ tar -zxvf apache-seatunnel-${VERSION}-bin.tar.gz && \ mv apache-seatunnel-${VERSION} seatunnel && \ rm apache-seatunnel-${VERSION}-bin.tar.gz && \ sed -i 's/#rootLogger.appenderRef.consoleStdout.ref/rootLogger.appenderRef.consoleStdout.ref/' seatunnel/config/log4j2.properties && \ sed -i 's/#rootLogger.appenderRef.consoleStderr.ref/rootLogger.appenderRef.consoleStderr.ref/' seatunnel/config/log4j2.properties && \ sed -i 's/rootLogger.appenderRef.file.ref/#rootLogger.appenderRef.file.ref/' seatunnel/config/log4j2.properties && \ cp seatunnel/config/hazelcast-master.yaml seatunnel/config/hazelcast-worker.yaml WORKDIR /opt/seatunnel ``` ### Spark or Flink Engine #### Mount Spark/Flink library By default, Spark home is `/opt/spark`, Flink home is `/opt/flink`. If you need run with spark/flink, you can mount the related library to `/opt/spark` or `/opt/flink`. ```shell docker run \ -v :/opt/spark \ -v :/opt/flink \ ... ``` Or you can change the `SPARK_HOME`, `FLINK_HOME` environment variable in Dockerfile and re-build your and mount the spark/flink to related path. ```dockerfile FROM apache/seatunnel ENV SPARK_HOME= ... ``` ```shell docker run \ -v : \ ... ``` ### Submit job The command is different for different engines and different versions of the same engine, please choose the correct command. 
- Spark ```shell # spark2 docker run --rm -it apache/seatunnel bash ./bin/start-seatunnel-spark-2-connector-v2.sh -c config/v2.batch.config.template # spark3 docker run --rm -it apache/seatunnel bash ./bin/start-seatunnel-spark-3-connector-v2.sh -c config/v2.batch.config.template ``` - Flink before you submit job, you need start flink cluster first. ```shell # flink version between `1.12.x` and `1.14.x` docker run --rm -it apache/seatunnel bash -c '/bin/start-cluster.sh && ./bin/start-seatunnel-flink-13-connector-v2.sh -c config/v2.streaming.conf.template' # flink version between `1.15.x` and `1.16.x` docker run --rm -it apache/seatunnel bash -c '/bin/start-cluster.sh && ./bin/start-seatunnel-flink-15-connector-v2.sh -c config/v2.streaming.conf.template' ``` ## Set Up With Docker In Cluster Mode there has 2 ways to create cluster within docker. ### Use Docker Directly #### create a network ```shell docker network create seatunnel-network ``` #### start the nodes - start master node ```shell ## start master and export 5801 port docker run -d --name seatunnel_master \ --network seatunnel-network \ --rm \ -p 5801:5801 \ apache/seatunnel \ ./bin/seatunnel-cluster.sh -r master ``` - get created container ip ```shell docker inspect seatunnel_master ``` run this command to get the pod ip. - start worker node ```shell # you need update yourself master container ip to `ST_DOCKER_MEMBER_LIST` docker run -d --name seatunnel_worker_1 \ --network seatunnel-network \ --rm \ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ apache/seatunnel \ ./bin/seatunnel-cluster.sh -r worker ## start worker2 # you need update yourself master container ip to `ST_DOCKER_MEMBER_LIST` docker run -d --name seatunnel_worker_2 \ --network seatunnel-network \ --rm \ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ apache/seatunnel \ ./bin/seatunnel-cluster.sh -r worker ``` #### Scale your Cluster run this command to start master node. 
```shell # you need update yourself master container ip to `ST_DOCKER_MEMBER_LIST` docker run -d --name seatunnel_master \ --network seatunnel-network \ --rm \ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ apache/seatunnel \ ./bin/seatunnel-cluster.sh -r master ``` run this command to start worker node. ```shell # you need update yourself master container ip to `ST_DOCKER_MEMBER_LIST` docker run -d --name seatunnel_worker_1 \ --network seatunnel-network \ --rm \ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ apache/seatunnel \ ./bin/seatunnel-cluster.sh -r worker ``` ### Use Docker-compose > docker cluster mode is only support zeta engine. The `docker-compose.yaml` file is : ```yaml version: '3.8' services: master: image: apache/seatunnel container_name: seatunnel_master environment: - ST_DOCKER_MEMBER_LIST=172.16.0.2,172.16.0.3,172.16.0.4 entrypoint: > /bin/sh -c " /opt/seatunnel/bin/seatunnel-cluster.sh -r master " ports: - "5801:5801" networks: seatunnel_network: ipv4_address: 172.16.0.2 worker1: image: apache/seatunnel container_name: seatunnel_worker_1 environment: - ST_DOCKER_MEMBER_LIST=172.16.0.2,172.16.0.3,172.16.0.4 entrypoint: > /bin/sh -c " /opt/seatunnel/bin/seatunnel-cluster.sh -r worker " depends_on: - master networks: seatunnel_network: ipv4_address: 172.16.0.3 worker2: image: apache/seatunnel container_name: seatunnel_worker_2 environment: - ST_DOCKER_MEMBER_LIST=172.16.0.2,172.16.0.3,172.16.0.4 entrypoint: > /bin/sh -c " /opt/seatunnel/bin/seatunnel-cluster.sh -r worker " depends_on: - master networks: seatunnel_network: ipv4_address: 172.16.0.4 networks: seatunnel_network: driver: bridge ipam: config: - subnet: 172.16.0.0/24 ``` run `docker-compose up -d` command to start the cluster. You can run `docker logs -f seatunnel_master`, `docker logs -f seatunnel_worker_1` to check the node log. And when you call `http://localhost:5801/hazelcast/rest/maps/system-monitoring-information`, you will see there are 2 nodes as we excepted. 
After that, you can use client or restapi to submit job to this cluster. #### Scale your Cluster If you want to increase cluster node, like add a new work node. ```yaml version: '3.8' services: master: image: apache/seatunnel container_name: seatunnel_master environment: - ST_DOCKER_MEMBER_LIST=172.16.0.2,172.16.0.3,172.16.0.4 entrypoint: > /bin/sh -c " /opt/seatunnel/bin/seatunnel-cluster.sh -r master " ports: - "5801:5801" networks: seatunnel_network: ipv4_address: 172.16.0.2 worker1: image: apache/seatunnel container_name: seatunnel_worker_1 environment: - ST_DOCKER_MEMBER_LIST=172.16.0.2,172.16.0.3,172.16.0.4 entrypoint: > /bin/sh -c " /opt/seatunnel/bin/seatunnel-cluster.sh -r worker " depends_on: - master networks: seatunnel_network: ipv4_address: 172.16.0.3 worker2: image: apache/seatunnel container_name: seatunnel_worker_2 environment: - ST_DOCKER_MEMBER_LIST=172.16.0.2,172.16.0.3,172.16.0.4 entrypoint: > /bin/sh -c " /opt/seatunnel/bin/seatunnel-cluster.sh -r worker " depends_on: - master networks: seatunnel_network: ipv4_address: 172.16.0.4 #### ## add new worker node #### worker3: image: apache/seatunnel container_name: seatunnel_worker_3 environment: - ST_DOCKER_MEMBER_LIST=172.16.0.2,172.16.0.3,172.16.0.4,172.16.0.5 # add ip to here entrypoint: > /bin/sh -c " /opt/seatunnel/bin/seatunnel-cluster.sh -r worker " depends_on: - master networks: seatunnel_network: ipv4_address: 172.16.0.5 # use a not used ip networks: seatunnel_network: driver: bridge ipam: config: - subnet: 172.16.0.0/24 ``` and run `docker-compose up -d` command, the new worker node will start, and the current node won't restart. 
### Job Operation on cluster #### use docker as a client - submit job : ```shell # you need update yourself master container ip to `ST_DOCKER_MEMBER_LIST` docker run --name seatunnel_client \ --network seatunnel-network \ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ --rm \ apache/seatunnel \ ./bin/seatunnel.sh -c config/v2.batch.config.template ``` - list job ```shell # you need update yourself master container ip to `ST_DOCKER_MEMBER_LIST` docker run --name seatunnel_client \ --network seatunnel-network \ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ --rm \ apache/seatunnel \ ./bin/seatunnel.sh -l ``` more command please refer [user-command](../../engines/zeta/user-command.md) #### use rest api please refer [Submit A Job](../../engines/zeta/rest-api-v2.md#submit-a-job) ================================================ FILE: docs/en/getting-started/kubernetes/helm.md ================================================ --- sidebar_position: 4 --- import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; # Set Up with Helm This section provides a quick guide to use SeaTunnel with Helm. ## Prerequisites We assume that you have one local installation as follow: - [docker](https://docs.docker.com/) - [kubernetes](https://kubernetes.io/) - [helm](https://helm.sh/docs/intro/quickstart/) So that the `kubectl` and `helm` commands are available on your local system. Take kubernetes [minikube](https://minikube.sigs.k8s.io/docs/start/) as an example, you can start a cluster with the following command: ```bash minikube start --kubernetes-version=v1.23.3 ``` ## Install Install with default settings. ```bash # Choose the corresponding version yourself export VERSION=2.3.10 helm pull oci://registry-1.docker.io/apache/seatunnel-helm --version ${VERSION} tar -xvf seatunnel-helm-${VERSION}.tgz cd seatunnel-helm helm install seatunnel . ``` Install with another namespace. ```bash helm install seatunnel . 
-n <your namespace> ``` ## Submit Job The default config doesn't enable ingress, so you need to forward the master REST API port. ```bash kubectl port-forward -n default svc/seatunnel-master 5801:5801 ``` Then you can access the REST API at `http://127.0.0.1:5801/`. If you want to use ingress, update `values.yaml`, for example: ```commandline ingress: enabled: true host: "<your domain>" ``` Then upgrade seatunnel. Then you can access the REST API at `http://<your domain>`. Or you can just go into master pod, and use local curl command. ```commandline # get one of the master pods MASTER_POD=$(kubectl get po -l 'app.kubernetes.io/name=seatunnel-master' | sed '1d' | awk '{print $1}' | head -n1) # go into master pod container. kubectl -n default exec -it $MASTER_POD -- /bin/bash curl http://127.0.0.1:5801/running-jobs curl http://127.0.0.1:5801/system-monitoring-information ``` After that you can submit your job by [rest-api-v2](../../engines/zeta/rest-api-v2.md) ## What's More For now, you have taken a quick look at SeaTunnel, and you can see [connector](../../connectors/source) to find all sources and sinks SeaTunnel supported. Or see [deployment](../../engines/zeta/deployment.md) if you want to submit your application in another kind of your engine cluster. ================================================ FILE: docs/en/getting-started/kubernetes/kubernetes.mdx ================================================ --- sidebar_position: 4 --- import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; # Set Up with Kubernetes This section provides a quick guide to use SeaTunnel with Kubernetes. ## Prerequisites We assume that you have one local installation as follows: - [docker](https://docs.docker.com/) - [kubernetes](https://kubernetes.io/) - [helm](https://helm.sh/docs/intro/quickstart/) So that the `kubectl` and `helm` commands are available on your local system. 
Take kubernetes [minikube](https://minikube.sigs.k8s.io/docs/start/) as an example, you can start a cluster with the following command: ```bash minikube start --kubernetes-version=v1.23.3 ``` ## Installation ### SeaTunnel Docker Image To run the image with SeaTunnel, first create a `Dockerfile`: ```Dockerfile FROM flink:1.13 ENV SEATUNNEL_VERSION="3.0.0" ENV SEATUNNEL_HOME="/opt/seatunnel" RUN wget https://dlcdn.apache.org/seatunnel/${SEATUNNEL_VERSION}/apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz RUN tar -xzvf apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz RUN mv apache-seatunnel-${SEATUNNEL_VERSION} ${SEATUNNEL_HOME} RUN cd ${SEATUNNEL_HOME} && sh bin/install-plugin.sh ${SEATUNNEL_VERSION} ``` Then run the following commands to build the image: ```bash docker build -t seatunnel:3.0.0-flink-1.13 -f Dockerfile . ``` Image `seatunnel:3.0.0-flink-1.13` needs to be present in the host (minikube) so that the deployment can take place. Load image to minikube via: ```bash minikube image load seatunnel:3.0.0-flink-1.13 ``` ```Dockerfile FROM openjdk:8 ENV SEATUNNEL_VERSION="3.0.0" ENV SEATUNNEL_HOME="/opt/seatunnel" RUN wget https://dlcdn.apache.org/seatunnel/${SEATUNNEL_VERSION}/apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz RUN tar -xzvf apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz RUN mv apache-seatunnel-${SEATUNNEL_VERSION} ${SEATUNNEL_HOME} RUN cd ${SEATUNNEL_HOME} && sh bin/install-plugin.sh ${SEATUNNEL_VERSION} ``` Then run the following commands to build the image: ```bash docker build -t seatunnel:3.0.0 -f Dockerfile . ``` Image `seatunnel:3.0.0` need to be present in the host (minikube) so that the deployment can take place. 
Load image to minikube via: ```bash minikube image load seatunnel:3.0.0 ``` ```Dockerfile FROM openjdk:8 ENV SEATUNNEL_VERSION="3.0.0" ENV SEATUNNEL_HOME="/opt/seatunnel" RUN wget https://dlcdn.apache.org/seatunnel/${SEATUNNEL_VERSION}/apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz RUN tar -xzvf apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz RUN mv apache-seatunnel-${SEATUNNEL_VERSION} ${SEATUNNEL_HOME} RUN mkdir -p $SEATUNNEL_HOME/logs RUN cd ${SEATUNNEL_HOME} && sh bin/install-plugin.sh ${SEATUNNEL_VERSION} ``` Then run the following commands to build the image: ```bash docker build -t seatunnel:3.0.0 -f Dockerfile . ``` Image `seatunnel:3.0.0` needs to be present in the host (minikube) so that the deployment can take place. Load image to minikube via: ```bash minikube image load seatunnel:3.0.0 ``` ### Deploying The Operator The steps below provide a quick walk-through on setting up the Flink Kubernetes Operator. You can refer to [Flink Kubernetes Operator - Quick Start](https://nightlies.apache.org/flink/flink-kubernetes-operator-docs-main/docs/try-flink-kubernetes-operator/quick-start/) for more details. > Notice: All the Kubernetes resources below are created in the default namespace. 
Install the certificate manager on your Kubernetes cluster to enable adding the webhook component (only needed once per Kubernetes cluster): ```bash kubectl create -f https://github.com/jetstack/cert-manager/releases/download/v1.8.2/cert-manager.yaml ``` Now you can deploy the latest stable Flink Kubernetes Operator version using the included Helm chart: ```bash helm repo add flink-operator-repo https://downloads.apache.org/flink/flink-kubernetes-operator-1.3.1/ helm install flink-kubernetes-operator flink-operator-repo/flink-kubernetes-operator \ --set image.repository=apache/flink-kubernetes-operator ``` You may verify your installation via `kubectl`: ```bash kubectl get pods NAME READY STATUS RESTARTS AGE flink-kubernetes-operator-5f466b8549-mgchb 1/1 Running 3 (23h ago) 16d ``` none none ## Run SeaTunnel Application **Run Application:** SeaTunnel already provides out-of-the-box [configurations](https://github.com/apache/seatunnel/tree/dev/config). In this guide we will use [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/3.0.0-release/config/v2.streaming.conf.template): ```conf env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 2000 } source { FakeSource { plugin_output = "fake" row.num = 160000 schema = { fields { name = "string" age = "int" } } } } transform { FieldMapper { plugin_input = "fake" plugin_output = "fake1" field_mapper = { age = age name = new_name } } } sink { Console { plugin_input = "fake1" } } ``` Generate a configmap named seatunnel-config in Kubernetes for the seatunnel.streaming.conf so that we can mount the config content in pod. 
```bash kubectl create cm seatunnel-config \ --from-file=seatunnel.streaming.conf=seatunnel.streaming.conf ``` Once the Flink Kubernetes Operator is running as seen in the previous steps you are ready to submit a Flink (SeaTunnel) job: - Create `seatunnel-flink.yaml` FlinkDeployment manifest: ```yaml apiVersion: flink.apache.org/v1beta1 kind: FlinkDeployment metadata: name: seatunnel-flink-streaming-example spec: image: seatunnel:3.0.0-flink-1.13 flinkVersion: v1_13 flinkConfiguration: taskmanager.numberOfTaskSlots: "2" serviceAccount: flink jobManager: replicas: 1 resource: memory: "1024m" cpu: 1 taskManager: resource: memory: "1024m" cpu: 1 podTemplate: spec: containers: - name: flink-main-container volumeMounts: - name: seatunnel-config mountPath: /data/seatunnel.streaming.conf subPath: seatunnel.streaming.conf volumes: - name: seatunnel-config configMap: name: seatunnel-config items: - key: seatunnel.streaming.conf path: seatunnel.streaming.conf job: jarURI: local:///opt/seatunnel/starter/seatunnel-flink-13-starter.jar entryClass: org.apache.seatunnel.core.starter.flink.SeaTunnelFlink args: ["--config", "/data/seatunnel.streaming.conf"] parallelism: 2 upgradeMode: stateless ``` - Run the example application: ```bash kubectl apply -f seatunnel-flink.yaml ``` In this guide we will use [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/3.0.0-release/config/v2.streaming.conf.template): ```conf env { parallelism = 2 job.mode = "STREAMING" checkpoint.interval = 2000 } source { FakeSource { parallelism = 2 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } sink { Console { } } ``` Generate a configmap named seatunnel-config in Kubernetes for the seatunnel.streaming.conf so that we can mount the config content in pod. 
```bash kubectl create cm seatunnel-config \ --from-file=seatunnel.streaming.conf=seatunnel.streaming.conf ``` - Create `seatunnel.yaml`: ```yaml apiVersion: v1 kind: Pod metadata: name: seatunnel spec: containers: - name: seatunnel image: seatunnel:3.0.0 command: ["/bin/sh","-c","/opt/seatunnel/bin/seatunnel.sh --config /data/seatunnel.streaming.conf -e local"] resources: limits: cpu: "1" memory: 4G requests: cpu: "1" memory: 2G volumeMounts: - name: seatunnel-config mountPath: /data/seatunnel.streaming.conf subPath: seatunnel.streaming.conf volumes: - name: seatunnel-config configMap: name: seatunnel-config items: - key: seatunnel.streaming.conf path: seatunnel.streaming.conf ``` - Run the example application: ```bash kubectl apply -f seatunnel.yaml ``` In this guide we will use [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/3.0.0-release/config/v2.streaming.conf.template): ```conf env { parallelism = 2 job.mode = "STREAMING" checkpoint.interval = 2000 } source { FakeSource { parallelism = 2 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } sink { Console { } } ``` Generate a configmap named seatunnel-config in Kubernetes for the seatunnel.streaming.conf so that we can mount the config content in pod. 
```bash kubectl create cm seatunnel-config \ --from-file=seatunnel.streaming.conf=seatunnel.streaming.conf ``` Then, we use the following commands to load some configuration files used by the SeaTunnel cluster into configmaps. First, create the YAML files locally as follows: - Create `hazelcast-client.yaml`: ```yaml hazelcast-client: cluster-name: seatunnel properties: hazelcast.logging.type: log4j2 network: cluster-members: - localhost:5801 ``` - Create `hazelcast.yaml`: ```yaml hazelcast: cluster-name: seatunnel network: rest-api: enabled: true endpoint-groups: CLUSTER_WRITE: enabled: true DATA: enabled: true join: tcp-ip: enabled: true member-list: - localhost port: auto-increment: false port: 5801 properties: hazelcast.invocation.max.retry.count: 20 hazelcast.tcp.join.port.try.count: 30 hazelcast.logging.type: log4j2 hazelcast.operation.generic.thread.count: 50 ``` - Create `seatunnel.yaml`: ```yaml seatunnel: engine: history-job-expire-minutes: 1440 backup-count: 1 queue-type: blockingqueue print-execution-info-interval: 60 print-job-metrics-info-interval: 60 slot-service: dynamic-slot: true checkpoint: interval: 10000 timeout: 60000 storage: type: hdfs max-retained: 3 plugin-config: namespace: /tmp/seatunnel/checkpoint_snapshot storage.type: hdfs fs.defaultFS: file:///tmp/ # Ensure that the directory has written permission ``` Create configmaps for the configuration files using the following commands: ```bash kubectl create configmap hazelcast-client --from-file=hazelcast-client.yaml kubectl create configmap hazelcast --from-file=hazelcast.yaml kubectl create configmap seatunnelmap --from-file=seatunnel.yaml ``` Deploy Reloader to achieve hot deployment. We use the Reloader here to automatically restart the pod when the configuration file or other modifications are made. 
You can also directly give the value of the configuration file and do not use the Reloader - [Reloader](https://github.com/stakater/Reloader/) ```bash wget https://raw.githubusercontent.com/stakater/Reloader/master/deployments/kubernetes/reloader.yaml kubectl apply -f reloader.yaml ``` - Create `seatunnel-cluster.yml`: ```yaml apiVersion: v1 kind: Service metadata: name: seatunnel spec: selector: app: seatunnel ports: - port: 5801 name: seatunnel clusterIP: None --- apiVersion: apps/v1 kind: StatefulSet metadata: name: seatunnel annotations: configmap.reloader.stakater.com/reload: "hazelcast,hazelcast-client,seatunnelmap" spec: serviceName: "seatunnel" replicas: 3 # modify replicas according to your case selector: matchLabels: app: seatunnel template: metadata: labels: app: seatunnel spec: containers: - name: seatunnel image: seatunnel:3.0.0 imagePullPolicy: IfNotPresent ports: - containerPort: 5801 name: client command: ["/bin/sh","-c","/opt/seatunnel/bin/seatunnel-cluster.sh -DJvmOption=-Xms2G -Xmx2G"] resources: limits: cpu: "1" memory: 4G requests: cpu: "1" memory: 2G volumeMounts: - mountPath: "/opt/seatunnel/config/hazelcast.yaml" name: hazelcast subPath: hazelcast.yaml - mountPath: "/opt/seatunnel/config/hazelcast-client.yaml" name: hazelcast-client subPath: hazelcast-client.yaml - mountPath: "/opt/seatunnel/config/seatunnel.yaml" name: seatunnelmap subPath: seatunnel.yaml - mountPath: /data/seatunnel.streaming.conf name: seatunnel-config subPath: seatunnel.streaming.conf volumes: - name: hazelcast configMap: name: hazelcast - name: hazelcast-client configMap: name: hazelcast-client - name: seatunnelmap configMap: name: seatunnelmap - name: seatunnel-config configMap: name: seatunnel-config items: - key: seatunnel.streaming.conf path: seatunnel.streaming.conf ``` - Starting a cluster: ```bash kubectl apply -f seatunnel-cluster.yml ``` Then modify the seatunnel configuration in pod using the following command: ```bash kubectl edit cm hazelcast ``` Change the 
member-list option to your cluster address This uses the headless service access mode The format for accessing between general pods is [pod-name].[service-name].[namespace].svc.cluster.local for example: ```bash - seatunnel-0.seatunnel.default.svc.cluster.local - seatunnel-1.seatunnel.default.svc.cluster.local - seatunnel-2.seatunnel.default.svc.cluster.local ``` ```bash kubectl edit cm hazelcast-client ``` Change the cluster-members option to your cluster address for example: ```bash - seatunnel-0.seatunnel.default.svc.cluster.local:5801 - seatunnel-1.seatunnel.default.svc.cluster.local:5801 - seatunnel-2.seatunnel.default.svc.cluster.local:5801 ``` Later, you will see that the pod automatically restarts and updates the seatunnel configuration ```bash kubectl edit cm hazelcast-client ``` After we wait for all pod updates to be completed, we can use the following command to check if the configuration inside the pod has been updated ```bash kubectl exec -it seatunnel-0 -- cat /opt/seatunnel/config/hazelcast-client.yaml ``` Afterwards, we can submit tasks to any pod ```bash kubectl exec -it seatunnel-0 -- /opt/seatunnel/bin/seatunnel.sh --config /data/seatunnel.streaming.conf ``` **See The Output** You may follow the logs of your job, after a successful startup (which can take on the order of a minute in a fresh environment, seconds afterwards) you can: ```bash kubectl logs -f deploy/seatunnel-flink-streaming-example ``` looks like the below: ```shell ... 2023-01-31 12:13:54,349 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Source: SeaTunnel FakeSource -> Sink Writer: Console (1/1) (1665d2d011b2f6cf6525c0e5e75ec251) switched from SCHEDULED to DEPLOYING. 
2023-01-31 12:13:56,684 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Deploying Source: SeaTunnel FakeSource -> Sink Writer: Console (1/1) (attempt #0) with attempt id 1665d2d011b2f6cf6525c0e5e75ec251 to seatunnel-flink-streaming-example-taskmanager-1-1 @ 100.103.244.106 (dataPort=39137) with allocation id fbe162650c4126649afcdaff00e46875 2023-01-31 12:13:57,794 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Source: SeaTunnel FakeSource -> Sink Writer: Console (1/1) (1665d2d011b2f6cf6525c0e5e75ec251) switched from DEPLOYING to INITIALIZING. 2023-01-31 12:13:58,203 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Source: SeaTunnel FakeSource -> Sink Writer: Console (1/1) (1665d2d011b2f6cf6525c0e5e75ec251) switched from INITIALIZING to RUNNING. ``` If an OOM error occurs in the log, you can decrease the `row.num` value in seatunnel.streaming.conf. To expose the Flink Dashboard you may add a port-forward rule: ```bash kubectl port-forward svc/seatunnel-flink-streaming-example-rest 8081 ``` Now the Flink Dashboard is accessible at [localhost:8081](http://localhost:8081). Or launch `minikube dashboard` for a web-based Kubernetes user interface. The content printed in the TaskManager Stdout log: ```bash kubectl logs \ -l 'app in (seatunnel-flink-streaming-example), component in (taskmanager)' \ --tail=-1 \ -f ``` looks like the below (your content may be different since we use `FakeSource` to automatically generate random stream data): ```shell ... 
subtaskIndex=0: row=159991 : VVgpp, 978840000 subtaskIndex=0: row=159992 : JxrOC, 1493825495 subtaskIndex=0: row=159993 : YmCZR, 654146216 subtaskIndex=0: row=159994 : LdmUn, 643140261 subtaskIndex=0: row=159995 : tURkE, 837012821 subtaskIndex=0: row=159996 : uPDfd, 2021489045 subtaskIndex=0: row=159997 : mjrdG, 2074957853 subtaskIndex=0: row=159998 : xbeUi, 864518418 subtaskIndex=0: row=159999 : sSWLb, 1924451911 subtaskIndex=0: row=160000 : AuPlM, 1255017876 ``` To stop your job and delete your FlinkDeployment you can simply: ```bash kubectl delete -f seatunnel-flink.yaml ``` You may follow the logs of your job, after a successful startup (which can take on the order of a minute in a fresh environment, seconds afterwards) you can: ```bash kubectl logs -f seatunnel ``` looks like the below (your content may be different since we use `FakeSource` to automatically generate random stream data): ```shell ... 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25673: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : hRJdE, 1295862507 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25674: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : kXlew, 935460726 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25675: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : FrNOT, 1714358118 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25676: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : kSajX, 126709414 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25677: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : YhpQv, 2020198351 2023-10-07 08:20:12,797 INFO 
org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25678: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : nApin, 691339553 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25679: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : KZNNa, 1720773736 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25680: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : uCUBI, 490868386 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25681: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : oTLmO, 98770781 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25682: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : UECud, 835494636 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25683: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : XNegY, 1602828896 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25684: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : LcFBx, 1400869177 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25685: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : EqSfF, 1933614060 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25686: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : BODIs, 1839533801 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25687: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : doxcI, 970104616 2023-10-07 
08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25688: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : IEVYn, 371893767 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25689: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : YXYfq, 1719257882 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25690: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : LFWEm, 725033360 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25691: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : ypUrY, 1591744616 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25692: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : rlnzJ, 412162913 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25693: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : zWKnt, 976816261 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25694: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : PXrsk, 43554541 ``` To stop your job and delete your FlinkDeployment you can simply: ```bash kubectl delete -f seatunnel.yaml ``` You may follow the logs of your job, after a successful startup (which can take on the order of a minute in a fresh environment, seconds afterwards) you can: ```bash kubectl exec -it seatunnel-1 -- tail -f /opt/seatunnel/logs/seatunnel-engine-server.log | grep ConsoleSinkWriter ``` looks like the below (your content may be different since we use `FakeSource` to automatically generate random stream data): ```shell ... 
2023-10-10 08:05:07,283 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=7: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : IibHk, 820962465 2023-10-10 08:05:07,283 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=8: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : lmKdb, 1072498088 2023-10-10 08:05:07,283 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=9: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : iqGva, 918730371 2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=10: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : JMHmq, 1130771733 2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=11: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : rxoHF, 189596686 2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=12: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : OSblw, 559472064 2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=13: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : yTZjG, 1842482272 2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=14: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : RRiMg, 1713777214 2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=15: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : lRcsd, 1626041649 2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=16: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : QrNNW, 41355294 ``` To stop your 
job and delete your SeaTunnel cluster you can simply: ```bash kubectl delete -f seatunnel-cluster.yml ``` Happy SeaTunneling! ## What's More For now, you have taken a quick look at SeaTunnel, and you can see [connector](../../connectors/source) to find all sources and sinks SeaTunnel supported. Or see [deployment](../deployment.mdx) if you want to submit your application in another kind of your engine cluster. ================================================ FILE: docs/en/getting-started/locally/deployment.md ================================================ --- sidebar_position: 2 --- import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; # Deployment ## Preparation Before starting to download SeaTunnel, you need to ensure that you have installed the following software required by SeaTunnel: * Install [Java](https://www.java.com/en/download/) (Java 8 or 11, and other versions higher than Java 8 can theoretically work) and set `JAVA_HOME`. ## Download SeaTunnel Release Package ### Download The Binary Package Visit the [SeaTunnel Download Page](https://seatunnel.apache.org/download) to download the latest binary package `seatunnel-<version>-bin.tar.gz`. Or you can also download it through the terminal: ```shell export version="3.0.0" wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-${version}-bin.tar.gz" tar -xzvf "apache-seatunnel-${version}-bin.tar.gz" ``` ### Download The Connector Plugins Starting from version 2.2.0-beta, the binary package no longer provides connector dependencies by default. Therefore, the first time you use it, you need to run the following command to install the connectors (Alternatively, you can manually download the connectors from the [Apache Maven Repository](https://repo.maven.apache.org/maven2/org/apache/seatunnel/) and move them to the `connectors/` directory. 
For versions before 2.3.5, place them in the `connectors/seatunnel` directory): ```bash sh bin/install-plugin.sh ``` If you need a specific connector version, taking 3.0.0 as an example, you need to execute the following command: ```bash sh bin/install-plugin.sh 3.0.0 ``` Typically, you do not need all the connector plugins. You can specify the required plugins by configuring `config/plugin_config`. For example, if you want the sample application to work properly, you will need the `connector-console` and `connector-fake` plugins. You can modify the `plugin_config` configuration file as follows: ```plugin_config --seatunnel-connectors-- connector-fake connector-console --end-- ``` You can find all supported connectors and the corresponding plugin_config configuration names under `${SEATUNNEL_HOME}/connectors/plugins-mapping.properties`. :::tip Tip If you want to install connector plugins by manually downloading connectors, you only need to download the related connector plugins and place them in the `${SEATUNNEL_HOME}/connectors/` directory. ::: ## Build SeaTunnel From Source Code ### Download The Source Code Build from source code. The way of downloading the source code is the same as the way of downloading the binary package. You can download the source code from the [download page](https://seatunnel.apache.org/download/) or clone the source code from the [GitHub repository](https://github.com/apache/seatunnel/releases) ### Build The Source Code ```shell cd seatunnel sh ./mvnw clean install -DskipTests -Dskip.spotless=true # get the binary package cp seatunnel-dist/target/apache-seatunnel-3.0.0-bin.tar.gz /The-Path-You-Want-To-Copy cd /The-Path-You-Want-To-Copy tar -xzvf "apache-seatunnel-${version}-bin.tar.gz" ``` When built from the source code, all the connector plugins and some necessary dependencies (eg: mysql driver) are included in the binary package. You can directly use the connector plugins without the need to install them separately. 
# Run SeaTunnel Now you have downloaded the SeaTunnel binary package and the connector plugins. Next, you can choose different engine option to run synchronization tasks. If you use Flink to run the synchronization task, there is no need to deploy the SeaTunnel Engine service cluster. You can refer to [Quick Start With Flink](quick-start-flink.md) to run your synchronization task. If you use Spark to run the synchronization task, there is no need to deploy the SeaTunnel Engine service cluster. You can refer to [Quick Start With Spark](quick-start-spark.md) to run your synchronization task. If you use the builtin SeaTunnel Engine (Zeta) to run tasks, you need to deploy the SeaTunnel Engine service first. Refer to [Quick Start With SeaTunnel Engine](quick-start-seatunnel-engine.md). ================================================ FILE: docs/en/getting-started/locally/quick-start-flink.md ================================================ --- sidebar_position: 3 --- # Quick Start With Flink ## Step 1: Deploy SeaTunnel And Connectors Before starting, make sure you have downloaded and deployed SeaTunnel as described in [Deployment](deployment.md) ## Step 2: Deployment And Config Flink Please [Download Flink](https://flink.apache.org/downloads.html) first(**required version >= 1.12.0**). For more information you can see [Getting Started: Standalone](https://nightlies.apache.org/flink/flink-docs-release-1.14/docs/deployment/resource-providers/standalone/overview/) **Configure SeaTunnel**: Change the setting in `${SEATUNNEL_HOME}/config/seatunnel-env.sh` and set `FLINK_HOME` to the Flink deployment dir. ## Step 3: Add Job Config File To Define A Job Edit `config/v2.streaming.conf.template`, which determines the way and logic of data input, processing, and output after seatunnel is started. The following is an example of the configuration file, which is the same as the example application mentioned above. 
```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } transform { FieldMapper { plugin_input = "fake" plugin_output = "fake1" field_mapper = { age = age name = new_name } } } sink { Console { plugin_input = "fake1" } } ``` More information about config please check [Config Concept](../../introduction/concepts/config.md) ## Step 4: Run SeaTunnel Application You can start the application by the following commands: Flink version between `1.12.x` and `1.14.x` ```shell cd "apache-seatunnel-${version}" ./bin/start-seatunnel-flink-13-connector-v2.sh --config ./config/v2.streaming.conf.template ``` Flink version between `1.15.x` and `1.18.x` ```shell cd "apache-seatunnel-${version}" ./bin/start-seatunnel-flink-15-connector-v2.sh --config ./config/v2.streaming.conf.template ``` **See The Output**: When you run the command, you can see its output in your console. This is a sign to determine whether the command ran successfully or not. The SeaTunnel console will print some logs as below: ```shell fields : name, age types : STRING, INT row=1 : elWaB, 1984352560 row=2 : uAtnp, 762961563 row=3 : TQEIB, 2042675010 row=4 : DcFjo, 593971283 row=5 : SenEb, 2099913608 row=6 : DHjkg, 1928005856 row=7 : eScCM, 526029657 row=8 : sgOeE, 600878991 row=9 : gwdvw, 1951126920 row=10 : nSiKE, 488708928 row=11 : xubpl, 1420202810 row=12 : rHZqb, 331185742 row=13 : rciGD, 1112878259 row=14 : qLhdI, 1457046294 row=15 : ZTkRx, 1240668386 row=16 : SGZCr, 94186144 ``` ## What's More - Start write your own config file now, choose the [connector](../../connectors/source) you want to use, and configure the parameters according to the connector's documentation. - See [SeaTunnel With Flink](../../engines/flink.md) if you want to know more about SeaTunnel With Flink. - SeaTunnel have a builtin engine named `Zeta`, and it's the default engine of SeaTunnel. 
You can follow [Quick Start](quick-start-seatunnel-engine.md) to configure and run a data synchronization job. ================================================ FILE: docs/en/getting-started/locally/quick-start-seatunnel-engine.md ================================================ --- sidebar_position: 2 --- # Quick Start With SeaTunnel Engine ## Step 1: Deploy SeaTunnel And Connectors Before starting, make sure you have downloaded and deployed SeaTunnel as described in [Deployment](deployment.md) ## Step 2: Add Job Config File To Define A Job Edit `config/v2.batch.config.template`, which determines the way and logic of data input, processing, and output after seatunnel is started. The following is an example of the configuration file, which is the same as the example application mentioned above. ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } transform { FieldMapper { plugin_input = "fake" plugin_output = "fake1" field_mapper = { age = age name = new_name } } } sink { Console { plugin_input = "fake1" } } ``` More information can be found in [Config Concept](../../introduction/concepts/config.md) ## Step 3: Run SeaTunnel Application You could start the application by the following commands: :::tip Starting from version 2.3.1, the parameter -e in seatunnel.sh is deprecated, use -m instead. ::: ```shell cd "apache-seatunnel-${version}" ./bin/seatunnel.sh --config ./config/v2.batch.config.template -m local ``` **See The Output**: When you run the command, you can see its output in your console. This is a sign to determine whether the command ran successfully or not. 
The SeaTunnel console will print some logs as below: ```shell 2022-12-19 11:01:45,417 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - output rowType: name, age 2022-12-19 11:01:46,489 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=1: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: CpiOd, 8520946 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=2: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: eQqTs, 1256802974 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=3: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: UsRgO, 2053193072 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=4: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jDQJj, 1993016602 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=5: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: rqdKp, 1392682764 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=6: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: wCoWN, 986999925 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=7: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: qomTU, 72775247 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=8: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jcqXR, 1074529204 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=9: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: AkWIO, 1961723427 2022-12-19 
11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=10: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: hBoib, 929089763 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=11: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: GSvzm, 827085798 2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=12: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: NNAYI, 94307133 2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=13: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: EexFl, 1823689599 2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=14: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: CBXUb, 869582787 2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=15: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: Wbxtm, 1469371353 2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=16: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: mIJDt, 995616438 ``` ## Extended Example: Batch Mode from MySQL to Doris ### Step 1: Download the Connector First, you need to add the connector name to the `${SEATUNNEL_HOME}/config/plugin_config` file. Then, execute the command to install the connector (of course, you can also manually download the connector from the [Apache Maven Repository](https://repo.maven.apache.org/maven2/org/apache/seatunnel/) and move it to the `connectors/` directory). Finally, make sure that the `connector-jdbc` and `connector-doris` connectors are in the `${SEATUNNEL_HOME}/connectors/` directory. ```bash # Configure the connector name. 
--seatunnel-connectors-- connector-jdbc connector-doris --end-- ``` ```bash # Install the connector. sh bin/install-plugin.sh ``` ### Step 2: Place the MySQL Driver You need to download the [JDBC driver JAR package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) and place it in the `${SEATUNNEL_HOME}/lib/` directory. ### Step 3: Add Job Configuration File to Define the Job ```bash cd seatunnel/job/ vim st.conf env { parallelism = 2 job.mode = "BATCH" } source { Jdbc { url = "jdbc:mysql://localhost:3306/test" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 user = "user" password = "pwd" table_path = "test.table_name" query = "select * from test.table_name" } } sink { Doris { fenodes = "doris_ip:8030" username = "user" password = "pwd" database = "test_db" table = "table_name" sink.enable-2pc = "true" sink.label-prefix = "test-cdc" doris.config = { format = "json" read_json_by_line="true" } } } ``` For more information about the configuration, please refer to [Basic Concepts of Configuration](../../introduction/concepts/config.md). ### Step 4: Run the SeaTunnel Application You can start the application using the following command: ```shell cd seatunnel/ ./bin/seatunnel.sh --config ./job/st.conf -m local ``` **Check the Output**: When you run the command, you can see its output in the console. You can consider this as an indicator of whether the command has succeeded or failed. 
The SeaTunnel console will print some log information like the following: ```shell *********************************************** Job Statistic Information *********************************************** Start Time : 2024-08-13 10:21:49 End Time : 2024-08-13 10:21:53 Total Time(s) : 4 Total Read Count : 1000 Total Write Count : 1000 Total Failed Count : 0 *********************************************** ``` :::tip If you want to optimize your job, refer to the connector documentation for [Source-MySQL](../../connectors/source/Mysql.md) and [Sink-Doris](../../connectors/sink/Doris.md). ::: ## What's More - Start write your own config file now, choose the [connector](../../connectors/source) you want to use, and configure the parameters according to the connector's documentation. - See [SeaTunnel Engine(Zeta)](../../engines/zeta/about.md) if you want to know more about SeaTunnel Engine. Here you will learn how to deploy SeaTunnel Engine and how to use it in cluster mode. ================================================ FILE: docs/en/getting-started/locally/quick-start-spark.md ================================================ --- sidebar_position: 4 --- # Quick Start With Spark ## Step 1: Deployment SeaTunnel And Connectors Before starting, make sure you have downloaded and deployed SeaTunnel as described in [Deployment](deployment.md) ## Step 2: Deploy And Config Spark Please [Download Spark](https://spark.apache.org/downloads.html) first(**required version >= 2.4.0**). For more information you can see [Getting Started: Standalone](https://spark.apache.org/docs/latest/spark-standalone.html#installing-spark-standalone-to-a-cluster) **Configure SeaTunnel**: Change the setting in `${SEATUNNEL_HOME}/config/seatunnel-env.sh` and set `SPARK_HOME` to the Spark deployment dir. ## Step 3: Add Job Config File To Define A Job Edit `config/seatunnel.streaming.conf.template`, which determines the way and logic of data input, processing, and output after seatunnel is started. 
The following is an example of the configuration file, which is the same as the example application mentioned above. ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } transform { FieldMapper { plugin_input = "fake" plugin_output = "fake1" field_mapper = { age = age name = new_name } } } sink { Console { plugin_input = "fake1" } } ``` More information about config please check [Config Concept](../../introduction/concepts/config.md) ## Step 4: Run SeaTunnel Application You could start the application by the following commands: Spark 2.4.x ```bash cd "apache-seatunnel-${version}" ./bin/start-seatunnel-spark-2-connector-v2.sh \ --master local[4] \ --deploy-mode client \ --config ./config/v2.streaming.conf.template ``` Spark3.x.x ```shell cd "apache-seatunnel-${version}" ./bin/start-seatunnel-spark-3-connector-v2.sh \ --master local[4] \ --deploy-mode client \ --config ./config/v2.streaming.conf.template ``` **See The Output**: When you run the command, you can see its output in your console. This is a sign to determine whether the command ran successfully or not. The SeaTunnel console will print some logs as below: ```shell fields : name, age types : STRING, INT row=1 : elWaB, 1984352560 row=2 : uAtnp, 762961563 row=3 : TQEIB, 2042675010 row=4 : DcFjo, 593971283 row=5 : SenEb, 2099913608 row=6 : DHjkg, 1928005856 row=7 : eScCM, 526029657 row=8 : sgOeE, 600878991 row=9 : gwdvw, 1951126920 row=10 : nSiKE, 488708928 row=11 : xubpl, 1420202810 row=12 : rHZqb, 331185742 row=13 : rciGD, 1112878259 row=14 : qLhdI, 1457046294 row=15 : ZTkRx, 1240668386 row=16 : SGZCr, 94186144 ``` ## What's More - Start write your own config file now, choose the [connector](../../connectors/source) you want to use, and configure the parameters according to the connector's documentation. 
- See [SeaTunnel With Spark](../../engines/spark.md) if you want to know more about SeaTunnel With Spark. - SeaTunnel have a builtin engine named `Zeta`, and it's the default engine of SeaTunnel. You can follow [Quick Start](quick-start-seatunnel-engine.md) to configure and run a data synchronization job. ================================================ FILE: docs/en/introduction/about.md ================================================ # About SeaTunnel seatunnel logo [![Slack](../../images/seatunnel-slack.svg)](https://s.apache.org/seatunnel-slack) [![Twitter Follow](../../images/ASFSeaTunnel.svg)](https://x.com/ASFSeaTunnel) SeaTunnel is a multimodal, ultra-high-performance, distributed data integration tool, capable of synchronizing vast amounts of data daily. It's trusted by numerous companies for its efficiency and stability. ## Why We Need SeaTunnel SeaTunnel focuses on data integration and data synchronization, and is mainly designed to solve common problems in the field of data integration: * **Various data sources**: There are hundreds of commonly-used data sources with incompatible versions. With the emergence of new technologies, more data sources are appearing. It is difficult for users to find a tool that can fully and quickly support these data sources. * **Multimodal data integration**: In addition to structured data, users also need to integrate video, images, binary files, structured and unstructured text data. However, existing data integration tools are mainly focused on structured data. * **Complex synchronization scenarios**: Data synchronization needs to support various synchronization scenarios such as offline-full synchronization, offline-incremental synchronization, CDC, real-time synchronization, and full database synchronization. 
* **High resource demand**: Existing data integration and data synchronization tools often require vast computing resources or JDBC connection resources to complete real-time synchronization of massive small tables. This has increased the burden on enterprises. * **Lack of quality and monitoring**: Data integration and synchronization processes often experience loss or duplication of data. The synchronization process lacks monitoring, and it is impossible to intuitively understand the real situation of the data during the task process. * **Complex technology stack**: The technology components used by enterprises are different, and users need to develop corresponding synchronization programs for different components to complete data integration. * **Difficulty in management and maintenance**: Limited to different underlying technology components (Flink/Spark), offline synchronization and real-time synchronization often have be developed and managed separately, which increases the difficulty of management and maintenance. ## Features Of SeaTunnel * **Rich and extensible Connector**: SeaTunnel provides a Connector API that does not depend on a specific execution engine. Connectors (Source, Transform, Sink) developed based on this API can run on many different engines, such as SeaTunnel Engine(Zeta), Flink, and Spark. * **Connector plugin**: The plugin design allows users to easily develop their own Connector and integrate it into the SeaTunnel project. Currently, SeaTunnel supports more than 100 Connectors, and the number is surging. * **Batch-stream integration**: Connectors developed based on the SeaTunnel Connector API are perfectly compatible with offline synchronization, real-time synchronization, full-synchronization, incremental synchronization and other scenarios. They greatly reduce the difficulty of managing data integration tasks. * **Distributed snapshot**: Supports a distributed snapshot algorithm to ensure data consistency. 
* **Multi-engine support**: SeaTunnel uses the SeaTunnel Engine(Zeta) for data synchronization by default. SeaTunnel also supports the use of Flink or Spark as the execution engine of the Connector to adapt to the enterprise's existing technical components. SeaTunnel supports multiple versions of Spark and Flink. * **JDBC multiplexing, database log multi-table parsing**: SeaTunnel supports multi-table or whole database synchronization, which solves the problem of over-JDBC connections; and supports multi-table or whole database log reading and parsing, which solves the need for CDC multi-table synchronization scenarios to deal with problems with repeated reading and parsing of logs. * **High throughput and low latency**: SeaTunnel supports parallel reading and writing, providing stable and reliable data synchronization capabilities with high throughput and low latency. * **Perfect real-time monitoring**: SeaTunnel supports detailed monitoring information of each step in the data synchronization process, allowing users to easily understand the number of data, data size, QPS and other information read and written by the synchronization task. * **Two job development methods are supported**: coding and canvas design. The SeaTunnel web project https://github.com/apache/seatunnel-web provides visual management of jobs, scheduling, running and monitoring capabilities. ## SeaTunnel Work Flowchart ![SeaTunnel Work Flowchart](../../images/architecture_diagram.png) The runtime process of SeaTunnel is shown in the figure above. The user configures the job information and selects the execution engine to submit the job. The Source Connector is responsible for parallel reading and sending the data to the downstream Transform or directly to the Sink, and the Sink writes the data to the destination. It is worth noting that Source, Transform and Sink can be easily developed and extended by yourself. SeaTunnel is an EtL(T) data integration tool. 
Therefore, in SeaTunnel, transform can only be used to perform some simple transformations on data, such as converting the data of a column to uppercase or lowercase, changing the column name, or splitting a column into multiple columns. The default engine use by SeaTunnel is [SeaTunnel Engine](../engines/zeta/about.md). If you choose to use the Flink or Spark engine, SeaTunnel will package the Connector into a Flink or Spark program and submit it to Flink or Spark to run. ## Connector - **Source Connectors** SeaTunnel supports reading data from various relational, graph, NoSQL, document, and memory databases; distributed file systems such as HDFS; and a variety of cloud storage solutions, such as S3 and OSS. We also support data reading of many common SaaS services. You can access the detailed list [Here](../connectors/source). If you want, You can develop your own source connector and easily integrate it into SeaTunnel. - **Transform Connector** If the schema is different between source and Sink, You can use the Transform Connector to change the schema read from source and make it the same as the Sink schema. - **Sink Connector** SeaTunnel supports writing data to various relational, graph, NoSQL, document, and memory databases; distributed file systems such as HDFS; and a variety of cloud storage solutions, such as S3 and OSS. We also support writing data to many common SaaS services. You can access the detailed list [Here](../connectors/sink). If you want, you can develop your own Sink connector and easily integrate it into SeaTunnel. ## Who Uses SeaTunnel SeaTunnel has lots of users. You can find more information about them in [Users](https://seatunnel.apache.org/user). ## Landscapes



      

    SeaTunnel enriches the CNCF CLOUD NATIVE Landscape.

    ## Learn more You can see [Quick Start](../getting-started/locally/deployment.md) for the next steps. ================================================ FILE: docs/en/introduction/concepts/config.md ================================================ # Intro To Config File In SeaTunnel, the most important thing is the config file, through which users can customize their own data synchronization requirements to maximize the potential of SeaTunnel. So next, I will introduce you how to configure the config file. The main format of the config file is `hocon`, for more details you can refer to [HOCON-GUIDE](https://github.com/lightbend/config/blob/main/HOCON.md), BTW, we also support the `json` format, but you should keep in mind that the name of the config file should end with `.json`. We also support the `SQL` format, please refer to [SQL configuration](../configuration/sql-config.md) for more details. ## Example Before you read on, you can find config file examples [Here](https://github.com/apache/seatunnel/tree/dev/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-1/src/test/resources) from the binary package's config directory. ## Config File Structure The config file is similar to the below one: :::caution warn The old configuration name `source_table_name`/`result_table_name` is deprecated, please migrate to the new name `plugin_input`/`plugin_output` as soon as possible. ::: ### hocon ```hocon env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { name = "string" age = "int" card = "int" } } } } transform { Filter { plugin_input = "fake" plugin_output = "fake1" fields = [name, card] } } sink { Clickhouse { host = "clickhouse:8123" database = "default" table = "seatunnel_console" fields = ["name", "card"] username = "default" password = "" plugin_input = "fake1" } } ``` As you can see, the config file contains several sections: env, source, transform, sink. 
Different modules have different functions. After you understand these modules, you will see how SeaTunnel works. ### env Used to add some engine optional parameters, no matter which engine (Zeta, Spark or Flink), the corresponding optional parameters should be filled in here. Note that we have separated the parameters by engine, and for the common parameters, we can configure them as before. For flink and spark engine, the specific configuration rules of their parameters can be referred to [JobEnvConfig](../configuration/JobEnvConfig.md). ### source Source is used to define where SeaTunnel needs to fetch data, and use the fetched data for the next step. Multiple sources can be defined at the same time. The supported source can be found in [Source of SeaTunnel](../connectors/source). Each source has its own specific parameters to define how to fetch data, and SeaTunnel also extracts the parameters that each source will use, such as the `plugin_output` parameter, which is used to specify the name of the data generated by the current source, which is convenient for follow-up used by other modules. ### transform When we have the data source, we may need to further process the data, so we have the transform module. Of course, this uses the word 'may', which means that we can also directly treat the transform as non-existent, directly from source to sink. Like below. ```hocon env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { name = "string" age = "int" card = "int" } } } } sink { Clickhouse { host = "clickhouse:8123" database = "default" table = "seatunnel_console" fields = ["name", "age", "card"] username = "default" password = "" plugin_input = "fake" } } ``` Like source, transform has specific parameters that belong to each module. 
The supported transform can be found in [Transform V2 of SeaTunnel](../transform-v2) ### sink Our purpose with SeaTunnel is to synchronize data from one place to another, so it is critical to define how and where data is written. With the sink module provided by SeaTunnel, you can complete this operation quickly and efficiently. Sink and source are very similar, but the difference is reading and writing. So please check out [Supported Sinks](../connectors/sink). ### Other Information You will find that when multiple sources and multiple sinks are defined, which data is read by each sink, and which is the data read by each transform? We introduce two key configurations called `plugin_output` and `plugin_input`. Each source module will be configured with a `plugin_output` to indicate the name of the data source generated by the data source, and other transform and sink modules can use `plugin_input` to refer to the corresponding data source name, indicating that I want to read the data for processing. Then transform, as an intermediate processing module, can use both `plugin_output` and `plugin_input` configurations at the same time. But you will find that in the above example config, not every module is configured with these two parameters, because in SeaTunnel, there is a default convention, if these two parameters are not configured, then the generated data from the last module of the previous node will be used. This is much more convenient when there is only one source. ## Multi-line Support In `hocon`, multiline strings are supported, which allows you to include extended passages of text without worrying about newline characters or special formatting. This is achieved by enclosing the text within triple quotes **`"""`** . For example: ``` var = """ Apache SeaTunnel is a next-generation high-performance, distributed, massive data integration tool. 
""" sql = """ select * from "table" """ ``` ## Json Format Support Before writing the config file, please make sure that the name of the config file should end with `.json`. ```json { "env": { "job.mode": "batch" }, "source": [ { "plugin_name": "FakeSource", "plugin_output": "fake", "row.num": 100, "schema": { "fields": { "name": "string", "age": "int", "card": "int" } } } ], "transform": [ { "plugin_name": "Filter", "plugin_input": "fake", "plugin_output": "fake1", "fields": ["name", "card"] } ], "sink": [ { "plugin_name": "Clickhouse", "host": "clickhouse:8123", "database": "default", "table": "seatunnel_console", "fields": ["name", "card"], "username": "default", "password": "", "plugin_input": "fake1" } ] } ``` ## Config Variable Substitution In a config file, we can define variables and replace them at runtime. However, note that only HOCON format files are supported. ### Usage of Variables: - `${varName}`: If the variable is not provided, an exception will be thrown. - `${varName:default}`: If the variable is not provided, the default value will be used. If you set a default value, it should be enclosed in double quotes. - `${varName:}`: If the variable is not provided, an empty string will be used. If you do not set the variable value through `-i`, you can also pass the value by setting the system environment variables. Variable substitution supports obtaining variable values through environment variables. For example, you can set the environment variable in the shell script as follows: ```shell export varName="value with space" ``` Then you can use the variable in the config file. If you set a variable without a default value in the configuration file but do not pass it during execution, the value of the variable will be retained and the system will not throw an exception. But please ensure that other processes can correctly parse the variable value. For example, ElasticSearch's index needs to support a format like '${xxx}' to dynamically specify the index. 
If other processes are not supported, the program may not run properly. ### Example: ```hocon env { job.mode = "BATCH" job.name = ${jobName} parallelism = 2 } source { FakeSource { plugin_output = "${resName:fake_test}_table" row.num = "${rowNum:50}" string.template = ${strTemplate} int.template = [20, 21] schema = { fields { name = "${nameType:string}" age = ${ageType} } } } } transform { sql { plugin_input = "${resName:fake_test}_table" plugin_output = "sql" query = "select * from ${resName:fake_test}_table where name = '${nameVal}' " } } sink { Console { plugin_input = "sql" username = ${username} password = ${password} } } ``` In the configuration above, we have defined several variables like `${rowNum}`, `${resName}`. We can replace these parameters using the following shell command: ```shell ./bin/seatunnel.sh -c -i jobName='this_is_a_job_name' -i strTemplate=['abc','d~f','hi'] -i ageType=int -i nameVal=abc -i username=seatunnel=2.3.1 -i password='$a^b%c.d~e0*9(' -m local ``` In this case, `resName`, `rowNum`, and `nameType` are not set, so they will take their default values. The final submitted configuration would be: ```hocon env { job.mode = "BATCH" job.name = "this_is_a_job_name" parallelism = 2 } source { FakeSource { plugin_output = "fake_test_table" row.num = 50 string.template = ['abc','d~f','hi'] int.template = [20, 21] schema = { fields { name = "string" age = "int" } } } } transform { sql { plugin_input = "fake_test_table" plugin_output = "sql" query = "select * from dual where name = 'abc' " } } sink { Console { plugin_input = "sql" username = "seatunnel=2.3.1" password = "$a^b%c.d~e0*9(" } } ``` ### Important Notes: - If a value contains special characters like `(`, enclose it in single quotes (`'`). - If the substitution variable contains double or single quotes (e.g., `"resName"` or `"nameVal"`), you need to include them with the value. - The value cannot contain spaces (`' '`). 
For example, `-i jobName='this is a job name'` will be replaced with `job.name = "this"`. You can use environment variables to pass values with spaces. - For dynamic parameters, you can use the following format: `-i date=$(date +"%Y%m%d")`. - Cannot use specified system reserved characters; they will not be replaced by `-i`, such as: `${database_name}`, `${schema_name}`, `${table_name}`, `${schema_full_name}`, `${table_full_name}`, `${primary_key}`, `${unique_key}`, `${field_names}`, `${partition_keys}`. For details, please refer to [Sink Parameter Placeholders](../configuration/sink-options-placeholders.md). ## What's More - Start write your own config file now, choose the [connector](../connectors/source) you want to use, and configure the parameters according to the connector's documentation. - If you want to know the details of the format configuration, please see [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md). ================================================ FILE: docs/en/introduction/concepts/connector-v2-features.md ================================================ # Intro To Connector V2 Features ## Differences Between Connector V2 And V1 Since https://github.com/apache/seatunnel/issues/1608 We Added Connector V2 Features. Connector V2 is a connector defined based on the SeaTunnel Connector API interface. Unlike Connector V1, V2 supports the following features: * **Multi Engine Support** SeaTunnel Connector API is an engine independent API. The connectors developed based on this API can run in multiple engines. Currently, Flink and Spark are supported, and we will support other engines in the future. * **Multi Engine Version Support** Decoupling the connector from the engine through the translation layer solves the problem that most connectors need to modify the code in order to support a new version of the underlying engine. * **Unified Batch And Stream** Connector V2 can perform batch processing or streaming processing. 
We do not need to develop connectors for batch and stream separately. * **Multiplexing JDBC/Log connection.** Connector V2 supports JDBC resource reuse and sharing database log parsing. * **Multimodal Data Integration** Connector V2 supports multimodal data integration, including structured and unstructured text data, video, images, binary files, etc. ## Source Connector Features Source connectors have some common core features, and each source connector supports them to varying degrees. ### exactly-once If each piece of data in the data source will only be sent downstream by the source once, we think this source connector supports exactly once. In SeaTunnel, we can save the read **Split** and its **offset** (The position of the read data in split at that time, such as line number, byte size, offset, etc.) as **StateSnapshot** when checkpointing. If the task restarted, we will get the last **StateSnapshot** and then locate the **Split** and **offset** read last time and continue to send data downstream. For example `File`, `Kafka`. ### column projection If the connector supports reading only specified columns from the data source (Note that if you read all columns first and then filter unnecessary columns through the schema, this method is not a real column projection) For example `JDBCSource` can use sql to define reading columns. `KafkaSource` will read all content from topic and then use `schema` to filter unnecessary columns, This is not `column projection`. ### batch Batch Job Mode, The data read is bounded and the job will stop after completing all data read. ### stream Streaming Job Mode, The data read is unbounded and the job never stop. ### parallelism Parallelism Source Connector support config `parallelism`, every parallelism will create a task to read the data. In the **Parallelism Source Connector**, the source will be split into multiple splits, and then the enumerator will allocate the splits to the SourceReader for processing. 
### multimodal Support multimodal data integration, including structured and unstructured text data, video, images, binary files, etc. ### support user-defined split User can config the split rule. ### support multiple table read Supports reading multiple tables in one SeaTunnel job ## Sink Connector Features Sink connectors have some common core features, and each sink connector supports them to varying degrees. ### exactly-once When any piece of data flows into a distributed system, if the system processes any piece of data accurately only once in the whole processing process and the processing results are correct, it is considered that the system meets the exact once consistency. For sink connector, the sink connector supports exactly-once if any piece of data only write into target once. There are generally two ways to achieve this: * The target database supports key deduplication. For example `MySQL`, `Kudu`. * The target support **XA Transaction**(This transaction can be used across sessions. Even if the program that created the transaction has ended, the newly started program only needs to know the ID of the last transaction to resubmit or roll back the transaction). Then we can use **Two-phase Commit** to ensure **exactly-once**. For example `File`, `MySQL`. ### cdc(change data capture) If a sink connector supports writing row kinds(INSERT/UPDATE_BEFORE/UPDATE_AFTER/DELETE) based on primary key, we think it supports cdc(change data capture). ### support multiple table write Supports write multiple tables in one SeaTunnel job, users can dynamically specify the table's identifier by [configuring placeholders](../configuration/sink-options-placeholders.md). ### multimodal Support multimodal data integration, including structured and unstructured text data, video, images, binary files, etc. 
================================================ FILE: docs/en/introduction/concepts/gravitino-type-mapping.md ================================================ # Gravitino Type Mapping This document describes the type mapping between Apache Gravitino and SeaTunnel when using Gravitino as the metadata source. The type conversion is handled by `GravitinoTableSchemaConvertor`. ## Overview When SeaTunnel reads table schema from Gravitino, the Gravitino column types are automatically converted to corresponding SeaTunnel data types. This mapping enables seamless integration between Gravitino-managed metadata and SeaTunnel's data processing pipeline. ## Primitive Type Mapping | Gravitino Type | Gravitino JSON Representation | SeaTunnel Type | SeaTunnel Type Keyword | Java Type | Notes | |:-----------------|:------------------------------|:--------------------------------------|:-----------------------|:---------------------------|:----------------------------------------------------------| | Boolean | `boolean` | `BasicType.BOOLEAN_TYPE` | `boolean` | `java.lang.Boolean` | - | | Byte | `byte` | `BasicType.BYTE_TYPE` | `tinyint` | `java.lang.Byte` | - | | Unsigned Byte | `byte unsigned` | `BasicType.BYTE_TYPE` | `tinyint` | `java.lang.Byte` | Unsigned flag is ignored | | Short | `short` | `BasicType.SHORT_TYPE` | `smallint` | `java.lang.Short` | - | | Unsigned Short | `short unsigned` | `BasicType.SHORT_TYPE` | `smallint` | `java.lang.Short` | Unsigned flag is ignored | | Integer | `integer` | `BasicType.INT_TYPE` | `int` | `java.lang.Integer` | - | | Unsigned Integer | `integer unsigned` | `BasicType.INT_TYPE` | `int` | `java.lang.Integer` | Unsigned flag is ignored | | Long | `long` | `BasicType.LONG_TYPE` | `bigint` | `java.lang.Long` | - | | Unsigned Long | `long unsigned` | `BasicType.LONG_TYPE` | `bigint` | `java.lang.Long` | Unsigned flag is ignored | | Float | `float` | `BasicType.FLOAT_TYPE` | `float` | `java.lang.Float` | Single-precision floating point | | 
Double | `double` | `BasicType.DOUBLE_TYPE` | `double` | `java.lang.Double` | Double-precision floating point | | Decimal | `decimal(p, s)` | `DecimalType(p, s)` | `"decimal(p,s)"` | `java.math.BigDecimal` | Precision: 1-38, Scale: 0-precision | | String | `string` | `BasicType.STRING_TYPE` | `string` | `java.lang.String` | Variable-length string | | FixedChar | `char(l)` | `BasicType.STRING_TYPE` | `string` | `java.lang.String` | Fixed-length string, length stored in columnLength | | VarChar | `varchar(l)` | `BasicType.STRING_TYPE` | `string` | `java.lang.String` | Variable-length string, max length stored in columnLength | | UUID | `uuid` | `BasicType.STRING_TYPE` | `string` | `java.lang.String` | Universally unique identifier | | Date | `date` | `LocalTimeType.LOCAL_DATE_TYPE` | `date` | `java.time.LocalDate` | Date without time | | Time | `time` | `LocalTimeType.LOCAL_TIME_TYPE` | `time` | `java.time.LocalTime` | Time without date | | Timestamp | `timestamp(p)` | `LocalTimeType.LOCAL_DATE_TIME_TYPE` | `timestamp` | `java.time.LocalDateTime` | Timestamp without timezone, p=0-12 | | TimestampTz | `timestamp_tz(p)` | `LocalTimeType.OFFSET_DATE_TIME_TYPE` | `timestamp_tz` | `java.time.OffsetDateTime` | Timestamp with timezone, p=0-12 | | Binary | `binary` | `PrimitiveByteArrayType.INSTANCE` | `bytes` | `byte[]` | Variable-length binary | | Fixed | `fixed(l)` | `PrimitiveByteArrayType.INSTANCE` | `bytes` | `byte[]` | Fixed-length binary | | IntervalYear | `interval_year` | `BasicType.STRING_TYPE` | `string` | `java.lang.String` | Year-month interval | | IntervalDay | `interval_day` | `BasicType.STRING_TYPE` | `string` | `java.lang.String` | Day-time interval | ## Complex Type Mapping | Gravitino Type | Gravitino JSON Representation | SeaTunnel Type | SeaTunnel Type Keyword | Notes | 
|:---------------|:------------------------------------------------------------------------------------|:------------------------|:------------------------------------|:--------------------------------------------| | List | `{"type": "list", "elementType": type, "containsNull": boolean}` | `ArrayType<T>` | `"array<T>"` | T is the element type | | Map | `{"type": "map", "keyType": type, "valueType": type, "valueContainsNull": boolean}` | `MapType<K, V>` | `"map<K, V>"` | K is key type, V is value type | | Struct | `{"type": "struct", "fields": [...]}` | `SeaTunnelRowType` | `{field1=type1, field2=type2, ...}` | Nested row type | | External | `{"type": "external", "catalogString": "user-defined"}` | `BasicType.STRING_TYPE` | `string` | For unsupported types like PostgreSQL jsonb | | Union | `{"type": "union", "types": [...]}` | Not Supported | - | Throws conversion error | ## Type Parameter Extraction The converter extracts type parameters for column metadata: | Type | Parameter | Extracted As | Notes | |:------------------|:-----------------|:------------------------------------|:------------------------------------| | `decimal(p, s)` | precision, scale | columnLength=precision, scale=scale | Both values stored | | `varchar(l)` | length | columnLength=length | Maximum string length | | `char(l)` | length | columnLength=length | Fixed string length | | `fixed(l)` | length | columnLength=length | Fixed binary length | | `timestamp(p)` | precision | columnLength=precision | Fractional seconds precision (0-12) | | `timestamp_tz(p)` | precision | columnLength=precision | Fractional seconds precision (0-12) | ## Index and Constraint Mapping Gravitino indexes are mapped to SeaTunnel constraints: | Gravitino Index Type | SeaTunnel Constraint Type | Notes | |:---------------------|:---------------------------|:--------------------------------------------| | `PRIMARY_KEY` | `PrimaryKey` | Extracts column names from fieldNames array | | `UNIQUE_KEY` | `ConstraintKey.UNIQUE_KEY` | Column sort 
order defaults to ASC | ## Notes and Limitations 1. **Case Insensitivity**: Type matching is case-insensitive. `BOOLEAN`, `boolean`, and `Boolean` are treated the same. 2. **Unsigned Types**: The `unsigned` modifier for numeric types is recognized but does not affect the converted SeaTunnel type. SeaTunnel uses signed types internally. 3. **External Types**: When Gravitino encounters a type it cannot parse (such as PostgreSQL's `jsonb`), it represents it as an `external` type. SeaTunnel converts these to `string` type. 4. **Union Types**: Gravitino's `union` type is not currently supported and will throw a conversion error. 5. **Nullable**: The `nullable` attribute in Gravitino column definitions is preserved in the SeaTunnel `Column` metadata. 6. **Decimal Parameters**: The `decimal` type requires both precision and scale parameters. Decimal values without parameters or with invalid format will throw an error. ## Related Documentation - [Gravitino Column Types](https://gravitino.apache.org/docs/1.1.0/manage-relational-metadata-using-gravitino/#apache-gravitino-table-column-type) - [Schema Feature](./schema-feature.md) - [SeaTunnel Data Types](../common-options.md) ================================================ FILE: docs/en/introduction/concepts/incompatible-changes.md ================================================ # Incompatible Changes This document records the incompatible updates between each version. You need to check this document before you upgrade to related version. ## dev ### API Changes - **Breaking Change: Engine REST table metrics key format** - **Affected component**: SeaTunnel Engine REST API (job metrics in `/job-info`) - **Description**: To support multiple Sources/Sinks/Transforms processing the same table, the key format of table-level metrics has changed from `{tableName}` to `{VertexIdentifier}.{tableName}` (for example, `Sink[0].fake.user_table`). 
- **Impact**: Existing Grafana dashboards, Prometheus alert rules, and custom monitoring integrations that reference the old keys must be updated. **Before** ```json { "TableSinkWriteCount": { "fake.user_table": "15" } } ``` **After** ```json { "TableSinkWriteCount": { "Sink[0].fake.user_table": "10", "Sink[1].fake.user_table": "5" } } ``` ### Configuration Changes ### Connector Changes ### Transform Changes - **[BREAKING]** SQL Transform `PARSEDATETIME`, `TO_DATE`, and `IS_DATE` functions now only accept whitelisted datetime format patterns. Custom format patterns that were previously accepted will now fail at runtime. The supported patterns are: - DateTime: `yyyy-MM-dd HH:mm:ss`, `yyyy-MM-dd HH:mm:ss.SSS`, `yyyy-MM-dd'T'HH:mm:ss`, `yyyy-MM-dd'T'HH:mm:ss.SSS`, `yyyy/MM/dd HH:mm:ss`, `yyyy/MM/dd HH:mm:ss.SSS`, `yyyyMMddHHmmss` - Date: `yyyy-MM-dd`, `yyyy/MM/dd`, `yyyyMMdd` - Time: `HH:mm:ss`, `HH:mm:ss.SSS`, `HHmmss` **Exception Type Change**: Invalid datetime format patterns now throw `SeaTunnelRuntimeException` instead of `TransformException`. If you have error handling or monitoring systems that catch `TransformException` for datetime parsing errors, you will need to update them to handle `SeaTunnelRuntimeException`. **Migration Guide**: If you are using custom datetime format patterns in `PARSEDATETIME`, `TO_DATE`, or `IS_DATE` functions, you must update your queries to use one of the supported patterns above. If your data uses a different format, you may need to preprocess the input data to match a supported format, or use string manipulation functions to transform the format before parsing. - DataValidator transform: In `row_error_handle_way = ROUTE_TO_TABLE` mode, the routed error row `table_id` now includes the upstream database/schema prefix (for example, `db1.ffp` / `db1.schema1.ffp` instead of `ffp`). 
- Adjusted SQL Transform date & time functions: - `DATEDIFF(<start>, <end>, 'MONTH')` now returns the total number of months between the two dates across years (for example, from `2023-01-01` to `2024-03-01` returns `14` instead of `15`). - `WEEK(<date>)` now returns the ISO week number directly (previous behavior added an extra `+1` to the ISO week value). ### Engine Behavior Changes ### Dependency Upgrades ================================================ FILE: docs/en/introduction/concepts/schema-feature.md ================================================ # Intro To Schema Feature ## Why We Need Schema Some NoSQL databases and message queues do not enforce a strict schema, so the schema cannot be obtained through their API. In this case, a schema needs to be defined so that it can be converted to a TableSchema and used to read the data. ## SchemaOptions We can use SchemaOptions to define schema, the SchemaOptions contains some configs to define the schema. e.g. columns, primaryKey, constraintKeys. ``` schema = { table = "database.schema.table" schema_first = false comment = "comment" partition_keys = ["dt"] columns = [ ... ] primaryKey { ... } constraintKeys { ... } } ``` ### table The table full name of the table identifier which the schema belongs to, it contains database, schema, table name. e.g. `database.schema.table`, `database.table`, `table`. ### schema_url Get the http url of metadata information through restApi, such as: `http://localhost:8090/api/metalakes/laowang_test/catalogs/221-pgsql/schemas/ykw/tables/all_type` > When using Gravitino as the metadata source, the column types from Gravitino will be automatically converted to SeaTunnel data types. For detailed type mapping information, please refer to [Gravitino Type Mapping](./gravitino-type-mapping.md). #### schema_url Examples **1. 
Single table with table and schema_url:** ```hocon source { LocalFile { path = "/tmp/data" file_format_type = "json" schema { table = "db.table2" schema_url = "http://gravitino:8090/api/metalakes/test_metalake/catalogs/test_catalog/schemas/test_schema/tables/table2" } } } ``` **2. Single table with schema_url only (without table attribute):** ```hocon source { LocalFile { path = "/tmp/data" file_format_type = "json" schema { schema_url = "http://gravitino:8090/api/metalakes/test_metalake/catalogs/test_catalog/schemas/test_schema/tables/table2" } } } ``` **3. Multi-table with columns and schema_url:** ```hocon source { LocalFile { tables_configs = [ { path = "/tmp/data/table1" file_format_type = "json" schema { table = "db.table1" columns = [ { name = id, type = bigint, nullable = false }, { name = name, type = string }, { name = age, type = int } ] } }, { path = "/tmp/data/table2" file_format_type = "json" schema { table = "db.table2" schema_url = "http://gravitino:8090/api/metalakes/test_metalake/catalogs/test_catalog/schemas/test_schema/tables/table2" } } ] } } ``` ### schema_first Default is false. If the schema_first is true, the schema will be used first, this means if we set `table = "a.b"`, `a` will be parsed as schema rather than database, then we can support write `table = "schema.table"`. ### comment The comment of the CatalogTable which the schema belongs to. ### partition_keys The partition keys of the CatalogTable which the schema belongs to. This metadata can be used by sink options placeholders such as `${partition_keys}` (for example, to create partitioned Iceberg tables in multi-table sync jobs). ### Columns Columns is a list of configs used to define the column in schema, each column can contains name, type, nullable, defaultValue, comment field. 
``` columns = [ { name = id type = bigint nullable = false columnLength = 20 defaultValue = 0 comment = "primary key id" } ] ``` | Field | Required | Default Value | Description | |:-------------|:---------|:--------------|----------------------------------------------------------------------------------| | name | Yes | - | The name of the column | | type | Yes | - | The data type of the column | | nullable | No | true | If the column can be nullable | | columnLength | No | 0 | The length of the column which will be useful when you need to define the length | | columnScale | No | - | The scale of the column which will be useful when you need to define the scale | | defaultValue | No | null | The default value of the column | | comment | No | null | The comment of the column | #### What type supported at now | Data type | Value type in Java | Description | |:-------------|:---------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | string | `java.lang.String` | string | | boolean | `java.lang.Boolean` | boolean | | tinyint | `java.lang.Byte` | -128 to 127 regular. 0 to 255 unsigned*. Specify the maximum number of digits in parentheses. | | smallint | `java.lang.Short` | -32768 to 32767 General. 0 to 65535 unsigned*. Specify the maximum number of digits in parentheses. | | int | `java.lang.Integer` | All numbers from -2,147,483,648 to 2,147,483,647 are allowed. | | bigint | `java.lang.Long` | All numbers between -9,223,372,036,854,775,808 and 9,223,372,036,854,775,807 are allowed. | | float | `java.lang.Float` | Float-precision numeric data from -1.79E+308 to 1.79E+308. 
| | double | `java.lang.Double` | Double precision floating point. Handle most decimals. | | decimal | `java.math.BigDecimal` | Double type stored as a string, allowing a fixed decimal point. | | null | `java.lang.Void` | null | | bytes | `byte[]` | bytes | | date | `java.time.LocalDate` | Only the date is stored. From January 1, 0001 to December 31, 9999. | | time | `java.time.LocalTime` | Only store time. Accuracy is 100 nanoseconds. | | timestamp | `java.time.LocalDateTime` | Stores date and time information without time zone. Represents the time of an event in local time. It does not include any offset or zone information. | | timestamp_tz | `java.time.OffsetDateTime` | Stores date and time information with an offset from UTC. It includes both the local date-time and the offset from UTC, providing more precise temporal information when working with multiple time zones. | | row | `org.apache.seatunnel.api.table.type.SeaTunnelRowType` | Row type, can be nested. | | map | `java.util.Map` | A Map is an object that maps keys to values. The key type includes `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double` `decimal` `date` `time` `timestamp` `null` , and the value type includes `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double` `decimal` `date` `time` `timestamp` `null` `array` `map` `row`. | | array | `ValueType[]` | A array is a data type that represents a collection of elements. The element type includes `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double`. | #### How to declare type supported SeaTunnel provides a simple and direct way to declare basic types. Basic type keywords include `string`, `boolean`, `tinyint`, `smallint`, `int`, `bigint`, `float`, `double`, `date`, `time`, `timestamp`, and `null`. The keyword names for basic types can be used directly as type declarations, and SeaTunnel is case-insensitive to type keywords. 
For example, if you need to declare a field with integer type, you can simply define the field as `int` or `"int"`. > The null type declaration must be enclosed in double quotes, like `"null"`. This approach helps avoid confusion with [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md)'s `null` type which represents an undefined object. When declaring complex types (such as **decimal**, **array**, **map**, and **row**), pay attention to specific considerations. - When declaring a decimal type, precision and scale settings are required, and the type definition follows the format `decimal(precision, scale)`. It's essential to emphasize that the declaration of the decimal type must be enclosed in `"`; you cannot use the type name directly, as with basic types. For example, when declaring a decimal field with precision 10 and scale 2, you specify the field type as `"decimal(10,2)"`. - When declaring an array type, you need to specify the element type, and the type definition follows the format `array<T>`, where `T` represents the element type. The element type includes `int`,`string`,`boolean`,`tinyint`,`smallint`,`bigint`,`float` and `double`. Similar to the decimal type declaration, it must also be enclosed in `"`. For example, when declaring a field with an array of integers, you specify the field type as `"array<int>"`. - When declaring a map type, you need to specify the key and value types. The map type definition follows the format `map<K,V>`, where `K` represents the key type and `V` represents the value type. `K` can be any basic type and decimal type, and `V` can be any type supported by SeaTunnel. Similar to previous type declarations, the map type declaration must be enclosed in double quotes. For example, when declaring a field with map type, where the key type is string and the value type is integer, you can declare the field as `"map<string, int>"`. 
- When declaring a row type, you need to define a [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md) object to describe the fields and their types. The field types can be any type supported by SeaTunnel. For example, when declaring a row type containing an integer field `a` and a string field `b`, you can declare it as `{a = int, b = string}`. Enclosing the definition in `"` as a string is also acceptable, so `"{a = int, b = string}"` is equivalent to `{a = int, b = string}`. Since HOCON is compatible with JSON, `"{\"a\":\"int\", \"b\":\"string\"}"` is equivalent to `"{a = int, b = string}"`. Here is an example of complex type declarations: ```hocon schema { fields { c_decimal = "decimal(10, 2)" c_array = "array<int>" c_row = { c_int = int c_string = string c_row = { c_int = int } } # Hocon style declare row type in generic type map0 = "map<string, {c_int = int, c_string = string, c_row = {c_int = int}}>" # Json style declare row type in generic type map1 = "map<string, {\"c_int\":\"int\", \"c_string\":\"string\", \"c_row\":{\"c_int\":\"int\"}}>" } } ``` ### PrimaryKey Primary key is a config used to define the primary key in schema, it contains the name and columnNames fields. ``` primaryKey { name = id columnNames = [id] } ``` | Field | Required | Default Value | Description | |:------------|:---------|:--------------|-----------------------------------| | name | Yes | - | The name of the primaryKey | | columnNames | Yes | - | The column list in the primaryKey | ### ConstraintKeys Constraint keys is a list of config used to define the constraint keys in schema, it contains constraintName, constraintType, constraintColumns field. 
``` constraintKeys = [ { constraintName = "id_index" constraintType = INDEX_KEY constraintColumns = [ { columnName = "id" sortType = ASC } ] }, ] ``` | Field | Required | Default Value | Description | |:------------------|:---------|:--------------|-------------------------------------------------------------------------------------------------------------------------------------------| | constraintName | Yes | - | The name of the constraintKey | | constraintType | No | KEY | The type of the constraintKey | | constraintColumns | Yes | - | The column list in the constraintKey, each column should contain columnName and sortType; sortType supports ASC and DESC, default is ASC | #### What constraintType supported at now | ConstraintType | Description | |:---------------|:------------| | INDEX_KEY | key | | UNIQUE_KEY | unique key | ## Multi table schemas ``` tables_configs = [ { schema { table = "database.schema.table1" schema_first = false comment = "comment" columns = [ ... ] primaryKey { ... } constraintKeys { ... } } }, { schema = { table = "database.schema.table2" schema_first = false comment = "comment" columns = [ ... ] primaryKey { ... } constraintKeys { ... } } } ] ``` ## How to use schema ### Recommended ``` source { FakeSource { parallelism = 2 plugin_output = "fake" row.num = 16 schema { table = "FakeDatabase.FakeTable" columns = [ { name = id type = bigint nullable = false defaultValue = 0 comment = "primary key id" }, { name = name type = "string" nullable = true comment = "name" }, { name = age type = int nullable = true comment = "age" } ] primaryKey { name = "id" columnNames = [id] } constraintKeys = [ { constraintName = "unique_name" constraintType = UNIQUE_KEY constraintColumns = [ { columnName = "name" sortType = ASC } ] }, ] } } } ``` ### Deprecated If you only need to define the columns, you can use `fields` to define them. This is a simple way, but it will be removed in the future. 
``` source { FakeSource { parallelism = 2 plugin_output = "fake" row.num = 16 schema = { fields { id = bigint c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(2, 1)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } ``` ## When we should use it or not If there is a `schema` configuration project in Options,the connector can then customize the schema. Like `Fake` `Pulsar` `Http` source connector etc. ================================================ FILE: docs/en/introduction/configuration/JobEnvConfig.md ================================================ # Job Env Config This document describes env configuration information. The common parameters can be used in all engines. In order to better distinguish between engine parameters, the additional parameters of other engine need to carry a prefix. In flink engine, we use `flink.` as the prefix. In the spark engine, we do not use any prefixes to modify parameters, because the official spark parameters themselves start with `spark.` ## Common Parameter The following configuration parameters are common to all engines. ### job.name This parameter configures the task name. ### jars Third-party packages can be loaded via `jars`, like `jars="file://local/jar1.jar;file://local/jar2.jar"`. ### job.mode You can configure whether the task is in batch or stream mode through `job.mode`, like `job.mode = "BATCH"` or `job.mode = "STREAMING"` ### checkpoint.interval Gets the interval (milliseconds) in which checkpoints are periodically scheduled. In `STREAMING` mode, checkpoints is required, if you do not set it, it will be obtained from the application configuration file `seatunnel.yaml`. In `BATCH` mode, you can disable checkpoints by not setting this parameter. In Zeta `STREAMING` mode, the default value is 30000 milliseconds. 
### checkpoint.timeout The timeout (in milliseconds) for a checkpoint. If the checkpoint is not completed before the timeout, the job will fail. In Zeta, the default value is 30000 milliseconds. ### parallelism This parameter configures the parallelism of source and sink. ### shade.identifier Specifies the encryption method. If you do not need to encrypt or decrypt config files, this option can be ignored. For more details, you can refer to the documentation [Config Encryption Decryption](./config-encryption-decryption.md). ## Zeta Engine Parameter ### job.retry.times Used to control the default retry times when a job fails. The default value is 3, and it only works in the Zeta engine. ### job.retry.interval.seconds Used to control the default retry interval when a job fails. The default value is 3 seconds, and it only works in the Zeta engine. ### savemode.execute.location This parameter is used to specify the location of the savemode when the job is executed in the Zeta engine. The default value is `CLUSTER`, which means that the savemode is executed on the cluster. If you want to execute the savemode on the client, you can set it to `CLIENT`. Please use `CLUSTER` mode as much as possible, because when there are no problems with `CLUSTER` mode, we will remove `CLIENT` mode. ## Flink Engine Parameter Here are some SeaTunnel parameter names corresponding to the names in Flink, not all of them. Please refer to the official [Flink Documentation](https://flink.apache.org/). | Flink Configuration Name | SeaTunnel Configuration Name | |---------------------------------|---------------------------------------| | pipeline.max-parallelism | flink.pipeline.max-parallelism | | execution.checkpointing.mode | flink.execution.checkpointing.mode | | execution.checkpointing.timeout | flink.execution.checkpointing.timeout | | ... | ... 
| ## Spark Engine Parameter Because Spark configuration items have not been modified, they are not listed here, please refer to the official [Spark Documentation](https://spark.apache.org/). ================================================ FILE: docs/en/introduction/configuration/config-encryption-decryption.md ================================================ # Config File Encryption And Decryption ## Introduction In most production environments, sensitive configuration items such as passwords must be encrypted and cannot be stored in plain text. SeaTunnel provides a convenient one-stop solution for this. ## How to use SeaTunnel comes with built-in base64 encryption and decryption, but it is not recommended for production use; it is recommended that users implement custom encryption and decryption logic. You can refer to the chapter [How to implement user-defined encryption and decryption](#how-to-implement-user-defined-encryption-and-decryption) to get more details about it. Base64 encryption supports encrypting the following parameters by default: - username - password - auth - token - access_key - secret_key And users can add custom parameters to `shade.options` for encryption and decryption. Next, I'll show how to quickly use SeaTunnel's own `base64` encryption: 1. Add the new options `shade.identifier` and `shade.options` in the env block of the config file. `shade.identifier` indicates the encryption method that you want to use, while `shade.options` specifies which parameters should be encrypted/decrypted. In this example, we should add `shade.identifier = base64` in the config as shown below: ```hocon # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. 
# The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # env { parallelism = 1 shade.identifier = "base64" shade.options = ["username", "password", "f1", "config1.f1", "config2.list"] } source { MySQL-CDC { plugin_output = "fake" parallelism = 1 server-id = 5656 port = 56725 hostname = "127.0.0.1" username = "seatunnel" password = "seatunnel_password" database-name = "inventory_vwyw0n" table-name = "products" url = "jdbc:mysql://localhost:56725" f1 = "seatunnel" # custom shade options config1.f1 = "seatunnel" config2.list = ["seatunnel", "seatunnel", "seatunnel"] } } transform { } sink { # choose stdout output plugin to output data to console Clickhouse { host = "localhost:8123" database = "default" table = "fake_all" username = "seatunnel" password = "seatunnel_password" # cdc options primary_key = "id" support_upsert = true } } ``` 2. 
Use the shell script that matches your compute engine to encrypt the config file; in this example we use Zeta: ```shell ${SEATUNNEL_HOME}/bin/seatunnel.sh --config config/v2.batch.template --encrypt ``` Then you can see the encrypted configuration file in the terminal: ```log 2023-02-20 17:50:58,319 INFO org.apache.seatunnel.core.starter.command.ConfEncryptCommand - Encrypt config: { "env" : { "parallelism" : 1, "shade.identifier" : "base64" }, "source" : [ { "url" : "jdbc:mysql://localhost:56725", "hostname" : "127.0.0.1", "password" : "c2VhdHVubmVsX3Bhc3N3b3Jk", "port" : 56725, "database-name" : "inventory_vwyw0n", "parallelism" : 1, "plugin_output" : "fake", "table-name" : "products", "plugin_name" : "MySQL-CDC", "server-id" : 5656, "username" : "c2VhdHVubmVs", "f1" : "c2VhdHVubmVs", "config1.f1" : "c2VhdHVubmVs", "config2.list" : ["c2VhdHVubmVs","c2VhdHVubmVs","c2VhdHVubmVs"] } ], "transform" : [], "sink" : [ { "database" : "default", "password" : "c2VhdHVubmVsX3Bhc3N3b3Jk", "support_upsert" : true, "host" : "localhost:8123", "plugin_name" : "Clickhouse", "primary_key" : "id", "table" : "fake_all", "username" : "c2VhdHVubmVs" } ] } ``` 3. Encryption is not the only supported operation: if you want to see the decrypted configuration file, you can execute this command: ```shell ${SEATUNNEL_HOME}/bin/seatunnel.sh --config config/v2.batch.template --decrypt ``` ## How to implement user-defined encryption and decryption If you want to customize the encryption method and the configuration of the encryption, this section will help you to solve the problem. 1. Create a Java Maven project 2. Add the `seatunnel-api` module with the `provided` scope to the dependencies, as shown below: ```xml <dependency> <groupId>org.apache.seatunnel</groupId> <artifactId>seatunnel-api</artifactId> <version>${seatunnel.version}</version> <scope>provided</scope> </dependency> ``` 3.
Create a new class that implements the `ConfigShade` interface. This interface has the following methods: ```java /** * The interface that provides the ability to encrypt and decrypt {@link * org.apache.seatunnel.shade.com.typesafe.config.Config} */ public interface ConfigShade { /** * The unique identifier of the current interface, used it to select the correct {@link * ConfigShade} */ String getIdentifier(); /** * Encrypt the content * * @param content The content to encrypt */ String encrypt(String content); /** * Decrypt the content * * @param content The content to decrypt */ String decrypt(String content); /** To expand the options that user want to encrypt */ default String[] sensitiveOptions() { return new String[0]; } } ``` 4. Create a file named `org.apache.seatunnel.api.configuration.ConfigShade` in `resources/META-INF/services`. The file content should be the fully qualified class name of the class that you defined in step 3. 5. Package it as a jar and add the jar to `${SEATUNNEL_HOME}/lib` 6. Change the option `shade.identifier` in your config file to the value that you defined in `ConfigShade#getIdentifier`, and enjoy it \^_\^ ### How to encrypt and decrypt with customized params If you want to encrypt and decrypt with customized params, you can follow the steps below: 1. Add a configuration named `shade.properties` in the env part of the configuration file. The value of this configuration is in the form of key-value pairs (the type of the key must be a string), as shown below: ```hocon env { shade.properties = { suffix = "666" } } ``` 2. Override the `ConfigShade` interface's `open` method, as shown below: ```java public static class ConfigShadeWithProps implements ConfigShade { private String suffix; private String identifier = "withProps"; @Override public void open(Map props) { this.suffix = String.valueOf(props.get("suffix")); } } ``` 3.
Use the parameters passed in the open method in the encryption and decryption methods, as shown below: ```java public String encrypt(String content) { return content + suffix; } public String decrypt(String content) { return content.substring(0, content.length() - suffix.length()); } ``` ================================================ FILE: docs/en/introduction/configuration/metalake.md ================================================ # METALAKE Since Seatunnel requires database usernames, passwords, and other sensitive information to be written in plaintext within scripts when executing tasks, this may lead to information leakage and is also difficult to maintain. When data source information changes, manual modifications are often required. To address this, Metalake is introduced. Data source information can be stored in Metalake systems such as Apache Gravitino. Task scripts then use `sourceId` and placeholders instead of actual usernames and passwords. At runtime, the Seatunnel engine retrieves the information from Metalake via HTTP requests and replaces the placeholders accordingly. To enable Metalake, you first need to modify the environment variables in **seatunnel-env.sh**: * `METALAKE_ENABLED` * `METALAKE_TYPE` * `METALAKE_URL` Set `METALAKE_ENABLED` to `true`. Currently, `METALAKE_TYPE` only supports `gravitino`. 
For Apache Gravitino, set `METALAKE_URL` to: ``` http://host:port/api/metalakes/your_metalake_name/catalogs/ ``` --- ## Usage Example First, create a catalog in Gravitino, for example: ```bash curl -L 'http://localhost:8090/api/metalakes/test_metalake/catalogs' \ -H 'Content-Type: application/json' \ -H 'Accept: application/vnd.gravitino.v1+json' \ -d '{ "name": "test_catalog", "type": "relational", "provider": "jdbc-mysql", "comment": "for metalake test", "properties": { "jdbc-driver": "com.mysql.cj.jdbc.Driver", "jdbc-url": "not used", "jdbc-user": "root", "jdbc-password": "Abc!@#135_seatunnel" } }' ``` This creates a `test_catalog` under `test_metalake` (note: `metalake` itself must be created in advance). Thus, `METALAKE_URL` can be set to: ``` http://localhost:8090/api/metalakes/test_metalake/catalogs/ ``` You can then define the source as: ```hocon source { Jdbc { url = "jdbc:mysql://mysql-e2e:3306/seatunnel?useSSL=false&serverTimezone=UTC&allowPublicKeyRetrieval=true" driver = "${jdbc-driver}" connection_check_timeout_sec = 100 sourceId = "test_catalog" user = "${jdbc-user}" password = "${jdbc-password}" query = "select * from source" } } ``` Here, `sourceId` refers to the catalog name, allowing other fields to use `${}` placeholders. At runtime, they will be automatically replaced. Note that in sinks, the same `sourceId` name is used, and placeholders must always start with `${` and end with `}`. Each item can contain at most one placeholder, and there can be content outside the placeholder as well. ================================================ FILE: docs/en/introduction/configuration/schema-evolution.md ================================================ # Schema evolution Schema Evolution means that the schema of a data table can be changed and the data synchronization task can automatically adapt to the changes of the new table structure without any other operations. 
## Supported engines - Zeta ## Supported schema change event types - `ADD COLUMN` - `DROP COLUMN` - `RENAME COLUMN` - `MODIFY COLUMN` ## Supported connectors ### Source [Mysql-CDC](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/source/MySQL-CDC.md) [Oracle-CDC](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/source/Oracle-CDC.md) ### Sink [Jdbc-Mysql](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/sink/Jdbc.md) [Jdbc-Oracle](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/sink/Jdbc.md) [Jdbc-Postgres](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/sink/Jdbc.md) [Jdbc-Dameng](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/sink/Jdbc.md) [Jdbc-SqlServer](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/sink/Jdbc.md) [StarRocks](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/sink/StarRocks.md) [Doris](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/sink/Doris.md) [Paimon](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/sink/Paimon.md#Schema-Evolution) [Elasticsearch](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/sink/Elasticsearch.md#Schema-Evolution) Note: * Schema evolution does not support transforms at present. Schema evolution between different types of databases (Oracle-CDC -> Jdbc-Mysql) currently does not support the default value of the column in DDL. * When you use Oracle-CDC, you cannot use a user named `SYS` or `SYSTEM` to modify the table schema; otherwise the DDL event will be filtered out, which causes schema evolution to stop working. In addition, a table whose name starts with `ORA_TEMP_` has the same problem. * Earlier versions of `Dameng` databases do not support the change of `Varchar` type fields to `Text` type fields. ## Enable schema evolution Schema evolution is disabled by default in CDC source.
You need configure `schema-changes.enabled = true` which is only supported in CDC to enable it. ## Examples ### Mysql-CDC -> Jdbc-Mysql ``` env { # You can set engine configuration here parallelism = 5 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { jdbc { url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" driver = "com.mysql.cj.jdbc.Driver" user = "st_user_sink" password = "mysqlpw" generate_sink_sql = true database = shop table = mysql_cdc_e2e_sink_table_with_schema_change_exactly_once primary_keys = ["id"] is_exactly_once = true xa_data_source_class_name = "com.mysql.cj.jdbc.MysqlXADataSource" } } ``` ### Oracle-cdc -> Jdbc-Oracle ``` env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** Oracle-CDC { plugin_output = "customers" username = "dbzuser" password = "dbz" database-names = ["ORCLCDB"] schema-names = ["DEBEZIUM"] table-names = ["ORCLCDB.DEBEZIUM.FULL_TYPES"] url = "jdbc:oracle:thin:@oracle-host:1521/ORCLCDB" source.reader.close.timeout = 120000 connection.pool.size = 1 schema-changes.enabled = true } } sink { Jdbc { plugin_input = "customers" driver = "oracle.jdbc.driver.OracleDriver" url = "jdbc:oracle:thin:@oracle-host:1521/ORCLCDB" user = "dbzuser" password = "dbz" generate_sink_sql = true database = "ORCLCDB" table = "DEBEZIUM.FULL_TYPES_SINK" batch_size = 1 primary_keys = ["ID"] connection.pool.size = 1 } } ``` ### Oracle-cdc -> Jdbc-Mysql ``` env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { # This is a example source plugin **only for test and 
demonstrate the feature source plugin** Oracle-CDC { plugin_output = "customers" username = "dbzuser" password = "dbz" database-names = ["ORCLCDB"] schema-names = ["DEBEZIUM"] table-names = ["ORCLCDB.DEBEZIUM.FULL_TYPES"] url = "jdbc:oracle:thin:@oracle-host:1521/ORCLCDB" source.reader.close.timeout = 120000 connection.pool.size = 1 schema-changes.enabled = true } } sink { jdbc { plugin_input = "customers" url = "jdbc:mysql://oracle-host:3306/oracle_sink" driver = "com.mysql.cj.jdbc.Driver" user = "st_user_sink" password = "mysqlpw" generate_sink_sql = true # You need to configure both database and table database = oracle_sink table = oracle_cdc_2_mysql_sink_table primary_keys = ["ID"] } } ``` ### Mysql-cdc -> StarRocks ``` env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MySQL-CDC { username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { StarRocks { nodeUrls = ["starrocks_cdc_e2e:8030"] username = "root" password = "" database = "shop" table = "${table_name}" base-url = "jdbc:mysql://starrocks_cdc_e2e:9030/shop" max_retries = 3 enable_upsert_delete = true schema_save_mode="RECREATE_SCHEMA" data_save_mode="DROP_DATA" save_mode_create_template = """ CREATE TABLE IF NOT EXISTS shop.`${table_name}` ( ${rowtype_primary_key}, ${rowtype_fields} ) ENGINE=OLAP PRIMARY KEY (${rowtype_primary_key}) DISTRIBUTED BY HASH (${rowtype_primary_key}) PROPERTIES ( "replication_num" = "1", "in_memory" = "false", "enable_persistent_index" = "true", "replicated_storage" = "true", "compression" = "LZ4" ) """ } } ``` ### Mysql-CDC -> Doris ``` env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = 
"jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { Doris { fenodes = "doris_e2e:8030" username = "root" password = "" database = "shop" table = "products" sink.label-prefix = "test-cdc" sink.enable-2pc = "true" sink.enable-delete = "true" doris.config { format = "json" read_json_by_line = "true" } } } ``` ### Mysql-CDC -> Jdbc-Postgres ```hocon env { # You can set engine configuration here parallelism = 5 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { jdbc { url = "jdbc:postgresql://postgresql:5432/shop" driver = "org.postgresql.Driver" user = "postgres" password = "postgres" generate_sink_sql = true database = shop table = "public.sink_table_with_schema_change" primary_keys = ["id"] # Validate ddl update for sink writer multi replica multi_table_sink_replica = 2 } } ``` ### Mysql-CDC -> Jdbc-Dameng ```hocon env { # You can set engine configuration here parallelism = 5 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { jdbc { url = "jdbc:dm://e2e_dmdb:5236" driver = "dm.jdbc.driver.DmDriver" connection_check_timeout_sec = 1000 user = "SYSDBA" password = "SYSDBA" generate_sink_sql = true database = "DAMENG" table = "SYSDBA.sink_table_with_schema_change" primary_keys = ["id"] # Validate ddl update for sink writer multi replica multi_table_sink_replica = 2 } } ``` ### Mysql-CDC -> Jdbc-SqlServer ```hocon env { # You can set engine configuration here parallelism = 5 job.mode = "STREAMING" 
checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { jdbc { url = "jdbc:sqlserver://e2e_sqlserver:1433" driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver" user = "sa" password = "paanssy1234$" generate_sink_sql = true database = master table = "dbo.sink_table_with_schema_change" primary_keys = ["id"] # Validate ddl update for sink writer multi replica multi_table_sink_replica = 2 } } ``` ================================================ FILE: docs/en/introduction/configuration/sink-options-placeholders.md ================================================ # Sink Options Placeholders ## Introduction The SeaTunnel provides a sink options placeholders feature that allows you to get upstream table metadata through placeholders. This functionality is essential when you need to dynamically get upstream table metadata (such as multi-table writes). This document will guide you through the usage of these placeholders and how to leverage them effectively. ## Support Those Engines > SeaTunnel Zeta
    > Flink
    > Spark
    ## Placeholder The placeholders are mainly controlled by the following expressions: - `${database_name}` - Used to get the database in the upstream catalog table - Default values can also be specified via expressions:`${database_name:default_my_db}` - `${schema_name}` - Used to get the schema in the upstream catalog table - Default values can also be specified via expressions:`${schema_name:default_my_schema}` - `${table_name}` - Used to get the table in the upstream catalog table - Default values can also be specified via expressions:`${table_name:default_my_table}` - `${schema_full_name}` - Used to get the schema full path(database & schema) in the upstream catalog table - `${table_full_name}` - Used to get the table full path(database & schema & table) in the upstream catalog table - `${primary_key}` - Used to get the table primary-key fields in the upstream catalog table - `${unique_key}` - Used to get the table unique-key fields in the upstream catalog table - `${field_names}` - Used to get the table field keys in the upstream catalog table - `${comment}` - Used to get the table comment in the upstream catalog table - `${partition_keys}` - Used to get the table partition keys in the upstream catalog table ## Configuration *Requires*: - Make sure the sink connector you are using has implemented `TableSinkFactory` API ### Example 1 ```hocon env { // ignore... } source { MySQL-CDC { // ignore... } } transform { // ignore... } sink { jdbc { url = "jdbc:mysql://localhost:3306" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "123456" database = "${database_name}_test" table = "${table_name}_test" primary_keys = ["${primary_key}"] } } ``` ### Example 2 ```hocon env { // ignore... } source { Oracle-CDC { // ignore... } } transform { // ignore... 
} sink { jdbc { url = "jdbc:mysql://localhost:3306" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "123456" database = "${schema_name}_test" table = "${table_name}_test" primary_keys = ["${primary_key}"] } } ``` We will complete the placeholder replacement before the connector is started, ensuring that the sink options is ready before use. If the variable is not replaced, it may be that the upstream table metadata is missing this option, for example: - `mysql` source not contain `${schema_name}` - `oracle` source not contain `${database_name}` - ... ================================================ FILE: docs/en/introduction/configuration/speed-limit.md ================================================ # Speed Control ## Introduction The SeaTunnel provides a powerful speed control feature that allows you to manage the rate at which data is synchronized. This functionality is essential when you need to ensure efficient and controlled data transfer between systems. The speed control is primarily governed by two key parameters: `read_limit.rows_per_second` and `read_limit.bytes_per_second`. This document will guide you through the usage of these parameters and how to leverage them effectively. ## Support Those Engines > SeaTunnel Zeta
    > Flink
    > Spark
    ## Configuration To use the speed control feature, you need to configure the `read_limit.rows_per_second` or `read_limit.bytes_per_second` parameters in your job config. Example env config in your config file: ```hocon env { job.mode=STREAMING job.name=SeaTunnel_Job read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { // ignore... } } transform { } sink { Console { } } ``` We have placed `read_limit.bytes_per_second` and `read_limit.rows_per_second` in the `env` parameters to finish the speed control configuration. You can configure both of these parameters simultaneously or choose to configure only one of them. The value of each `value` represents the maximum rate at which each thread is restricted. Therefore, when configuring the respective values, please take into account the parallelism of your tasks. ================================================ FILE: docs/en/introduction/configuration/sql-config.md ================================================ # SQL Configuration File Before writing the sql config file, please make sure that the name of the config file should end with `.sql`. 
## Structure of SQL Configuration File The `SQL` configuration file appears as follows: ### SQL ```sql /* config env { parallelism = 1 job.mode = "BATCH" } */ CREATE TABLE source_table WITH ( 'connector'='jdbc', 'type'='source', 'url' = 'jdbc:mysql://localhost:3306/seatunnel', 'driver' = 'com.mysql.cj.jdbc.Driver', 'user' = 'root', 'password' = '123456', 'query' = 'select * from source', 'properties'= '{ useSSL = false, rewriteBatchedStatements = true }' ); CREATE TABLE sink_table WITH ( 'connector'='jdbc', 'type'='sink', 'url' = 'jdbc:mysql://localhost:3306/seatunnel', 'driver' = 'com.mysql.cj.jdbc.Driver', 'user' = 'root', 'password' = '123456', 'generate_sink_sql' = 'true', 'database' = 'seatunnel', 'table' = 'sink' ); INSERT INTO sink_table SELECT id, name, age, email FROM source_table; ``` ## Explanation of `SQL` Configuration File ### General Configuration in SQL File ```sql /* config env { parallelism = 1 job.mode = "BATCH" } */ ``` In the `SQL` file, common configuration sections are defined using `/* config */` comments. Inside, common configurations like `env` can be defined using `HOCON` format. ### SOURCE SQL Syntax ```sql CREATE TABLE source_table WITH ( 'connector'='jdbc', 'type'='source', 'url' = 'jdbc:mysql://localhost:3306/seatunnel', 'driver' = 'com.mysql.cj.jdbc.Driver', 'user' = 'root', 'password' = '123456', 'query' = 'select * from source', 'properties' = '{ useSSL = false, rewriteBatchedStatements = true }' ); ``` * Using `CREATE TABLE ... WITH (...)` syntax creates a mapping for the source table. The `TABLE` name is the name of the source-mapped table, and the `WITH` syntax contains source-related configuration parameters. * There are two fixed parameters in the WITH syntax: `connector` and `type`, representing connector plugin name (such as `jdbc`, `FakeSource`, etc.) and source type (fixed as `source`), respectively. 
* Other parameter names can reference relevant configuration parameters of the corresponding connector plugin, but the format needs to be changed to `'key' = 'value',`. * If `'value'` is a sub-configuration, you can directly use a string in `HOCON` format. Note: if using a sub-configuration in `HOCON` format, the internal property items must be separated by `,`, like this: ```sql 'properties' = '{ useSSL = false, rewriteBatchedStatements = true }' ``` * If using `'` within `'value'`, it needs to be escaped with `''`, like this: ```sql 'query' = 'select * from source where name = ''Joy Ding''' ``` ### SINK SQL Syntax ```sql CREATE TABLE sink_table WITH ( 'connector'='jdbc', 'type'='sink', 'url' = 'jdbc:mysql://localhost:3306/seatunnel', 'driver' = 'com.mysql.cj.jdbc.Driver', 'user' = 'root', 'password' = '123456', 'generate_sink_sql' = 'true', 'database' = 'seatunnel', 'table' = 'sink' ); ``` * Using `CREATE TABLE ... WITH (...)` syntax creates a mapping for the target table. The `TABLE` name is the name of the target-mapped table, and the `WITH` syntax contains sink-related configuration parameters. * There are two fixed parameters in the `WITH` syntax: `connector` and `type`, representing connector plugin name (such as `jdbc`, `console`, etc.) and target type (fixed as `sink`), respectively. * Other parameter names can reference relevant configuration parameters of the corresponding connector plugin, but the format needs to be changed to `'key' = 'value',`. ### INSERT INTO SELECT Syntax ```sql INSERT INTO sink_table SELECT id, name, age, email FROM source_table; ``` * The `SELECT FROM` part is the table name of the source-mapped table. If a selected field name is a keyword ([reference](https://github.com/JSQLParser/JSqlParser/blob/master/src/main/jjtree/net/sf/jsqlparser/parser/JSqlParserCC.jjt)), you should quote it with backticks, like \`fieldName\`.
```sql INSERT INTO sink_table SELECT id, name, age, email,`output` FROM source_table; ``` * The `INSERT INTO` part is the table name of the target-mapped table. * Note: This syntax does **not support** specifying fields in `INSERT`, like this: `INSERT INTO sink_table (id, name, age, email) SELECT id, name, age, email FROM source_table;` ### INSERT INTO SELECT TABLE Syntax ```sql INSERT INTO sink_table SELECT source_table; ``` * The `SELECT` part directly uses the name of the source-mapped table, indicating that all data from the source table will be inserted into the target table. * Using this syntax does not generate related `transform` configurations. This syntax is generally used in multi-table synchronization scenarios. For example: ```sql CREATE TABLE source_table WITH ( 'connector'='jdbc', 'type' = 'source', 'url' = 'jdbc:mysql://127.0.0.1:3306/seatunnel', 'driver' = 'com.mysql.cj.jdbc.Driver', 'user' = 'root', 'password' = '123456', 'table_list' = '[ { table_path = "source.table1" }, { table_path = "source.table2", query = "select * from source.table2" } ]' ); CREATE TABLE sink_table WITH ( 'connector'='jdbc', 'type' = 'sink', 'url' = 'jdbc:mysql://127.0.0.1:3306/seatunnel', 'driver' = 'com.mysql.cj.jdbc.Driver', 'user' = 'root', 'password' = '123456', 'generate_sink_sql' = 'true', 'database' = 'sink' ); INSERT INTO sink_table SELECT source_table; ``` ### CREATE TABLE AS Syntax ```sql CREATE TABLE temp1 AS SELECT id, name, age, email FROM source_table; ``` * This syntax creates a temporary table with the result of a `SELECT` query, used for `INSERT INTO` operations. 
* The syntax of the `SELECT` part refers to: [SQL Transform](../../transforms/sql.md) `query` configuration item ```sql CREATE TABLE temp1 AS SELECT id, name, age, email FROM source_table; INSERT INTO sink_table SELECT * FROM temp1; ``` ## Example of SQL Configuration File Submission ```bash ./bin/seatunnel.sh --config ./config/sample.sql ``` ================================================ FILE: docs/en/introduction/how-it-works.md ================================================ --- sidebar_position: 2 --- # How it works ## Overview SeaTunnel is a distributed multimodal data integration tool with a pluggable architecture. It decouples the connector layer from the execution engine, allowing the same connectors to run on different engines. ``` ┌─────────────────────────────────────────────────────────────┐ │ Job Configuration │ │ (HOCON / SQL / Web UI) │ └─────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────┐ │ SeaTunnel Core │ │ (Job Parser, Coordinator, Scheduler) │ └─────────────────────────────────────────────────────────────┘ │ ┌─────────────────────┼─────────────────────┐ ▼ ▼ ▼ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ Source │────▶│ Transform │────▶│ Sink │ │ Connectors │ │ (Optional) │ │ Connectors │ └───────────────┘ └───────────────┘ └───────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────┐ │ Execution Engine │ │ SeaTunnel Engine (Zeta) / Flink / Spark │ └─────────────────────────────────────────────────────────────┘ ``` ## Core Components ### 1. Connector API Engine-independent API for developing Source, Transform, and Sink connectors. | Component | Description | |-----------|-------------| | **Source** | Reads data from external systems (databases, files, message queues) | | **Transform** | Performs data transformations (field mapping, filtering, type conversion) | | **Sink** | Writes data to target systems | ### 2. 
Execution Engines | Engine | Best For | |--------|----------| | **SeaTunnel Engine (Zeta)** | Data synchronization, CDC, low resource usage | | **Apache Flink** | Complex stream processing, existing Flink infrastructure | | **Apache Spark** | Large-scale batch processing, existing Spark infrastructure | ### 3. Translation Layer Translates SeaTunnel's unified API to engine-specific implementations, enabling connector reuse across engines. ## Data Flow ``` Source ──▶ [Split] ──▶ Reader ──▶ Transform ──▶ Writer ──▶ Sink │ │ │ │ ▼ │ │ Checkpoint/State │ │ │ │ └───────────────────────┴────────────────────────┘ Fault Tolerance ``` **Key Features:** - Parallel reading with split-based distribution - Exactly-once semantics via distributed snapshots - Automatic failover and recovery ## Module Structure ``` seatunnel/ ├── seatunnel-api/ # Core API definitions ├── seatunnel-connectors-v2/ # Source & Sink connectors ├── seatunnel-transforms-v2/ # Transform plugins ├── seatunnel-engine/ # SeaTunnel Engine (Zeta) ├── seatunnel-translation/ # Engine adapters (Flink/Spark) ├── seatunnel-core/ # Job submission & CLI ├── seatunnel-formats/ # Data format handlers └── seatunnel-e2e/ # End-to-end tests ``` ## Job Execution Flow 1. **Parse** - Read and validate job configuration 2. **Plan** - Generate execution plan with parallelism 3. **Schedule** - Distribute tasks to workers 4. **Execute** - Run Source → Transform → Sink pipeline 5. 
**Monitor** - Track progress, metrics, and checkpoints ## Next Steps - [Engine Comparison](../engines/overview.md) - [Quick Start](../getting-started/locally/quick-start-seatunnel-engine.md) - [Connector List](../connectors/overview.md) ================================================ FILE: docs/en/tools/overview.md ================================================ --- sidebar_position: 1 --- # SeaTunnel Tools Overview Apache SeaTunnel Tools is a collection of auxiliary tools focused on developer and operator productivity, covering LLM integration, configuration conversion, and AI-powered assistance. ## Available Tools | Tool | Purpose | Status | |------|---------|--------| | [SeaTunnel Skill](seatunnel-skill) | Claude AI integration for SeaTunnel operations | Available | | [SeaTunnel MCP Server](seatunnel-mcp) | Model Context Protocol server for LLM integration | Available | | [x2seatunnel](x2seatunnel) | Configuration converter (DataX → SeaTunnel) | Available | ## Source Repository All tools are maintained in the [SeaTunnel Tools](https://github.com/apache/seatunnel-tools) repository. ================================================ FILE: docs/en/tools/seatunnel-mcp.md ================================================ --- sidebar_position: 3 --- # SeaTunnel MCP Server SeaTunnel MCP Server implements the [Model Context Protocol](https://modelcontextprotocol.io/) to enable LLM systems to interact with SeaTunnel resources. ## Overview The MCP server exposes SeaTunnel documentation, connector metadata, and job management capabilities as MCP resources and tools, allowing any MCP-compatible LLM client to assist with SeaTunnel operations. ## Getting Started Refer to the [SeaTunnel Tools repository](https://github.com/apache/seatunnel-tools/tree/main/seatunnel-mcp) for installation and configuration instructions. 
================================================ FILE: docs/en/tools/seatunnel-skill.md ================================================ --- sidebar_position: 2 --- # SeaTunnel Skill SeaTunnel Skill is a Claude Code AI integration that provides instant assistance for SeaTunnel operations, configuration, and troubleshooting. ## Features - **AI-Powered Assistant**: Get instant help with SeaTunnel concepts and configurations - **Knowledge Integration**: Query official documentation and best practices - **Smart Debugging**: Analyze errors and suggest fixes - **Code Examples**: Generate configuration examples for your use case ## Installation ```bash # Clone the repository git clone https://github.com/apache/seatunnel-tools.git cd seatunnel-tools # Copy the skill to Claude Code skills directory cp -r seatunnel-skill ~/.claude/skills/ ``` ## Usage After installation, use the skill in Claude Code: ```bash # Query SeaTunnel documentation /seatunnel-skill "How do I configure a MySQL to PostgreSQL job?" # Get connector information /seatunnel-skill "List all available Kafka connector options" # Debug configuration issues /seatunnel-skill "Why is my job failing with OutOfMemoryError?" # Generate configuration examples /seatunnel-skill "Create a MySQL to Elasticsearch job config" ``` ## Requirements - [Claude Code](https://claude.ai/code) installed - Claude Code skills directory at `~/.claude/skills/` ================================================ FILE: docs/en/tools/x2seatunnel.md ================================================ --- sidebar_position: 4 --- # x2seatunnel x2seatunnel is a configuration converter that transforms DataX and other data integration tool configurations into SeaTunnel format. 
## Supported Conversions | Source Format | Target Format | |--------------|---------------| | DataX JSON | SeaTunnel HOCON | ## Getting Started Refer to the [x2seatunnel repository](https://github.com/apache/seatunnel-tools/tree/main/x2seatunnel) for installation and usage instructions. ================================================ FILE: docs/en/transforms/common-options/common-options.md ================================================ --- sidebar_position: 1 --- # Transform Common Options > This is a process of intermediate conversion between the source and sink terminals,You can use sql statements to smoothly complete the conversion process :::caution warn The old configuration name `source_table_name`/`result_table_name` is deprecated, please migrate to the new name `plugin_input`/`plugin_output` as soon as possible. ::: | Name | Type | Required | Default | Description | |---------------|--------|----------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | plugin_output | String | No | - | When `plugin_input` is not specified, the current plugin processes the data set `(dataset)` output by the previous plugin in the configuration file;
    When `plugin_input` is specified, the current plugin is processing the data set corresponding to this parameter. | | plugin_input | String | No | - | When `plugin_output` is not specified, the data processed by this plugin will not be registered as a data set that can be directly accessed by other plugins, or called a temporary table `(table)`;
    When `plugin_output` is specified, the data processed by this plugin will be registered as a data set `(dataset)` that can be directly accessed by other plugins, or called a temporary table `(table)`. The dataset registered here can be directly accessed by other plugins by specifying `plugin_input`. | ## Task Example ### Simple > This example reads data from a FakeSource, transforms it with the `Sql` transform, and writes both the original and the transformed data sets to two different sinks. For details, refer to the `transform` documentation. ```bash env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { id = "int" name = "string" age = "int" c_timestamp = "timestamp" c_date = "date" c_map = "map<string, string>" c_array = "array<int>" c_decimal = "decimal(30, 8)" c_row = { c_row = { c_int = int } } } } } } transform { Sql { plugin_input = "fake" plugin_output = "fake1" # the query table name must be the same as the 'plugin_input' field query = "select id, regexp_replace(name, '.+', 'b') as name, age+1 as age, pi() as pi, c_timestamp, c_date, c_map, c_array, c_decimal, c_row from dual" } # The SQL transform supports basic functions and criteria operations # But complex SQL is not supported yet, including: multi source table/rows JOIN, AGGREGATE operations and the like } sink { Console { plugin_input = "fake1" } Console { plugin_input = "fake" } } ``` ================================================ FILE: docs/en/transforms/copy.md ================================================ # Copy > Copy transform plugin ## Description Copy a field to a new field.
## Options | name | type | required | default value | |--------|--------|----------|---------------| | fields | Object | yes | | ### fields [config] Specify the field copy relationship between input and output ### common options [string] Transform plugin common parameters, please refer to [Transform Plugin](common-options/common-options.md) for details ## Example The data read from source is a table like this: | name | age | card | |----------|-----|------| | Joy Ding | 20 | 123 | | May Ding | 20 | 123 | | Kin Dom | 20 | 123 | | Joy Dom | 20 | 123 | We want to copy the fields `name` and `age` to the new fields `name1`, `name2`, and `age1`. We can add a `Copy` transform like this: ``` transform { Copy { plugin_input = "fake" plugin_output = "fake1" fields { name1 = name name2 = name age1 = age } } } ``` Then the data in the result table `fake1` will look like this: | name | age | card | name1 | name2 | age1 | |----------|-----|------|----------|----------|------| | Joy Ding | 20 | 123 | Joy Ding | Joy Ding | 20 | | May Ding | 20 | 123 | May Ding | May Ding | 20 | | Kin Dom | 20 | 123 | Kin Dom | Kin Dom | 20 | | Joy Dom | 20 | 123 | Joy Dom | Joy Dom | 20 | ## Changelog ### new version - Add Copy Transform Connector - Support copying fields to new fields ================================================ FILE: docs/en/transforms/data-validator.md ================================================ # DataValidator > Data validation transform plugin ## Description The DataValidator transform validates field values according to configured rules and handles validation failures based on the specified error handling strategy. It supports multiple validation rule types including null checks, range validation, length validation, and regex pattern matching.
## Options | name | type | required | default value | |-----------------|--------|----------|---------------| | row_error_handle_way| enum | no | FAIL | | row_error_handle_way.error_table | string | no | | | field_rules | array | yes | | ### row_error_handle_way [enum] Error handling strategy when validation fails: - `FAIL`: Fail the entire task when validation errors occur - `SKIP`: Skip invalid rows and continue processing - `ROUTE_TO_TABLE`: Route invalid data to a specified error table **Note**: `ROUTE_TO_TABLE` mode only works with sinks that support multiple tables. The sink must be capable of handling data routed to different table destinations. ### row_error_handle_way.error_table [string] Target table name for routing invalid data when `row_error_handle_way` is set to `ROUTE_TO_TABLE`. This parameter is required when using `ROUTE_TO_TABLE` mode. #### Error Table Schema When using `ROUTE_TO_TABLE` mode, DataValidator automatically creates an error table with a fixed schema to store validation failure data. 
The error table contains the following fields: | Field Name | Data Type | Description | |------------|-----------|-------------| | source_table_id | STRING | Source table identifier that identifies the originating table | | source_table_path | STRING | Source table path with complete table path information | | original_data | STRING | JSON representation of the original data containing the complete row that failed validation | | validation_errors | STRING | JSON array of validation error details containing all failed fields and error information | | create_time | TIMESTAMP | Creation time of the validation error | **Complete Error Table Record Example**: ```json { "source_table_id": "users_table", "source_table_path": "database.users", "original_data": "{\"id\": 123, \"name\": null, \"age\": 200, \"email\": \"invalid-email\"}", "validation_errors": "[{\"field_name\": \"name\", \"error_message\": \"Field 'name' cannot be null\"}, {\"field_name\": \"age\", \"error_message\": \"Field 'age' value 200 is not within range [0, 150]\"}, {\"field_name\": \"email\", \"error_message\": \"Field 'email' does not match pattern '^[\\\\w-\\\\.]+@([\\\\w-]+\\\\.)+[\\\\w-]{2,4}$'\"}]", "create_time": "2024-01-15T10:30:45" } ``` **Data Routing Mechanism**: - Data that passes validation maintains the original schema and is routed to the main output table - Data that fails validation is converted to the error table schema format above and routed to the specified error table - Each validation failure row generates one record in the error table, containing complete original data and detailed error information ### field_rules [array] Array of field validation rules. Each rule defines validation criteria for a specific field. 
#### Field Rule Structure Each field rule contains: - `field_name`: Name of the field to validate - `rules`: Array of validation rules to apply (nested format), or individual rule properties (flat format) #### Validation Rule Types ##### NOT_NULL Validates that a field value is not null. Parameters: - `rule_type`: "NOT_NULL" - `custom_message` (optional): Custom error message ##### RANGE Validates that a numeric value is within a specified range. Parameters: - `rule_type`: "RANGE" - `min_value` (optional): Minimum allowed value - `max_value` (optional): Maximum allowed value - `min_inclusive` (optional): Whether minimum value is inclusive (default: true) - `max_inclusive` (optional): Whether maximum value is inclusive (default: true) - `custom_message` (optional): Custom error message ##### LENGTH Validates the length of string, array, or collection values. Parameters: - `rule_type`: "LENGTH" - `min_length` (optional): Minimum allowed length - `max_length` (optional): Maximum allowed length - `exact_length` (optional): Exact required length - `custom_message` (optional): Custom error message ##### REGEX Validates that a string value matches a regular expression pattern. Parameters: - `rule_type`: "REGEX" - `pattern`: Regular expression pattern (required) - `case_sensitive` (optional): Whether pattern matching is case sensitive (default: true) - `custom_message` (optional): Custom error message ##### UDF (User Defined Function) Validates field values using custom business logic implemented as a User Defined Function. Parameters: - `rule_type`: "UDF" - `function_name`: Name of the UDF function to execute (required) - `custom_message` (optional): Custom error message **Built-in UDF Functions:** - `EMAIL`: Validates email addresses using practical validation rules based on OWASP recommendations **Creating Custom UDF Functions:** To create a custom UDF function: 1. Implement the `DataValidatorUDF` interface 2. Use `@AutoService(DataValidatorUDF.class)` annotation 3. 
Provide a unique `functionName()` 4. Implement the `validate()` method with your custom logic ### common options [string] Transform plugin common parameters, please refer to [Transform Plugin](common-options/common-options.md) for details ## Examples ### Example 1: Basic Validation with FAIL Mode ```hocon transform { DataValidator { plugin_input = "source_table" plugin_output = "validated_table" row_error_handle_way = "FAIL" field_rules = [ { field_name = "name" rule_type = "NOT_NULL" }, { field_name = "age" rule_type = "RANGE" min_value = 0 max_value = 150 }, { field_name = "email" rule_type = "REGEX" pattern = "^[\\w-\\.]+@([\\w-]+\\.)+[\\w-]{2,4}$" } ] } } ``` ### Example 2: Validation with SKIP Mode ```hocon transform { DataValidator { plugin_input = "source_table" plugin_output = "validated_table" row_error_handle_way = "SKIP" field_rules = [ { field_name = "name" rule_type = "NOT_NULL" }, { field_name = "name" rule_type = "LENGTH" min_length = 2 max_length = 50 } ] } } ``` ### Example 3: Validation with ROUTE_TO_TABLE Mode ```hocon transform { DataValidator { plugin_input = "source_table" plugin_output = "validated_table" row_error_handle_way = "ROUTE_TO_TABLE" row_error_handle_way.error_table = "error_data" field_rules = [ { field_name = "name" rule_type = "NOT_NULL" }, { field_name = "age" rule_type = "RANGE" min_value = 0 max_value = 150 } ] } } ``` **Note**: When using `ROUTE_TO_TABLE`, ensure your sink connector supports multiple tables. Valid data will be sent to the main output table, while invalid data will be routed to the specified error table. In this example: - Data that passes validation will maintain the original schema (containing name, age, etc. 
fields) and be sent to the main output table - Data that fails validation will be converted to the error table schema (containing source_table_id, source_table_path, original_data, validation_errors, create_time fields) and routed to the "error_data" table ### Example 4: Nested Rules Format ```hocon transform { DataValidator { plugin_input = "source_table" plugin_output = "validated_table" row_error_handle_way = "FAIL" field_rules = [ { field_name = "name" rules = [ { rule_type = "NOT_NULL" custom_message = "Name is required" }, { rule_type = "LENGTH" min_length = 2 max_length = 50 custom_message = "Name must be between 2 and 50 characters" } ] } ] } } ``` ### Example 5: Email Validation using Built-in UDF ```hocon transform { DataValidator { plugin_input = "source_table" plugin_output = "validated_table" row_error_handle_way = "FAIL" field_rules = [ { field_name = "email" rule_type = "UDF" function_name = "EMAIL" custom_message = "Invalid email address format" } ] } } ``` ## UDF Development Guide ### Creating Custom UDF Functions To create a custom validation UDF function, follow these steps: #### 1. 
Implement the DataValidatorUDF Interface ```java package com.example.validator; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import org.apache.seatunnel.transform.validator.ValidationContext; import org.apache.seatunnel.transform.validator.ValidationResult; import org.apache.seatunnel.transform.validator.udf.DataValidatorUDF; import com.google.auto.service.AutoService; @AutoService(DataValidatorUDF.class) public class PhoneValidator implements DataValidatorUDF { @Override public String functionName() { return "PHONE_VALIDATOR"; } @Override public ValidationResult validate( Object value, SeaTunnelDataType dataType, ValidationContext context) { if (value == null) { return ValidationResult.success(); } String phone = value.toString().trim(); // Custom phone validation logic if (phone.matches("^\\+?[1-9]\\d{1,14}$")) { return ValidationResult.success(); } else { return ValidationResult.failure("Invalid phone number format: " + phone); } } @Override public String getDescription() { return "Validates international phone number format"; } } ``` #### 2. Register the UDF The UDF is automatically registered using the `@AutoService(DataValidatorUDF.class)` annotation. This uses Java's ServiceLoader mechanism to discover and load UDF implementations at runtime. #### 3. Package and Deploy 1. Compile your UDF class and package it into a JAR file 2. Place the JAR file in the SeaTunnel classpath 3. The UDF will be automatically discovered and available for use **Usage Example**: ```hocon { field_name = "email" rule_type = "UDF" function_name = "EMAIL" custom_message = "Please provide a valid email address" } ``` ================================================ FILE: docs/en/transforms/define-sink-type.md ================================================ # Define Sink Type > Define sink type transform plugin ## Description Used to define the storage type of sink field. This is effective when the savemode enables automatic table creation. 
## Options | name | type | required | default value | Description | |:-------:|---------------------------|----------|---------------|------------------------------------------------------------------------| | columns | `list<map<string, string>>` | yes | | The columns to be defined, the name and type of the column must be set | ## Examples ### Define sink columns type for savemode ``` transform { DefineSinkType { columns = [ { column = "c1" type = "nvarchar2(10)" } { column = "c2" type = "datetime(6)" } { column = "c3" type = "your target type" } ] } } ``` ================================================ FILE: docs/en/transforms/dynamic-compile.md ================================================ # DynamicCompile > DynamicCompile transform plugin ## Description :::tip important clause You need to ensure the security of your service and prevent attackers from uploading destructive code ::: Provide a programmable way to process rows, allowing users to customize any business behavior, even RPC requests based on existing row fields as parameters, or to expand fields by retrieving associated data from other data sources. To separate different business logic, you can also define multiple transforms and combine them. If the conversion is too complex, it may affect performance. ## Options | name | type | required | default value | |------------------|--------|----------|---------------| | source_code | string | no | | | compile_language | Enum | yes | | | compile_pattern | Enum | no | SOURCE_CODE | | absolute_path | string | no | | ### common options [string] Transform plugin common parameters, please refer to [Transform Plugin](common-options/common-options.md) for details ### compile_language [Enum] Some syntax in Java may not be supported, please refer to https://github.com/janino-compiler/janino GROOVY, JAVA, SCALA (Only Support Zeta) **Note**: SCALA support uses the Scala REPL for dynamic compilation and requires proper Scala syntax.
### compile_pattern [Enum] SOURCE_CODE, ABSOLUTE_PATH If `compile_pattern` is `SOURCE_CODE`, the `source_code` option is required; if it is `ABSOLUTE_PATH`, the `absolute_path` option is required. ### absolute_path [string] The absolute path of Java or Groovy files on the server ### source_code [string] The source code. #### Details about the source code In the source code, you must implement two methods: - `Column[] getInlineOutputColumns(CatalogTable inputCatalogTable)` - `Object[] getInlineOutputFieldValues(SeaTunnelRowAccessor inputRow)` The `getInlineOutputColumns` method takes a `CatalogTable` as its input parameter and returns `Column[]`. You can get the current table's schema from `CatalogTable`. If a returned column already exists in the current schema, it will be overwritten by the returned value (field type, comment, ...); if it is a new column, it will be added to the current schema. The `getInlineOutputFieldValues` method takes a `SeaTunnelRowAccessor` as its input parameter and returns `Object[]`. You can get the record from `SeaTunnelRowAccessor` and apply your own customized data processing logic. The length of the returned `Object[]` array must match the length of the `getInlineOutputColumns` result, and the order must match as well. If there are third-party dependency packages, please place them in ${SEATUNNEL_HOME}/lib. If you use Spark or Flink, you need to put them under the libs of the corresponding service. You need to restart the server to load the lib files.
## Example The data read from source is a table like this: | name | age | card | |----------|-----|------| | Joy Ding | 20 | 123 | | May Ding | 20 | 123 | | Kin Dom | 30 | 123 | | Joy Dom | 30 | 123 | Use this DynamicCompile to add a new column `compile_language`, and update the `age` field by its original value (if age = 20, update to 40) - use groovy ```hacon transform { DynamicCompile { plugin_input = "fake" plugin_output = "groovy_out" compile_language="GROOVY" compile_pattern="SOURCE_CODE" source_code=""" import org.apache.seatunnel.api.table.catalog.Column import org.apache.seatunnel.api.table.type.SeaTunnelRowAccessor import org.apache.seatunnel.api.table.catalog.CatalogTable import org.apache.seatunnel.api.table.catalog.PhysicalColumn; import org.apache.seatunnel.api.table.type.*; import java.util.ArrayList; class demo { public Column[] getInlineOutputColumns(CatalogTable inputCatalogTable) { PhysicalColumn col1 = PhysicalColumn.of( "compile_language", BasicType.STRING_TYPE, 10L, true, "", ""); PhysicalColumn col2 = PhysicalColumn.of( "age", BasicType.INT_TYPE, 0L, false, false, "" ); return new Column[]{ col1, col2 }; } public Object[] getInlineOutputFieldValues(SeaTunnelRowAccessor inputRow) { Object[] fieldValues = new Object[2]; // get age Object ageField = inputRow.getField(1); fieldValues[0] = "GROOVY"; if (Integer.parseInt(ageField.toString()) == 20) { fieldValues[1] = 40; } else { fieldValues[1] = ageField; } return fieldValues; } };""" } } ``` - use java ```hacon transform { DynamicCompile { plugin_input = "fake" plugin_output = "java_out" compile_language="JAVA" compile_pattern="SOURCE_CODE" source_code=""" import org.apache.seatunnel.api.table.catalog.Column; import org.apache.seatunnel.api.table.type.SeaTunnelRowAccessor; import org.apache.seatunnel.api.table.catalog.*; import org.apache.seatunnel.api.table.type.*; import java.util.ArrayList; public Column[] getInlineOutputColumns(CatalogTable inputCatalogTable) { PhysicalColumn col1 = 
PhysicalColumn.of( "compile_language", BasicType.STRING_TYPE, 10L, true, "", ""); PhysicalColumn col2 = PhysicalColumn.of( "age", BasicType.INT_TYPE, 0L, false, false, "" ); return new Column[]{ col1, col2 }; } public Object[] getInlineOutputFieldValues(SeaTunnelRowAccessor inputRow) { Object[] fieldValues = new Object[2]; // get age Object ageField = inputRow.getField(1); fieldValues[0] = "JAVA"; if (Integer.parseInt(ageField.toString()) == 20) { fieldValues[1] = 40; } else { fieldValues[1] = ageField; } return fieldValues; } """ } } ``` - use absolute path to read code ```hacon transform { DynamicCompile { plugin_input = "fake" plugin_output = "groovy_out" compile_language="GROOVY" compile_pattern="ABSOLUTE_PATH" absolute_path="""/tmp/GroovyFile""" } } ``` Then the data in result table `groovy_out` will like this | name | age | card | compile_language | |----------|-----|------|------------------| | Joy Ding | 40 | 123 | GROOVY | | May Ding | 40 | 123 | GROOVY | | Kin Dom | 30 | 123 | GROOVY | | Joy Dom | 30 | 123 | GROOVY | Then the data in result table `java_out` will like this | name | age | card | compile_language | |----------|-----|------|------------------| | Joy Ding | 40 | 123 | JAVA | | May Ding | 40 | 123 | JAVA | | Kin Dom | 30 | 123 | JAVA | | Joy Dom | 30 | 123 | JAVA | - use scala ```hacon transform { DynamicCompile { plugin_input = "fake" plugin_output = "scala_out" compile_language="SCALA" compile_pattern="SOURCE_CODE" source_code=""" import org.apache.seatunnel.api.table.catalog.Column import org.apache.seatunnel.api.table.catalog.CatalogTable import org.apache.seatunnel.api.table.catalog.PhysicalColumn import org.apache.seatunnel.api.table.`type`.SeaTunnelRowAccessor import org.apache.seatunnel.api.table.`type`.BasicType import java.util.ArrayList class ScalaDemo { def getInlineOutputColumns(inputCatalogTable: CatalogTable): Array[Column] = { val columns = new ArrayList[Column]() val destColumn = PhysicalColumn.of( "compile_language", 
BasicType.STRING_TYPE, 10L, true, "", "" ) columns.add(destColumn) columns.toArray(new Array[Column](0)) } def getInlineOutputFieldValues(inputRow: SeaTunnelRowAccessor): Array[Object] = { Array[Object]("SCALA") } } """ } } ``` More complex examples can be referred to https://github.com/apache/seatunnel/tree/dev/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-2/src/test/resources/dynamic_compile/conf ## Changelog ================================================ FILE: docs/en/transforms/embedding.md ================================================ # Embedding > Embedding Transform Plugin ## Description The `Embedding` transform plugin leverages embedding models to convert text and multimodal data into vectorized representations. This transformation can be applied to various fields including text, images, and videos. The plugin supports multiple model providers and can be integrated with different API endpoints. > **Important Note:** The current embedding precision only supports float32 format. ## Options | Name | Type | Required | Default Value | Description | |--------------------------------|--------|----------|---------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | model_provider | enum | yes | - | The model provider for embedding. Options may include `AMAZON`, `QIANFAN`, `OPENAI`, etc. | | api_key | string | yes | - | The API key required to authenticate with the embedding service. | | secret_key | string | yes | - | The secret key required for additional authentication with the embedding service. | | aws_region | string | no | | AWS Region. Required for use Amazon Bedrock model. | | single_vectorized_input_number | int | no | 1 | The number of inputs vectorized in one request. Default is 1. 
| | vectorization_fields | map | yes | - | A mapping between input fields and their corresponding output vector fields. | | model | string | yes | - | The specific model to use for embedding (e.g., `text-embedding-3-small` for OPENAI). | | api_path | string | no | - | The API endpoint for the embedding service. Typically provided by the model provider. | | dimension | int | no | - | The vector dimension defaults to 2048. The Embedding-3 model supports custom vector dimensions, and it is recommended to choose dimensions of 256, 512, 1024, or 2048. | | oauth_path | string | no | - | The API endpoint for the OAuth service. | | custom_config | map | no | | Custom configurations for the model. | | custom_response_parse | string | no | | Specifies how to parse the response from the model using JsonPath. Example: `$.choices[*].message.content`. | | custom_request_headers | map | no | | Custom headers for the request to the model. | | custom_request_body | map | no | | Custom body for the request. Supports placeholders like `${model}`, `${input}`. | ## Precision Support **Important:** The current version of the Embedding plugin only supports **float32** precision for vector data. - All generated embedding vectors will be stored in float32 format - If your model or API returns other precision formats (such as float64), the plugin will automatically convert them to float32 ### model_provider The providers for generating embeddings include common options such as `AMAZON`, `DOUBAO`, `QIANFAN`, and `OPENAI`. Additionally, you can choose `CUSTOM` to implement requests and retrievals for custom embedding models. ### api_key The API key for authenticating requests to the embedding service. This is typically provided by the model provider when you register for their service. ### secret_key The secret key used for additional authentication. Some providers may require this for secure API requests.
### single_vectorized_input_number Specifies how many inputs are processed in a single vectorization request. The default is 1. Adjust based on your processing capacity and the model provider's API limitations. ### vectorization_fields A mapping between input fields and their respective output vector fields. This allows the plugin to understand which fields to vectorize and how to store the resulting vectors. The plugin supports multimodal data by allowing you to specify the modality type for each field. **Basic Text Vectorization:** ```hocon vectorization_fields { book_intro_vector = book_intro author_biography_vector = author_biography } ``` **Multimodal Vectorization:** ```hocon vectorization_fields { # Basic text field text_vector = text_field # Explicit modality type configuration product_image_vector = { field = product_image_url modality = jpeg format = url } # Auto-detect modality type (based on file suffix) thumbnail_vector = { field = thumbnail_image # If value is "image.png", auto-detects as PNG modality format = url } # Video field configuration demo_video_vector = { field = product_video_url modality = mp4 format = url } # Binary data configuration binary_image_vector = { field = image_data modality = jpeg format = binary } } ``` **Field Specification Formats:** **Supported Modality Types:** - **Images:** `jpeg` (jpg, jpeg), `png` (png, apng), `gif`, `webp`, `bmp` (bmp, dib), `tiff` (tiff, tif), `ico`, `icns`, `sgi`, `jpeg2000` (j2c, j2k, jp2, jpc, jpf, jpx) - **Videos:** `mp4`, `avi`, `mov` - **Text:** `text` (default) **Payload Formats:** - `text` - Text format (default) - `url` - URL format - `binary` - Binary data format **Automatic Modality Detection:** When `modality` is not explicitly specified and `format` is not `binary`, the system automatically detects the modality type based on the file suffix of the field value: > **Important:** When using multimodal fields (image or video), ensure your model provider supports multimodal embedding. 
Image and video fields must contain valid URLs or binary data. Currently, `DOUBAO` provider supports multimodal data processing. ### model The specific embedding model to use. This depends on the `model_provider`. For example, if using OPENAI, you might specify `text-embedding-3-small`. ### api_path The API endpoint to use for making requests to the embedding service. This might vary based on the provider and model used. Generally, this is provided by the model provider. ### oauth_path The API endpoint for the oauth service. Get certification information. This might vary based on the provider and model used. Generally, this is provided by the model provider. ### custom_config The `custom_config` option allows you to provide additional custom configurations for the model. This is a map where you can define various settings that might be required by the specific model you're using. ### custom_response_parse The `custom_response_parse` option allows you to specify how to parse the model's response. You can use JsonPath to extract the specific data you need from the response. For example, by using `$.data[*].embedding`, you can extract the `embedding` field values from the following JSON and obtain a `List` of nested `List` results. For more details on using JsonPath, please refer to the [JsonPath Getting Started guide](https://github.com/json-path/JsonPath?tab=readme-ov-file#getting-started). ```json { "object": "list", "data": [ { "object": "embedding", "index": 0, "embedding": [ -0.006929283495992422, -0.005336422007530928, -0.00004547132266452536, -0.024047505110502243 ] } ], "model": "text-embedding-3-small", "usage": { "prompt_tokens": 5, "total_tokens": 5 } } ``` ### custom_request_headers The `custom_request_headers` option allows you to define custom headers that should be included in the request sent to the model's API. This is useful if the API requires additional headers beyond the standard ones, such as authorization tokens, content types, etc. 
### custom_request_body The `custom_request_body` option supports placeholders: - `${model}`: Placeholder for the model name. - `${input}`: Placeholder to determine input value and define request body request type based on the type of body value. Example: `["${input}"]` -> ["input"] (list) ### common options Transform plugin common parameters, please refer to [Transform Plugin](common-options/common-options.md) for details. ## Example Configurations ### Basic Text Embedding ```hocon env { job.mode = "BATCH" } source { FakeSource { row.num = 5 schema = { fields { book_id = "int" book_name = "string" book_intro = "string" author_biography = "string" } } rows = [ {fields = [1, "To Kill a Mockingbird", "Set in the American South during the 1930s, To Kill a Mockingbird tells the story of young Scout Finch and her brother, Jem, who are growing up in a world of racial inequality and injustice. Their father, Atticus Finch, is a lawyer who defends a black man falsely accused of raping a white woman, teaching his children valuable lessons about morality, courage, and empathy.", "Harper Lee (1926–2016) was an American novelist best known for To Kill a Mockingbird, which won the Pulitzer Prize in 1961. Lee was born in Monroeville, Alabama, and the town served as inspiration for the fictional Maycomb in her novel. Despite the success of her book, Lee remained a private person and published only one other novel, Go Set a Watchman, which was written before To Kill a Mockingbird but released in 2015 as a sequel." ], kind = INSERT} {fields = [2, "1984", "1984 is a dystopian novel set in a totalitarian society governed by Big Brother. The story follows Winston Smith, a man who works for the Party rewriting history. Winston begins to question the Party’s control and seeks truth and freedom in a society where individuality is crushed. 
The novel explores themes of surveillance, propaganda, and the loss of personal autonomy.", "George Orwell (1903–1950) was the pen name of Eric Arthur Blair, an English novelist, essayist, journalist, and critic. Orwell is best known for his works 1984 and Animal Farm, both of which are critiques of totalitarian regimes. His writing is characterized by lucid prose, awareness of social injustice, opposition to totalitarianism, and support of democratic socialism. Orwell’s work remains influential, and his ideas have shaped contemporary discussions on politics and society." ], kind = INSERT} {fields = [3, "Pride and Prejudice", "Pride and Prejudice is a romantic novel that explores the complex relationships between different social classes in early 19th century England. The story centers on Elizabeth Bennet, a young woman with strong opinions, and Mr. Darcy, a wealthy but reserved gentleman. The novel deals with themes of love, marriage, and societal expectations, offering keen insights into human behavior.", "Jane Austen (1775–1817) was an English novelist known for her sharp social commentary and keen observations of the British landed gentry. Her works, including Sense and Sensibility, Emma, and Pride and Prejudice, are celebrated for their wit, realism, and biting critique of the social class structure of her time. Despite her relatively modest life, Austen’s novels have gained immense popularity, and she is considered one of the greatest novelists in the English language." ], kind = INSERT} {fields = [4, "The Great GatsbyThe Great Gatsby", "The Great Gatsby is a novel about the American Dream and the disillusionment that can come with it. Set in the 1920s, the story follows Nick Carraway as he becomes entangled in the lives of his mysterious neighbor, Jay Gatsby, and the wealthy elite of Long Island. Gatsby's obsession with the beautiful Daisy Buchanan drives the narrative, exploring themes of wealth, love, and the decay of the American Dream.", "F. 
Scott Fitzgerald (1896–1940) was an American novelist and short story writer, widely regarded as one of the greatest American writers of the 20th century. Born in St. Paul, Minnesota, Fitzgerald is best known for his novel The Great Gatsby, which is often considered the quintessential work of the Jazz Age. His works often explore themes of youth, wealth, and the American Dream, reflecting the turbulence and excesses of the 1920s." ], kind = INSERT} {fields = [5, "Moby-Dick", "Moby-Dick is an epic tale of obsession and revenge. The novel follows the journey of Captain Ahab, who is on a relentless quest to kill the white whale, Moby Dick, that once maimed him. Narrated by Ishmael, a sailor aboard Ahab’s ship, the story delves into themes of fate, humanity, and the struggle between man and nature. The novel is also rich with symbolism and philosophical musings.", "Herman Melville (1819–1891) was an American novelist, short story writer, and poet of the American Renaissance period. Born in New York City, Melville gained initial fame with novels such as Typee and Omoo, but it was Moby-Dick, published in 1851, that would later be recognized as his masterpiece. Melville’s work is known for its complexity, symbolism, and exploration of themes such as man’s place in the universe, the nature of evil, and the quest for meaning. Despite facing financial difficulties and critical neglect during his lifetime, Melville’s reputation soared posthumously, and he is now considered one of the great American authors." 
], kind = INSERT} ] plugin_output = "fake" } } transform { Embedding { plugin_input = "fake" model_provider = QIANFAN model = bge_large_en api_key = xxxxxxxxxx secret_key = xxxxxxxxxx api_path = xxxxxxxxxx vectorization_fields { book_intro_vector = book_intro author_biography_vector = author_biography } plugin_output = "embedding_output" } } sink { Assert { plugin_input = "embedding_output" rules = { field_rules = [ { field_name = book_id field_type = int field_value = [ { rule_type = NOT_NULL } ] }, { field_name = book_name field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = book_intro field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = author_biography field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = book_intro_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] }, { field_name = author_biography_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] } ] } } } ``` ### Multimodal Embedding (Volcengine Doubao) Multimodal Embedding supports input as accessible URL or Binary data formats to process multimodal data. 
#### URL ```hocon env { job.mode = "BATCH" } source { FakeSource { row.num = 5 schema = { fields { id = "int" product_name = "string" description = "string" product_image_url = "string" product_video_url = "string" thumbnail_image = "string" promotional_video = "string" category = "string" price = "decimal(10,2)" created_at = "timestamp" } } rows = [ { fields = [ 1, "iPhone 15 Pro", "Latest iPhone with advanced camera system and A17 Pro chip", "https://example.com/images/iphone15pro.jpg", "https://example.com/videos/iphone15pro_demo.mp4", "https://example.com/thumbnails/iphone15pro_thumb.png", "https://example.com/videos/iphone15pro_promo.mov", "Electronics", 999.99, "2024-01-15T10:30:00" ], kind = INSERT }, { fields = [ 2, "MacBook Air M3", "Ultra-thin laptop with M3 chip for incredible performance", "https://example.com/images/macbook_air_m3.jpeg", "https://example.com/videos/macbook_air_review.avi", "https://example.com/thumbnails/macbook_thumb.webp", "https://example.com/videos/macbook_commercial.mp4", "Computers", 1299.99, "2024-02-20T14:15:00" ], kind = INSERT } ] plugin_output = "fake" } } transform { Embedding { plugin_input = "fake" model_provider = DOUBAO model = "doubao-embedding-vision" api_key = "your-api-key" api_path = "https://ark.cn-beijing.volces.com/api/v3/embeddings/multimodal" single_vectorized_input_number = 1 vectorization_fields { # Text field - defaults to text modality description_vector = description product_image_vector = { field = product_image_url modality = jpeg format = url } thumbnail_vector = { field = thumbnail_image # If value is "thumb.png", auto-detects as PNG format = url } demo_video_vector = { field = product_video_url modality = mp4 format = url } promo_video_vector = { field = promotional_video # If value is "promo.mov", auto-detects as MOV format = url } # Mixed content - product name product_name_vector = product_name } plugin_output = "multimodal_embedding_output" } } sink { Assert { plugin_input = 
"multimodal_embedding_output" rules = { field_rules = [ { field_name = id field_type = int field_value = [ { rule_type = NOT_NULL } ] }, { field_name = description_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] }, { field_name = product_image_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] }, { field_name = thumbnail_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] }, { field_name = demo_video_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] } ] } } } ``` #### Binary ```hocon env { job.mode = "BATCH" } source { LocalFile { path = "/seatunnel/read/binary/" file_format_type = "binary" binary_complete_file_mode = false binary_chunk_size = 1024 plugin_output = "binary_source" } } transform { Embedding { plugin_input = "binary_source" model_provider = DOUBAO model = "doubao-embedding-vision-250615" api_key = "test-api-key" api_path = "http://mockserver:1080/api/v3/embeddings/multimodal" single_vectorized_input_number = 1 vectorization_fields = { image_embedding = { field = "data" modality = "jpeg" format = "binary" } } plugin_output = "binary_embedding_output" } } sink { Assert { plugin_input = "binary_embedding_output" rules = { row_rules = [ { rule_type = MAX_ROW rule_value = 1 } ], field_rules = [ { field_name = image_embedding field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] }, { field_name = relativePath field_type = string field_value = [ { rule_type = NOT_NULL } ] } ] } } } ``` ### Customize the embedding model ```hocon env { job.mode = "BATCH" } source { FakeSource { row.num = 5 schema = { fields { book_id = "int" book_name = "string" book_intro = "string" author_biography = "string" } } rows = [ {fields = [1, "To Kill a Mockingbird", "Set in the American South during the 1930s, To Kill a Mockingbird tells the story of young Scout Finch and her brother, Jem, who are growing up in a world of racial inequality and injustice. 
Their father, Atticus Finch, is a lawyer who defends a black man falsely accused of raping a white woman, teaching his children valuable lessons about morality, courage, and empathy.", "Harper Lee (1926–2016) was an American novelist best known for To Kill a Mockingbird, which won the Pulitzer Prize in 1961. Lee was born in Monroeville, Alabama, and the town served as inspiration for the fictional Maycomb in her novel. Despite the success of her book, Lee remained a private person and published only one other novel, Go Set a Watchman, which was written before To Kill a Mockingbird but released in 2015 as a sequel." ], kind = INSERT} {fields = [2, "1984", "1984 is a dystopian novel set in a totalitarian society governed by Big Brother. The story follows Winston Smith, a man who works for the Party rewriting history. Winston begins to question the Party’s control and seeks truth and freedom in a society where individuality is crushed. The novel explores themes of surveillance, propaganda, and the loss of personal autonomy.", "George Orwell (1903–1950) was the pen name of Eric Arthur Blair, an English novelist, essayist, journalist, and critic. Orwell is best known for his works 1984 and Animal Farm, both of which are critiques of totalitarian regimes. His writing is characterized by lucid prose, awareness of social injustice, opposition to totalitarianism, and support of democratic socialism. Orwell’s work remains influential, and his ideas have shaped contemporary discussions on politics and society." ], kind = INSERT} {fields = [3, "Pride and Prejudice", "Pride and Prejudice is a romantic novel that explores the complex relationships between different social classes in early 19th century England. The story centers on Elizabeth Bennet, a young woman with strong opinions, and Mr. Darcy, a wealthy but reserved gentleman. 
The novel deals with themes of love, marriage, and societal expectations, offering keen insights into human behavior.", "Jane Austen (1775–1817) was an English novelist known for her sharp social commentary and keen observations of the British landed gentry. Her works, including Sense and Sensibility, Emma, and Pride and Prejudice, are celebrated for their wit, realism, and biting critique of the social class structure of her time. Despite her relatively modest life, Austen’s novels have gained immense popularity, and she is considered one of the greatest novelists in the English language." ], kind = INSERT} {fields = [4, "The Great GatsbyThe Great Gatsby", "The Great Gatsby is a novel about the American Dream and the disillusionment that can come with it. Set in the 1920s, the story follows Nick Carraway as he becomes entangled in the lives of his mysterious neighbor, Jay Gatsby, and the wealthy elite of Long Island. Gatsby's obsession with the beautiful Daisy Buchanan drives the narrative, exploring themes of wealth, love, and the decay of the American Dream.", "F. Scott Fitzgerald (1896–1940) was an American novelist and short story writer, widely regarded as one of the greatest American writers of the 20th century. Born in St. Paul, Minnesota, Fitzgerald is best known for his novel The Great Gatsby, which is often considered the quintessential work of the Jazz Age. His works often explore themes of youth, wealth, and the American Dream, reflecting the turbulence and excesses of the 1920s." ], kind = INSERT} {fields = [5, "Moby-Dick", "Moby-Dick is an epic tale of obsession and revenge. The novel follows the journey of Captain Ahab, who is on a relentless quest to kill the white whale, Moby Dick, that once maimed him. Narrated by Ishmael, a sailor aboard Ahab’s ship, the story delves into themes of fate, humanity, and the struggle between man and nature. 
The novel is also rich with symbolism and philosophical musings.", "Herman Melville (1819–1891) was an American novelist, short story writer, and poet of the American Renaissance period. Born in New York City, Melville gained initial fame with novels such as Typee and Omoo, but it was Moby-Dick, published in 1851, that would later be recognized as his masterpiece. Melville’s work is known for its complexity, symbolism, and exploration of themes such as man’s place in the universe, the nature of evil, and the quest for meaning. Despite facing financial difficulties and critical neglect during his lifetime, Melville’s reputation soared posthumously, and he is now considered one of the great American authors." ], kind = INSERT} ] plugin_output = "fake" } } transform { Embedding { plugin_input = "fake" model_provider = CUSTOM model = text-embedding-3-small api_key = xxxxxxxx api_path = "http://mockserver:1080/v1/doubao/embedding" single_vectorized_input_number = 2 vectorization_fields { book_intro_vector = book_intro author_biography_vector = author_biography } custom_config={ custom_response_parse = "$.data[*].embedding" custom_request_headers = { "Content-Type"= "application/json" "Authorization"= "Bearer xxxxxxx" } custom_request_body ={ modelx = "${model}" inputx = ["${input}"] } } plugin_output = "embedding_output_1" } } sink { Assert { plugin_input = "embedding_output_1" rules = { field_rules = [ { field_name = book_id field_type = int field_value = [ { rule_type = NOT_NULL } ] }, { field_name = book_name field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = book_intro field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = author_biography field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = book_intro_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] }, { field_name = author_biography_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } 
] } ] } } } ``` ================================================ FILE: docs/en/transforms/encrypt.md ================================================ # Encrypt > Encrypt transform plugin ## Description The Encrypt transform plugin is used to encrypt or decrypt specified fields in records using a symmetric encryption algorithm. ## Options | name | type | required | default value | description | |-------------|--------|----------|---------------|-----------------------------------| | `fields` | Array | Yes | - | List of fields to encrypt/decrypt | | `algorithm` | String | No | `AES_GCM` | Encryption algorithm | | `key` | String | Yes | - | Base64-encoded encryption key | | `mode` | String | No | `ENCRYPT` | `ENCRYPT` or `DECRYPT` | ### algorithm [string] Encryption algorithm used by this transform. Supported values: - `AES_GCM`: default, AES in GCM mode with authentication tag - `AES_CBC`: AES in CBC mode with PKCS5 padding `AES_GCM` provides authenticated encryption and is recommended for better security. If not specified, `AES_GCM` is used by default. ### key [string] The encryption key must be provided in Base64-encoded format. Make sure the key length matches the requirements of the selected algorithm. For both `AES_GCM` and `AES_CBC`, valid key lengths are 16, 24, or 32 bytes (corresponding to AES-128, AES-192, or AES-256). 
**Example** - `base64:AAAAAAAAAAAAAAAAAAAAAA==` - `AAAAAAAAAAAAAAAAAAAAAA==` ### common options [string] Transform plugin common parameters, please refer to [Transform Plugin](common-options.md) for details ## Example ``` transform { FieldEncrypt { fields = ["name"] key = "base64:AAAAAAAAAAAAAAAAAAAAAA==" algorithm = "AES_CBC" mode = "ENCRYPT" } } ``` ``` transform { FieldEncrypt { fields = ["name"] key = "base64:AAAAAAAAAAAAAAAAAAAAAA==" algorithm = "AES_CBC" mode = "DECRYPT" } } ``` ================================================ FILE: docs/en/transforms/field-mapper.md ================================================ # FieldMapper > FieldMapper transform plugin ## Description Add input schema and output schema mapping. ## Options | name | type | required | default value | |--------------|--------|----------|---------------| | field_mapper | Object | yes | | ### field_mapper [config] Specify the field mapping relationship between input and output ### common options [config] Transform plugin common parameters, please refer to [Transform Plugin](common-options/common-options.md) for details. ## Example The data read from source is a table like this: | id | name | age | card | |----|----------|-----|------| | 1 | Joy Ding | 20 | 123 | | 2 | May Ding | 20 | 123 | | 3 | Kin Dom | 20 | 123 | | 4 | Joy Dom | 20 | 123 | We want to delete `age` field and update the field order to `id`, `card`, `name` and rename `name` to `new_name`. 
We can add a `FieldMapper` transform like this: ``` transform { FieldMapper { plugin_input = "fake" plugin_output = "fake1" field_mapper = { id = id card = card name = new_name } } } ``` Then the data in result table `fake1` will look like this: | id | card | new_name | |----|------|----------| | 1 | 123 | Joy Ding | | 2 | 123 | May Ding | | 3 | 123 | Kin Dom | | 4 | 123 | Joy Dom | ## Changelog ### new version - Add Copy Transform Connector ================================================ FILE: docs/en/transforms/field-rename.md ================================================ # FieldRename > FieldRename transform plugin ## Description FieldRename transform plugin for renaming field names. ## Options | name | type | required | default value | Description | |:-----------------------:|--------|----------|---------------|-----------------------------------------------------------------------------------------------------------------------| | convert_case | string | no | | The case conversion type. The options can be `UPPER`, `LOWER` | | prefix | string | no | | The prefix to be added to the field name | | suffix | string | no | | The suffix to be added to the field name | | replacements_with_regex | array | no | | The array of replacement rules. Each rule is a map with `replace_from`, `replace_to`, and optional `is_regex` (default `true`). When `is_regex=false`, `replace_from` is treated as an exact field name (full match). | | specific | array | no | | Specific rename rules. Each rule is a map with `field_name` and `target_name`. When matched, it will rename the field directly and skip other rename rules. 
| ## Examples ### Convert field to uppercase ``` env { parallelism = 1 job.mode = "STREAMING" } source { MySQL-CDC { plugin_output = "customers_mysql_cdc" username = "root" password = "123456" table-names = ["source.user_shop", "source.user_order"] url = "jdbc:mysql://localhost:3306/source" } } transform { FieldRename { plugin_input = "customers_mysql_cdc" plugin_output = "trans_result" convert_case = "UPPER" prefix = "F_" suffix = "_S" replacements_with_regex = [ { replace_from = "create_time" replace_to = "SOURCE_CREATE_TIME" } ] } } sink { Jdbc { plugin_input = "trans_result" driver="oracle.jdbc.OracleDriver" url="jdbc:oracle:thin:@oracle-host:1521/ORCLCDB" user="myuser" password="mypwd" generate_sink_sql = true database = "ORCLCDB" table = "${database_name}.${table_name}" primary_keys = ["${primary_key}"] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ### Rename specific fields ``` transform { FieldRename { plugin_input = "input" plugin_output = "output" specific = [ { field_name = "InvoiceNum", target_name = "invoice_num" } ] } } ``` ### Convert field name to lowercase ``` env { parallelism = 1 job.mode = "STREAMING" } source { Oracle-CDC { plugin_output = "customers_oracle_cdc" url = "jdbc:oracle:thin:@localhost:1521/ORCLCDB" username = "dbzuser" password = "dbz" database-names = ["ORCLCDB"] schema-names = ["DEBEZIUM"] table-names = ["SOURCE.USER_SHOP", "SOURCE.USER_ORDER"] } } transform { FieldRename { plugin_input = "customers_oracle_cdc" plugin_output = "trans_result" convert_case = "LOWER" prefix = "f_" suffix = "_s" replacements_with_regex = [ { replace_from = "CREATE_TIME" replace_to = "source_create_time" } ] } } sink { Jdbc { plugin_input = "trans_result" url = "jdbc:mysql://localhost:3306/test" driver = "com.mysql.cj.jdbc.Driver" user = "st_user_sink" password = "mysqlpw" generate_sink_sql = true database = "${schema_name}" table = "${table_name}" primary_keys = ["${primary_key}"] schema_save_mode = 
"CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ================================================ FILE: docs/en/transforms/filter-rowkind.md ================================================ # FilterRowKind > FilterRowKind transform plugin ## Description Filter the data by RowKind ## Options | name | type | required | default value | |---------------|-------|----------|---------------| | include_kinds | array | yes | | | exclude_kinds | array | yes | | ### include_kinds [array] The row kinds to include ### exclude_kinds [array] The row kinds to exclude. You can only config one of `include_kinds` and `exclude_kinds`. ### common options [string] Transform plugin common parameters, please refer to [Transform Plugin](common-options/common-options.md) for details ## Examples The RowKink of the data generate by FakeSource is `INSERT`, If we use `FilterRowKink` transform and exclude the `INSERT` data, we will write zero rows into sink. ```yaml env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { id = "int" name = "string" age = "int" } } } } transform { FilterRowKind { plugin_input = "fake" plugin_output = "fake1" exclude_kinds = ["INSERT"] } } sink { Console { plugin_input = "fake1" } } ``` ================================================ FILE: docs/en/transforms/filter.md ================================================ # Filter > Filter transform plugin ## Description Filter the field. ## Options | name | type | required | default value | |----------------|-------|----------|---------------| | include_fields | array | no | | | exclude_fields | array | no | | Notice, you must set one and only one of `include_fields` and `exclude_fields` properties ### include_fields [array] The list of fields that need to be kept. Fields not in the list will be deleted. ### exclude_fields [array] The list of fields that need to be deleted. Fields not in the list will be kept. 
### common options [string] Transform plugin common parameters, please refer to [Transform Plugin](common-options/common-options.md) for details ## Example The data read from source is a table like this: | name | age | card | |----------|-----|------| | Joy Ding | 20 | 123 | | May Ding | 20 | 123 | | Kin Dom | 20 | 123 | | Joy Dom | 20 | 123 | If we want to keep only the fields `name` and `card`, we can add a `Filter` Transform like below: ``` transform { Filter { plugin_input = "fake" plugin_output = "fake1" include_fields = [name, card] } } ``` Or we can delete the field named `age` by adding a `Filter` Transform with `exclude_fields` field set like below: ``` transform { Filter { plugin_input = "fake" plugin_output = "fake1" exclude_fields = [age] } } ``` It is useful when you want to delete a small number of fields from a large table with tons of fields. Then the data in result table `fake1` will look like this: | name | card | |----------|------| | Joy Ding | 123 | | May Ding | 123 | | Kin Dom | 123 | | Joy Dom | 123 | ## Changelog ### new version - Add Filter Transform Connector ================================================ FILE: docs/en/transforms/jsonpath.md ================================================ # JsonPath > JsonPath transform plugin ## Description > Supports using JsonPath to select data ## Options | name | type | required | default value | |----------------------|-------|----------|---------------| | columns | Array | Yes | | | row_error_handle_way | Enum | No | FAIL | ### common options [string] Transform plugin common parameters, please refer to [Transform Plugin](common-options/common-options.md) for details ### row_error_handle_way [Enum] This option is used to specify the processing method when an error occurs in the row, the default value is `FAIL`. - FAIL: When `FAIL` is selected, data format error will block and an exception will be thrown. - SKIP: When `SKIP` is selected, data format error will skip this row data. 
### columns [array] #### option | name | type | required | default value | |-------------------------|--------|----------|---------------| | src_field | String | Yes | | | dest_field | String | Yes | | | path | String | Yes | | | dest_type | String | No | String | | column_error_handle_way | Enum | No | | #### src_field > the json source field you want to parse Supported SeaTunnel data types: * STRING * BYTES * ARRAY * MAP * ROW #### dest_field > the output field that receives the value selected by the JsonPath #### dest_type > the type of dest field #### path > Jsonpath #### column_error_handle_way [Enum] This option is used to specify the processing method when an error occurs in the column. - FAIL: When `FAIL` is selected, data format error will block and an exception will be thrown. - SKIP: When `SKIP` is selected, data format error will skip this column data. - SKIP_ROW: When `SKIP_ROW` is selected, data format error will skip this row data. ## Read Json Example The data read from source is a table like this json: ```json { "data": { "c_string": "this is a string", "c_boolean": true, "c_integer": 42, "c_float": 3.14, "c_double": 3.14, "c_decimal": 10.55, "c_date": "2023-10-29", "c_datetime": "16:12:43.459", "c_array":["item1", "item2", "item3"], "c_map_array": [{"c_string_1":"c_string_1","c_string_2":"c_string_2","c_string_3":"c_string_3"},{"c_string_1":"c_string_1","c_string_2":"c_string_2","c_string_3":"c_string_3"}] } } ``` Assuming we want to use JsonPath to extract properties. 
```json transform { JsonPath { plugin_input = "fake" plugin_output = "fake1" columns = [ { "src_field" = "data" "path" = "$.data.c_string" "dest_field" = "c1_string" }, { "src_field" = "data" "path" = "$.data.c_boolean" "dest_field" = "c1_boolean" "dest_type" = "boolean" }, { "src_field" = "data" "path" = "$.data.c_integer" "dest_field" = "c1_integer" "dest_type" = "int" }, { "src_field" = "data" "path" = "$.data.c_float" "dest_field" = "c1_float" "dest_type" = "float" }, { "src_field" = "data" "path" = "$.data.c_double" "dest_field" = "c1_double" "dest_type" = "double" }, { "src_field" = "data" "path" = "$.data.c_decimal" "dest_field" = "c1_decimal" "dest_type" = "decimal(4,2)" }, { "src_field" = "data" "path" = "$.data.c_date" "dest_field" = "c1_date" "dest_type" = "date" }, { "src_field" = "data" "path" = "$.data.c_datetime" "dest_field" = "c1_datetime" "dest_type" = "time" }, { "src_field" = "data" "path" = "$.data.c_array" "dest_field" = "c1_array" "dest_type" = "array<string>" }, { "src_field" = "data" "path" = "$.data.c_map_array" "dest_field" = "c1_map_array" "dest_type" = "array<map<string, string>>" } ] } } ``` The same result can be achieved with much simpler configuration using batch field extraction with array format: ```hocon transform { JsonPath { plugin_input = "fake" plugin_output = "fake1" columns = [ { "src_field" = "data" "path" = ["$.data.c_string", "$.data.c_boolean", "$.data.c_integer", "$.data.c_float", "$.data.c_double", "$.data.c_decimal", "$.data.c_date", "$.data.c_datetime", "$.data.c_array", "$.data.c_map_array"] "dest_field" = ["c1_string", "c1_boolean", "c1_integer", "c1_float", "c1_double", "c1_decimal", "c1_date", "c1_datetime", "c1_array", "c1_map_array"] "dest_type" = ["string", "boolean", "int", "float", "double", "decimal(4,2)", "date", "time", "array<string>", "array<map<string, string>>"] } ] } } ``` **Important:** When using batch field extraction (multiple paths, dest_fields, and dest_types), the `dest_type` parameter is **required** and cannot be omitted. 
Each extracted field must have a corresponding type specified. The array format provides better readability and is less error-prone than string-based configurations. Then the data result table `fake1` will like this | data | c1_string | c1_boolean | c1_integer | c1_float | c1_double | c1_decimal | c1_date | c1_datetime | c1_array | |------------------------------|------------------|------------|------------|----------|-----------|------------|------------|--------------|-----------------------------| | too much content not to show | this is a string | true | 42 | 3.14 | 3.14 | 10.55 | 2023-10-29 | 16:12:43.459 | ["item1", "item2", "item3"] | ## Read SeatunnelRow Example Suppose a column in a row of data is of type SeatunnelRow and that the name of the column is col
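    <table>
    <tr><th colspan="2">SeatunnelRow(col)</th><th>other</th></tr>
    <tr><td>name</td><td>age</td><td>....</td></tr>
    <tr><td>a</td><td>18</td><td>....</td></tr>
    </table>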
    The JsonPath transform converts the values of seatunnel into an array, ```hocon transform { JsonPath { plugin_input = "fake" plugin_output = "fake1" row_error_handle_way = FAIL columns = [ { "src_field" = "col" "path" = "$[0]" "dest_field" = "name" "dest_type" = "string" }, { "src_field" = "col" "path" = "$[1]" "dest_field" = "age" "dest_type" = "int" } ] } } ``` Then the data result table `fake1` will like this | name | age | col | other | |------|-----|----------|-------| | a | 18 | ["a",18] | ... | ## Configure error data handle way You can configure `row_error_handle_way` and `column_error_handle_way` to handle abnormal data. Both are optional. `row_error_handle_way` is used to handle all data anomalies in the row data, while `column_error_handle_way` is used to handle data anomalies in a column. It has a higher priority than `row_error_handle_way`. ### Skip error data rows Configure to skip row data with exceptions in any column ```hocon transform { JsonPath { row_error_handle_way = SKIP columns = [ { "src_field" = "json_data" "path" = "$.f1" "dest_field" = "json_data_f1" }, { "src_field" = "json_data" "path" = "$.f2" "dest_field" = "json_data_f2" } ] } } ``` ### Skip error data column Configure only `json_data_f1` column data exceptions to skip and fill in null values, other column data exceptions will continue to throw exception interrupt handlers ```hocon transform { JsonPath { row_error_handle_way = FAIL columns = [ { "src_field" = "json_data" "path" = "$.f1" "dest_field" = "json_data_f1" "column_error_handle_way" = "SKIP" }, { "src_field" = "json_data" "path" = "$.f2" "dest_field" = "json_data_f2" } ] } } ``` ### Skip the row for specified column error Configure to skip the row of data only for `json_data_f1` column data exceptions, and continue to throw exceptions to interrupt the handler for other column data exceptions ```hocon transform { JsonPath { row_error_handle_way = FAIL columns = [ { "src_field" = "json_data" "path" = "$.f1" "dest_field" = 
"json_data_f1" "column_error_handle_way" = "SKIP_ROW" }, { "src_field" = "json_data" "path" = "$.f2" "dest_field" = "json_data_f2" } ] } } ``` ## Changelog * Add JsonPath Transform ================================================ FILE: docs/en/transforms/llm.md ================================================ # LLM > LLM transform plugin ## Description Leverage the power of a large language model (LLM) to process data by sending it to the LLM and receiving the generated results. Utilize the LLM's capabilities to label, clean, enrich data, perform data inference, and more. ## Options | name | type | required | default value | |------------------------|--------|----------|---------------| | model_provider | enum | yes | | | output_data_type | enum | no | String | | output_column_name | string | no | llm_output | | prompt | string | yes | | | inference_columns | list | no | | | model | string | yes | | | api_key | string | yes | | | api_path | string | no | | | custom_config | map | no | | | custom_response_parse | string | no | | | custom_request_headers | map | no | | | custom_request_body | map | no | | ### model_provider The model provider to use. The available options are: OPENAI, DOUBAO, DEEPSEEK, KIMIAI, MICROSOFT, ZHIPU, CUSTOM > tips: If you use Microsoft, please make sure api_path cannot be empty ### output_data_type The data type of the output data. The available options are: STRING,INT,BIGINT,DOUBLE,BOOLEAN. Default value is STRING. ### output_column_name Custom output data field name. A custom field name that is the same as an existing field name is replaced with 'llm_output'. ### prompt The prompt to send to the LLM. 
This parameter defines how LLM will process and return data, eg: The data read from source is a table like this: | name | age | |---------------|-----| | Jia Fan | 20 | | Hailin Wang | 20 | | Eric | 20 | | Guangdong Liu | 20 | The prompt can be: ``` Determine whether someone is Chinese or American by their name ``` The result will be: | name | age | llm_output | |---------------|-----|------------| | Jia Fan | 20 | Chinese | | Hailin Wang | 20 | Chinese | | Eric | 20 | American | | Guangdong Liu | 20 | Chinese | ### inference_columns The `inference_columns` option allows you to specify which columns from the input data should be used as inputs for the LLM. By default, all columns will be used as inputs. For example: ```hocon transform { LLM { model_provider = OPENAI model = gpt-4o-mini api_key = sk-xxx inference_columns = ["name", "age"] prompt = "Determine whether someone is Chinese or American by their name" } } ``` ### model The model to use. Different model providers have different models. For example, the OpenAI model can be `gpt-4o-mini`. If you use OpenAI model, please refer https://platform.openai.com/docs/models/model-endpoint-compatibility of `/v1/chat/completions` endpoint. ### api_key The API key to use for the model provider. If you use OpenAI model, please refer https://platform.openai.com/docs/api-reference/api-keys of how to get the API key. ### api_path The API path to use for the model provider. In most cases, you do not need to change this configuration. If you are using an API agent's service, you may need to configure it to the agent's API address. ### custom_config The `custom_config` option allows you to provide additional custom configurations for the model. This is a map where you can define various settings that might be required by the specific model you're using. ### custom_response_parse The `custom_response_parse` option allows you to specify how to parse the model's response. 
You can use JsonPath to extract the specific data you need from the response. For example, by using `$.choices[*].message.content`, you can extract the `content` field values from the following JSON. For more details on using JsonPath, please refer to the [JsonPath Getting Started guide](https://github.com/json-path/JsonPath?tab=readme-ov-file#getting-started). ```json { "id": "chatcmpl-9s4hoBNGV0d9Mudkhvgzg64DAWPnx", "object": "chat.completion", "created": 1722674828, "model": "gpt-4o-mini", "choices": [ { "index": 0, "message": { "role": "assistant", "content": "[\"Chinese\"]" }, "logprobs": null, "finish_reason": "stop" } ], "usage": { "prompt_tokens": 107, "completion_tokens": 3, "total_tokens": 110 }, "system_fingerprint": "fp_0f03d4f0ee", "code": 0, "msg": "ok" } ``` ### custom_request_headers The `custom_request_headers` option allows you to define custom headers that should be included in the request sent to the model's API. This is useful if the API requires additional headers beyond the standard ones, such as authorization tokens, content types, etc. ### custom_request_body The `custom_request_body` option supports placeholders: - `${model}`: Placeholder for the model name. - `${input}`: Placeholder to determine input value and define request body request type based on the type of body value. Example: `"${input}"` -> "input" - `${prompt}`: Placeholder for the LLM prompt. ### common options [string] Transform plugin common parameters, please refer to [Transform Plugin](common-options/common-options.md) for details ## tips LLM APIs usually enforce a rate limit; configure SeaTunnel's speed limit accordingly to ensure smooth operation of the task. For details about SeaTunnel speed limit settings, please refer to [speed-limit](../introduction/concepts/speed-limit.md). ## Example OPENAI Determine the user's country through an LLM. 
```hocon env { parallelism = 1 job.mode = "BATCH" read_limit.rows_per_second = 10 } source { FakeSource { row.num = 5 schema = { fields { id = "int" name = "string" } } rows = [ {fields = [1, "Jia Fan"], kind = INSERT} {fields = [2, "Hailin Wang"], kind = INSERT} {fields = [3, "Tomas"], kind = INSERT} {fields = [4, "Eric"], kind = INSERT} {fields = [5, "Guangdong Liu"], kind = INSERT} ] } } transform { LLM { model_provider = OPENAI model = gpt-4o-mini api_key = sk-xxx prompt = "Determine whether someone is Chinese or American by their name" } } sink { console { } } ``` ## Example KIMIAI Determine whether a person is a historical emperor of China. ```hocon env { parallelism = 1 job.mode = "BATCH" read_limit.rows_per_second = 10 } source { FakeSource { row.num = 5 schema = { fields { id = "int" name = "string" } } rows = [ {fields = [1, "Zhuge Liang"], kind = INSERT} {fields = [2, "Li Shimin"], kind = INSERT} {fields = [3, "Sun Wukong"], kind = INSERT} {fields = [4, "Zhu Yuanzhuang"], kind = INSERT} {fields = [5, "George Washington"], kind = INSERT} ] } } transform { LLM { model_provider = KIMIAI model = moonshot-v1-8k api_key = sk-xxx prompt = "Determine whether a person is a historical emperor of China" output_data_type = boolean } } sink { console { } } ``` ### Customize the LLM model ```hocon env { job.mode = "BATCH" } source { FakeSource { row.num = 5 schema = { fields { id = "int" name = "string" } } rows = [ {fields = [1, "Jia Fan"], kind = INSERT} {fields = [2, "Hailin Wang"], kind = INSERT} {fields = [3, "Tomas"], kind = INSERT} {fields = [4, "Eric"], kind = INSERT} {fields = [5, "Guangdong Liu"], kind = INSERT} ] plugin_output = "fake" } } transform { LLM { plugin_input = "fake" model_provider = CUSTOM model = gpt-4o-mini api_key = sk-xxx prompt = "Determine whether someone is Chinese or American by their name" openai.api_path = "http://mockserver:1080/v1/chat/completions" custom_config={ custom_response_parse = "$.choices[*].message.content" 
custom_request_headers = { Content-Type = "application/json" Authorization = "Bearer xxxxxxxx" } custom_request_body ={ model = "${model}" messages = [ { role = "system" content = "${prompt}" }, { role = "user" content = "${input}" }] } } plugin_output = "llm_output" } } sink { Assert { plugin_input = "llm_output" rules = { field_rules = [ { field_name = llm_output field_type = string field_value = [ { rule_type = NOT_NULL } ] } ] } } } ``` ================================================ FILE: docs/en/transforms/metadata.md ================================================ # Metadata > Metadata transform plugin ## Description The Metadata transform plugin is used to extract metadata information from data rows and convert it into regular fields for subsequent processing and analysis. **Core Features:** - Extracts metadata (such as database name, table name, row type, etc.) as visible fields - Supports custom output field names - Does not modify original data fields, only adds metadata fields **Typical Use Cases:** - Recording data source (database name, table name) during CDC data synchronization - Tracking data change types (INSERT, UPDATE, DELETE) - Recording event time and delay information of data - Identifying data sources when merging multiple tables ## Supported Metadata Fields | Metadata Key | Output Type | Description | Data Source | |:---------:|:--------:|:-----------------------------:|:----:| | Database | string | Name of the database containing the data | All connectors | | Table | string | Name of the table containing the data | All connectors | | RowKind | string | Row change type, values: +I (insert), -U (update before), +U (update after), -D (delete) | All connectors | | EventTime | long | Event timestamp of data change (milliseconds) | CDC connectors; Kafka source (ConsumerRecord.timestamp) | | Delay | long | Data collection delay time (milliseconds), i.e., the difference between data extraction time and database change time | CDC connectors | | 
Partition | string | Partition information of the data, multiple partition fields separated by commas | Connectors supporting partitions | ### Important Notes 1. **Metadata field names are case-sensitive**: Configuration must strictly follow the Key names in the table above (e.g., `Database`, `Table`, `RowKind`, etc.) 2. **Time fields**: `Delay` is only valid when using CDC connectors (except TiDB-CDC). `EventTime` is provided by CDC connectors and also by the Kafka source via `ConsumerRecord.timestamp` when available. 3. **Kafka event time**: The Kafka source writes `ConsumerRecord.timestamp` (milliseconds) into `EventTime` when it is non-negative, so you can surface it with the `Metadata` transform. ## Options | name | type | required | default value | description | |:---------------:|------|:--------:|:-------------:|-------------------| | metadata_fields | map | no | empty map | Mapping relationship between metadata fields and output fields, format: `Metadata Key = output field name` | ### metadata_fields [map] Defines the mapping relationship between metadata fields and output fields. **Configuration Format:** ```hocon metadata_fields { <Metadata Key> = <output field name> <Metadata Key> = <output field name> ... 
} ``` **Configuration Example:** ```hocon metadata_fields { Database = source_db # Map database name to source_db field Table = source_table # Map table name to source_table field RowKind = op_type # Map row type to op_type field EventTime = event_ts # Map event time to event_ts field Delay = sync_delay # Map delay time to sync_delay field Partition = partition_info # Map partition info to partition_info field } ``` **Notes:** - The left side must be a supported metadata Key (see table above), and is strictly case-sensitive - The right side is a custom output field name, which cannot duplicate existing field names - You can select only the metadata fields you need, not all of them must be configured ## Complete Examples ### Example 1: MySQL CDC Data Synchronization, Extracting All Metadata Synchronizing data from MySQL database and extracting all available metadata information. ```yaml env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MySQL-CDC { plugin_output = "mysql_cdc_source" server-id = 5652 username = "root" password = "your_password" table-names = ["mydb.users"] url = "jdbc:mysql://localhost:3306/mydb" } } transform { Metadata { plugin_input = "mysql_cdc_source" plugin_output = "metadata_added" metadata_fields { Database = source_database # Extract database name Table = source_table # Extract table name RowKind = change_type # Extract change type EventTime = event_timestamp # Extract event time Delay = sync_delay_ms # Extract sync delay } } } sink { Console { plugin_input = "metadata_added" } } ``` **Input Data Example:** ``` Original data row (from mydb.users table): id=1, name="John", age=25 RowKind: +I (INSERT) ``` **Output Data Example:** ``` Transformed data row: id=1, name="John", age=25, source_database="mydb", source_table="users", change_type="+I", event_timestamp=1699000000000, sync_delay_ms=100 ``` --- ### Example 2: Extracting Only Partial Metadata Extracting only data source information (database name and table 
name) for multi-table merge scenarios. ```yaml env { parallelism = 1 job.mode = "STREAMING" } source { MySQL-CDC { plugin_output = "multi_table_source" server-id = 5652 username = "root" password = "your_password" table-names = ["db1.orders", "db2.orders"] url = "jdbc:mysql://localhost:3306" } } transform { Metadata { plugin_input = "multi_table_source" plugin_output = "with_source_info" metadata_fields { Database = db_name Table = table_name } } } sink { Jdbc { plugin_input = "with_source_info" url = "jdbc:mysql://localhost:3306/target_db" table = "merged_orders" # Target table will contain db_name and table_name fields to identify data source } } ``` ### Example 3: Kafka record time for partitioning Expose Kafka `ConsumerRecord.timestamp` (injected into `EventTime`) as `kafka_ts`, convert it to a partition field, and write to Hive. This pattern is useful when replaying Kafka data and aligning partitions by the original record time. ```hocon env { execution.parallelism = 4 job.mode = "STREAMING" checkpoint.interval = 60000 } source { Kafka { plugin_output = "kafka_raw" schema = { fields { id = bigint customer_type = string data = string } } format = text field_delimiter = "|" topic = "push_report_event" bootstrap.servers = "kafka-broker-1:9092,kafka-broker-2:9092" consumer.group = "seatunnel_event_backfill" kafka.config = { max.poll.records = 100 auto.offset.reset = "earliest" enable.auto.commit = "false" } } } transform { Metadata { plugin_input = "kafka_raw" plugin_output = "kafka_with_meta" metadata_fields = { EventTime = "kafka_ts" } } Sql { plugin_input = "kafka_with_meta" plugin_output = "source_table" query = "select id, customer_type, data, FROM_UNIXTIME(kafka_ts/1000, 'yyyy-MM-dd', 'Asia/Shanghai') as pt from kafka_with_meta where kafka_ts >= 0" } } sink { Hive { table_name = "example_db.ods_sys_event_report" metastore_uri = "thrift://metastore-1:9083,thrift://metastore-2:9083" hdfs_site_path = "/path/to/hdfs-site.xml" hive_site_path = 
"/path/to/hive-site.xml" krb5_path = "/path/to/krb5.conf" kerberos_principal = "hive/metastore-1@EXAMPLE.COM" kerberos_keytab_path = "/path/to/hive.keytab" overwrite = false plugin_input = "source_table" # compress_codec = "SNAPPY" } } ``` Here `pt` is derived from the Kafka event time and can be used as a Hive partition column. ================================================ FILE: docs/en/transforms/regexextract.md ================================================ # RegexExtract > RegexExtract transform plugin ## Description The `RegexExtract` transform plugin uses regular expressions to extract data from a specified field and outputs the extracted values to new fields. It supports capture groups in regex patterns and allows setting default values for each output field when the pattern doesn't match. ## Options | name | type | required | default value | |----------------|---------|----------|---------------| | source_field | string | yes | | | regex_pattern | string | yes | | | output_fields | array | yes | | | default_values | array | no | | ### source_field [string] The source field name to extract data from. ### regex_pattern [string] The regular expression pattern with capture groups. The number of capture groups must match the number of output fields. ### output_fields [array] The names of the output fields for extracted values. The size must match the number of capture groups in the regex pattern. ### default_values [array] Default values for output fields when the regex pattern does not match or the source field is null. If provided, the size must match the number of output fields. 
## Example The data read from source is a table like this: | id | email | log_entry | |----|--------------------|------------------------------------------------------| | 1 | user1@example.com | 2023-12-01 10:30:45 INFO User login successful | | 2 | admin@test.org | 2023-12-01 11:15:22 ERROR Database connection failed | | 3 | guest@domain.net | 2023-12-01 12:00:00 WARN Memory usage high | We want to extract username, domain, and top-level domain from the `email` field: ``` transform { RegexExtract { plugin_input = "fake" plugin_output = "regex_result" source_field = "email" regex_pattern = "([^@]+)@([^.]+)\\.(.+)" output_fields = ["username", "domain", "tld"] default_values = ["unknown", "unknown", "unknown"] } } ``` Then the data in result table `regex_result` will be: | id | email | log_entry | username | domain | tld | |----|--------------------|------------------------------------------------------|----------|---------|-----| | 1 | user1@example.com | 2023-12-01 10:30:45 INFO User login successful | user1 | example | com | | 2 | admin@test.org | 2023-12-01 11:15:22 ERROR Database connection failed | admin | test | org | | 3 | guest@domain.net | 2023-12-01 12:00:00 WARN Memory usage high | guest | domain | net | ## Job Config Example ``` env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { id = "int" email = "string" log_entry = "string" } } rows = [ { kind = INSERT, fields = [1, "user1@example.com", "2023-12-01 10:30:45 INFO User login successful"] }, { kind = INSERT, fields = [2, "admin@test.org", "2023-12-01 11:15:22 ERROR Database connection failed"] }, { kind = INSERT, fields = [3, "guest@domain.net", "2023-12-01 12:00:00 WARN Memory usage high"] } ] } } transform { RegexExtract { plugin_input = "fake" plugin_output = "regex_result" source_field = "email" regex_pattern = "([^@]+)@([^.]+)\\.(.+)" output_fields = ["username", "domain", "tld"] default_values = ["unknown", "unknown", "unknown"] } } sink { 
Console { plugin_input = "regex_result" } } ``` ## Changelog ================================================ FILE: docs/en/transforms/replace.md ================================================ # Replace > Replace transform plugin ## Description Examines the string value in a given field and replaces each substring that matches the given string literal or regular expression with the given replacement. ## Options | name | type | required | default value | |---------------|---------|----------|---------------| | replace_field | string | yes | | | pattern | string | yes | - | | replacement | string | yes | - | | is_regex | boolean | no | false | | replace_first | boolean | no | false | ### replace_field [string] The field you want to replace ### pattern [string] The old string that will be replaced ### replacement [string] The new string to use as the replacement ### is_regex [boolean] Whether to use a regex for string matching ### replace_first [boolean] Whether to replace only the first matching string. Only used when `is_regex = true`. ### common options [string] Transform plugin common parameters, please refer to [Transform Plugin](common-options/common-options.md) for details ## Example The data read from source is a table like this: | name | age | card | |----------|-----|------| | Joy Ding | 20 | 123 | | May Ding | 20 | 123 | | Kin Dom | 20 | 123 | | Joy Dom | 20 | 123 | We want to replace the space character ` ` with `_` in the `name` field. 
Then we can add a `Replace` Transform like this: ``` transform { Replace { plugin_input = "fake" plugin_output = "fake1" replace_field = "name" pattern = " " replacement = "_" is_regex = true } } ``` Then the data in result table `fake1` will update to | name | age | card | |----------|-----|------| | Joy_Ding | 20 | 123 | | May_Ding | 20 | 123 | | Kin_Dom | 20 | 123 | | Joy_Dom | 20 | 123 | ## Job Config Example ``` env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { id = "int" name = "string" } } } } transform { Replace { plugin_input = "fake" plugin_output = "fake1" replace_field = "name" pattern = ".+" replacement = "b" is_regex = true } } sink { Console { plugin_input = "fake1" } } ``` ## Changelog ### new version - Add Replace Transform Connector ================================================ FILE: docs/en/transforms/rowkind-extractor.md ================================================ # RowKindExtractor > RowKindExtractor transform plugin ## Description The RowKindExtractor transform plugin is used to convert CDC (Change Data Capture) data streams into Append-Only mode while extracting the original RowKind information as a new field. **Core Features:** - Converts all data rows' RowKind to `+I` (INSERT), achieving Append-Only mode - Saves the original RowKind information (INSERT, UPDATE_BEFORE, UPDATE_AFTER, DELETE) to a newly added field - Supports both short format and full format output **Why is this plugin needed?** In CDC data synchronization scenarios, data rows carry RowKind markers (+I, -U, +U, -D) representing different change types. However, some downstream systems (such as data lakes, analytical systems) only support Append-Only mode and do not support UPDATE and DELETE operations. In such cases, you need to: 1. Convert all data to INSERT type (Append-Only) 2. 
Save the original change type as a regular field for subsequent analysis **Transformation Example:** ``` Input (CDC data): RowKind: -D (DELETE) Data: id=1, name="test1", age=20 Output (Append-Only data): RowKind: +I (INSERT) Data: id=1, name="test1", age=20, row_kind="DELETE" ``` **Typical Use Cases:** - Writing CDC data to data lakes that only support Append mode - Preserving complete change history in data warehouses - Performing statistical analysis on different types of changes ## Options | name | type | required | default value | description | |-------------------|--------|----------|---------------|-------------| | custom_field_name | string | no | row_kind | The name of the new field used to store the original RowKind information | | transform_type | enum | no | SHORT | The output format of RowKind, options: SHORT (short format) or FULL (full format) | ### custom_field_name [string] Specifies the name of the new field that will store the original RowKind information. **Default value:** `row_kind` **Notes:** - The field name cannot duplicate existing field names, otherwise an error will be thrown - It's recommended to use meaningful names, such as `operation_type`, `change_type`, `cdc_op`, etc. **Example:** ```hocon custom_field_name = "operation_type" # Use custom field name ``` ### transform_type [enum] Specifies the output format of the RowKind field value. 
**Available options:** | Format | Description | Output Values | |--------|-------------|---------------| | SHORT | Short format (symbol representation) | `+I`, `-U`, `+U`, `-D` | | FULL | Full format (English names) | `INSERT`, `UPDATE_BEFORE`, `UPDATE_AFTER`, `DELETE` | **Default value:** `SHORT` **Meaning of each value:** | RowKind Type | SHORT Format | FULL Format | Description | |--------------|--------------|-------------|-------------| | INSERT | +I | INSERT | Insert operation | | UPDATE_BEFORE | -U | UPDATE_BEFORE | Value before update | | UPDATE_AFTER | +U | UPDATE_AFTER | Value after update | | DELETE | -D | DELETE | Delete operation | **Selection Recommendations:** - **SHORT format**: Saves storage space, suitable for storage-sensitive scenarios - **FULL format**: Better readability, suitable for scenarios requiring manual review or analysis **Example:** ```hocon transform_type = FULL # Use full format ``` ## Complete Examples ### Example 1: Using Default Configuration (SHORT Format) Using default configuration to convert CDC data to Append-Only mode, with RowKind saved in short format. ```yaml env { parallelism = 1 job.mode = "STREAMING" } source { MySQL-CDC { plugin_output = "cdc_source" server-id = 5652 username = "root" password = "your_password" table-names = ["mydb.users"] url = "jdbc:mysql://localhost:3306/mydb" } } transform { RowKindExtractor { plugin_input = "cdc_source" plugin_output = "append_only_data" # Using default configuration: # custom_field_name = "row_kind" # transform_type = SHORT } } sink { Console { plugin_input = "append_only_data" } } ``` **Data Transformation Process:** ``` Input data (CDC format): 1. RowKind=+I, id=1, name="John", age=25 2. RowKind=-U, id=1, name="John", age=25 3. RowKind=+U, id=1, name="John", age=26 4. RowKind=-D, id=1, name="John", age=26 Output data (Append-Only format): 1. RowKind=+I, id=1, name="John", age=25, row_kind="+I" 2. RowKind=+I, id=1, name="John", age=25, row_kind="-U" 3. 
RowKind=+I, id=1, name="John", age=26, row_kind="+U" 4. RowKind=+I, id=1, name="John", age=26, row_kind="-D" ``` --- ### Example 2: Using FULL Format with Custom Field Name Using full format to output RowKind with a custom field name. ```yaml env { parallelism = 1 job.mode = "STREAMING" } source { MySQL-CDC { plugin_output = "cdc_source" server-id = 5652 username = "root" password = "your_password" table-names = ["mydb.orders"] url = "jdbc:mysql://localhost:3306/mydb" } } transform { RowKindExtractor { plugin_input = "cdc_source" plugin_output = "append_only_data" custom_field_name = "operation_type" # Custom field name transform_type = FULL # Use full format } } sink { Iceberg { plugin_input = "append_only_data" catalog_name = "iceberg_catalog" database = "mydb" table = "orders_history" # Iceberg table will contain operation_type field, recording the change type of each data row } } ``` **Data Transformation Process:** ``` Input data (CDC format): 1. RowKind=+I, order_id=1001, amount=100.00 2. RowKind=-U, order_id=1001, amount=100.00 3. RowKind=+U, order_id=1001, amount=150.00 4. RowKind=-D, order_id=1001, amount=150.00 Output data (Append-Only format, FULL format): 1. RowKind=+I, order_id=1001, amount=100.00, operation_type="INSERT" 2. RowKind=+I, order_id=1001, amount=100.00, operation_type="UPDATE_BEFORE" 3. RowKind=+I, order_id=1001, amount=150.00, operation_type="UPDATE_AFTER" 4. RowKind=+I, order_id=1001, amount=150.00, operation_type="DELETE" ``` --- ### Example 3: Complete Test Example (Using FakeSource) Using FakeSource to generate test data, demonstrating the transformation effects of various RowKinds. 
```yaml env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { plugin_output = "fake_cdc_data" schema = { fields { pk_id = bigint name = string score = int } primaryKey { name = "pk_id" columnNames = [pk_id] } } rows = [ { kind = INSERT fields = [1, "A", 100] }, { kind = INSERT fields = [2, "B", 100] }, { kind = UPDATE_BEFORE fields = [1, "A", 100] }, { kind = UPDATE_AFTER fields = [1, "A_updated", 95] }, { kind = UPDATE_BEFORE fields = [2, "B", 100] }, { kind = UPDATE_AFTER fields = [2, "B_updated", 98] }, { kind = DELETE fields = [1, "A_updated", 95] } ] } } transform { RowKindExtractor { plugin_input = "fake_cdc_data" plugin_output = "transformed_data" custom_field_name = "change_type" transform_type = FULL } } sink { Console { plugin_input = "transformed_data" } } ``` **Expected Output:** ``` +I, pk_id=1, name="A", score=100, change_type="INSERT" +I, pk_id=2, name="B", score=100, change_type="INSERT" +I, pk_id=1, name="A", score=100, change_type="UPDATE_BEFORE" +I, pk_id=1, name="A_updated", score=95, change_type="UPDATE_AFTER" +I, pk_id=2, name="B", score=100, change_type="UPDATE_BEFORE" +I, pk_id=2, name="B_updated", score=98, change_type="UPDATE_AFTER" +I, pk_id=1, name="A_updated", score=95, change_type="DELETE" ``` ================================================ FILE: docs/en/transforms/split.md ================================================ # Split > Split transform plugin ## Description Split a field into multiple fields. ## Options | name | type | required | default value | |---------------|--------|----------|---------------| | separator | string | yes | | | split_field | string | yes | | | output_fields | array | yes | | ### separator [string] The separator used to split the value of `split_field`. 
### split_field [string] The field to be split ### output_fields [array] The result fields after split ### common options [string] Transform plugin common parameters, please refer to [Transform Plugin](common-options/common-options.md) for details ## Example The data read from source is a table like this: | name | age | card | |----------|-----|------| | Joy Ding | 20 | 123 | | May Ding | 20 | 123 | | Kin Dom | 20 | 123 | | Joy Dom | 20 | 123 | We want to split the `name` field into `first_name` and `second_name`, so we can add a `Split` transform like this: ``` transform { Split { plugin_input = "fake" plugin_output = "fake1" separator = " " split_field = "name" output_fields = [first_name, second_name] } } ``` Then the data in result table `fake1` will look like this: | name | age | card | first_name | second_name | |----------|-----|------|------------|-------------| | Joy Ding | 20 | 123 | Joy | Ding | | May Ding | 20 | 123 | May | Ding | | Kin Dom | 20 | 123 | Kin | Dom | | Joy Dom | 20 | 123 | Joy | Dom | ## Changelog ### new version - Add Split Transform Connector ================================================ FILE: docs/en/transforms/sql-functions.md ================================================ # SQL Functions > The Functions of SQL transform plugin ## String Functions ### ASCII ```ASCII(string) -> INT``` Returns the ```ASCII``` value of the first character in the string. Example: ASCII('Hi') ### BIT_LENGTH ```BIT_LENGTH(bytes) -> LONG``` Returns the number of bits in a binary string. Example: BIT_LENGTH(NAME) ### CHAR_LENGTH / LENGTH ```CHAR_LENGTH | LENGTH(string) -> LONG``` Returns the number of characters in a character string. Example: CHAR_LENGTH(NAME) ### OCTET_LENGTH ```OCTET_LENGTH(bytes) -> LONG``` Returns the number of bytes in a binary string. Example: OCTET_LENGTH(NAME) ### CHAR / CHR ```CHAR | CHR (int) -> STRING``` Returns the character that represents the ASCII value. 
Example: CHAR(65) ### CONCAT ```CONCAT(string, string[, string...]) -> STRING``` Combines strings. Unlike with the operator ```||```, **NULL** parameters are ignored, and do not cause the result to become **NULL**. If all parameters are NULL the result is an empty string. Example: CONCAT(NAME, '_') ### CONCAT_WS ```CONCAT_WS(separatorString, string, string[, string...]) -> STRING``` Combines strings with separator. If separator is **NULL** it is treated like an empty string. Other **NULL** parameters are ignored. Remaining **non-NULL** parameters, if any, are concatenated with the specified separator. If there are no remaining parameters the result is an empty string. Example: CONCAT_WS(',', NAME, '_') ### HEXTORAW ```HEXTORAW(string) -> STRING``` Converts a hex representation of a string to a string. 4 hex characters per string character are used. Example: HEXTORAW(DATA) ### RAWTOHEX ```RAWTOHEX(string | bytes) -> STRING``` Converts a string or bytes to the hex representation. 4 hex characters per string character are used. Example: RAWTOHEX(DATA) ### INSERT ```INSERT(originalString, startInt, lengthInt, addString) -> STRING``` Inserts an additional string into the original string at a specified start position. The length specifies the number of characters that are removed at the start position in the original string. Example: INSERT(NAME, 1, 1, ' ') ### LOWER / LCASE ```LOWER | LCASE(string) -> STRING``` Converts a string to lowercase. Example: LOWER(NAME) ### UPPER / UCASE ```UPPER | UCASE(string) -> STRING``` Converts a string to uppercase. Example: UPPER(NAME) ### LEFT ```LEFT(string, int) -> STRING``` Returns the leftmost number of characters. Example: LEFT(NAME, 3) ### RIGHT ```RIGHT(string, int) -> STRING``` Returns the rightmost number of characters. 
Example: RIGHT(NAME, 3) ### LOCATE / INSTR / POSITION ```LOCATE(searchString, string[, startInt]) -> INT``` ```INSTR(string, searchString[, startInt]) -> INT``` ```POSITION(searchString, string) -> INT``` Returns the location of a search string in a string. If a start position is used, the characters before it are ignored. If position is negative, the rightmost location is returned. 0 is returned if the search string is not found. Please note this function is case sensitive, even if the parameters are not. Example: LOCATE('.', NAME) ### LPAD ```LPAD(string, int[, string]) -> STRING``` Left pad the string to the specified length. If the length is shorter than the string, it will be truncated at the end. If the padding string is not set, spaces will be used. Example: LPAD(AMOUNT, 10, '*') ### RPAD ```RPAD(string, int[, string]) -> STRING``` Right pad the string to the specified length. If the length is shorter than the string, it will be truncated. If the padding string is not set, spaces will be used. Example: RPAD(TEXT, 10, '-') ### LTRIM ```LTRIM(string[, characterToTrimString]) -> STRING``` Removes all leading spaces or other specified characters from a string. Example: LTRIM(NAME) ### RTRIM ```RTRIM(string[, characterToTrimString]) -> STRING``` Removes all trailing spaces or other specified characters from a string. Example: RTRIM(NAME) ### TRIM ```TRIM(string[, characterToTrimString]) -> STRING``` Removes all leading spaces and trailing spaces or other specified characters from a string. Example: TRIM(NAME) ### REGEXP_REPLACE ```REGEXP_REPLACE(inputString, regexString, replacementString[, flagsString]) -> STRING``` Replaces each substring that matches a regular expression. For details, see the Java String.replaceAll() method. If any parameter is null (except optional flagsString parameter), the result is null. Flags values are limited to 'i', 'c', 'n', 'm'. Other symbols cause exception. Multiple symbols could be used in one flagsString parameter (like 'im'). 
Later flags override first ones, for example 'ic' is equivalent to case sensitive matching 'c'. 'i' enables case insensitive matching (Pattern.CASE_INSENSITIVE) 'c' disables case insensitive matching (Pattern.CASE_INSENSITIVE) 'n' allows the period to match the newline character (Pattern.DOTALL) 'm' enables multiline mode (Pattern.MULTILINE) Example: REGEXP_REPLACE('Hello World', ' +', ' ') REGEXP_REPLACE('Hello WWWWorld', 'w+', 'W', 'i') ### REGEXP_LIKE ```REGEXP_LIKE(inputString, regexString[, flagsString]) -> BOOLEAN``` Matches string to a regular expression. For details, see the Java Matcher.find() method. If any parameter is null (except optional flagsString parameter), the result is null. Flags values are limited to 'i', 'c', 'n', 'm'. Other symbols cause exception. Multiple symbols could be used in one flagsString parameter (like 'im'). Later flags override first ones, for example 'ic' is equivalent to case sensitive matching 'c'. 'i' enables case insensitive matching (Pattern.CASE_INSENSITIVE) 'c' disables case insensitive matching (Pattern.CASE_INSENSITIVE) 'n' allows the period to match the newline character (Pattern.DOTALL) 'm' enables multiline mode (Pattern.MULTILINE) Example: REGEXP_LIKE('Hello World', '[A-Z ]*', 'i') ### REGEXP_SUBSTR ```REGEXP_SUBSTR(inputString, regexString[, positionInt, occurrenceInt, flagsString, groupInt]) -> STRING``` Matches string to a regular expression and returns the matched substring. For details, see the java.util.regex.Pattern and related functionality. The parameter position specifies where in inputString the match should start. Occurrence indicates which occurrence of pattern in inputString to search for. Flags values are limited to 'i', 'c', 'n', 'm'. Other symbols cause exception. Multiple symbols could be used in one flagsString parameter (like 'im'). Later flags override first ones, for example 'ic' is equivalent to case sensitive matching 'c'. 
'i' enables case insensitive matching (Pattern.CASE_INSENSITIVE) 'c' disables case insensitive matching (Pattern.CASE_INSENSITIVE) 'n' allows the period to match the newline character (Pattern.DOTALL) 'm' enables multiline mode (Pattern.MULTILINE) If the pattern has groups, the group parameter can be used to specify which group to return. Example: REGEXP_SUBSTR('2020-10-01', '\d{4}') REGEXP_SUBSTR('2020-10-01', '(\d{4})-(\d{2})-(\d{2})', 1, 1, NULL, 2) ### REPEAT ```REPEAT(string, int) -> STRING``` Returns a string repeated some number of times. Example: REPEAT(NAME || ' ', 10) ### REPLACE ```REPLACE(string, searchString[, replacementString]) -> STRING``` Replaces all occurrences of a search string in a text with another string. If no replacement is specified, the search string is removed from the original string. If any parameter is null, the result is null. Example: REPLACE(NAME, ' ') ### SPLIT ```SPLIT(string, delimiterString) -> ARRAY``` Split a string into an array. Example: select SPLIT(test,';') as arrays ### MURMUR64 ```MURMUR64(string) -> LONG``` Calculate MurmurHash 128 for the input string and return the lower 64 bits as a long value. MurmurHash is a non-cryptographic hash function suitable for general hash-based lookup. This method returns a long value, or null if the input parameter is null. Example: MURMUR64('hello world') MURMUR64(NAME) ### SOUNDEX ```SOUNDEX(string) -> STRING``` Returns a four character code representing the sound of a string. This method returns a string, or null if parameter is null. See https://en.wikipedia.org/wiki/Soundex for more information. Example: SOUNDEX(NAME) ### SPACE ```SPACE(int) -> STRING``` Returns a string consisting of a number of spaces. Example: SPACE(80) ### SUBSTRING / SUBSTR ```SUBSTRING | SUBSTR(string, startInt[, lengthInt ]) -> STRING``` Returns a substring of a string starting at a position. If the start index is negative, then the start index is relative to the end of the string. The length is optional. 
Example: CALL SUBSTRING('[Hello]', 2); CALL SUBSTRING('hour', 3, 2); ### TO_CHAR ```TO_CHAR(value[, formatString]) -> STRING``` Oracle-compatible TO_CHAR function that can format a timestamp, a number, or text. Example: CALL TO_CHAR(SYS_TIME, 'yyyy-MM-dd HH:mm:ss') ### TRANSLATE ```TRANSLATE(value, searchString, replacementString) -> STRING``` Oracle-compatible TRANSLATE function that replaces a sequence of characters in a string with another set of characters. Example: CALL TRANSLATE('Hello world', 'eo', 'EO') ## Numeric Functions ### ABS ```ABS(numeric) -> NUMERIC (same type)``` Returns the absolute value of a specified value. The returned value is of the same data type as the parameter. Note that TINYINT, SMALLINT, INT, and BIGINT data types cannot represent absolute values of their minimum negative values, because they have more negative values than positive. For example, for INT data type allowed values are from -2147483648 to 2147483647. ABS(-2147483648) should be 2147483648, but this value is not allowed for this data type. It leads to an exception. To avoid it cast argument of this function to a higher data type. Example: ABS(I) ### ACOS ```ACOS(numeric) -> DOUBLE``` Calculate the arc cosine. See also Java Math.acos. Example: ACOS(D) ### ARRAY_MAX ```ARRAY_MAX(ARRAY) -> type(array element)``` The MAX function returns the maximum value of the expression. Example: ARRAY_MAX(I) ### ARRAY_MIN ```ARRAY_MIN(ARRAY) -> type(array element)``` The MIN function returns the minimum value of the expression. Example: ARRAY_MIN(I) ### ASIN ```ASIN(numeric) -> DOUBLE``` Calculate the arc sine. See also Java Math.asin. Example: ASIN(D) ### ATAN ```ATAN(numeric) -> DOUBLE``` Calculate the arc tangent. See also Java Math.atan. Example: ATAN(D) ### COS ```COS(numeric) -> DOUBLE``` Calculate the trigonometric cosine. See also Java Math.cos. Example: COS(ANGLE) ### COSH ```COSH(numeric) -> DOUBLE``` Calculate the hyperbolic cosine. See also Java Math.cosh. 
Example: COSH(X) ### COT ```COT(numeric) -> DOUBLE``` Calculate the trigonometric cotangent (1/TAN(ANGLE)). See also Java Math.* functions. Example: COT(ANGLE) ### SIN ```SIN(numeric) -> DOUBLE``` Calculate the trigonometric sine. See also Java Math.sin. Example: SIN(ANGLE) ### SINH ```SINH(numeric) -> DOUBLE``` Calculate the hyperbolic sine. See also Java Math.sinh. Example: SINH(ANGLE) ### TAN ```TAN(numeric) -> DOUBLE``` Calculate the trigonometric tangent. See also Java Math.tan. Example: TAN(ANGLE) ### TANH ```TANH(numeric) -> DOUBLE``` Calculate the hyperbolic tangent. See also Java Math.tanh. Example: TANH(X) ### MOD ```MOD(dividendNumeric, divisorNumeric ) -> type(divisorNumeric)``` The modulus expression. Result is NULL if either of arguments is NULL. If divisor is 0, an exception is raised. Result has the same sign as dividend or is equal to 0. Usually arguments should have scale 0, but it isn't required by H2. Example: MOD(A, B) ### CEIL / CEILING ```CEIL | CEILING (numeric) -> NUMERIC (same type, scale 0)``` Returns the smallest integer value that is greater than or equal to the argument. This method returns value of the same type as argument, but with scale set to 0 and adjusted precision, if applicable. Example: CEIL(A) ### EXP ```EXP(numeric) -> DOUBLE``` See also Java Math.exp. Example: EXP(A) ### FLOOR ```FLOOR(numeric) -> NUMERIC (same type, scale 0)``` Returns the largest integer value that is less than or equal to the argument. This method returns value of the same type as argument, but with scale set to 0 and adjusted precision, if applicable. Example: FLOOR(A) ### LN ```LN(numeric) -> DOUBLE``` Calculates the natural (base e) logarithm. Argument must be a positive numeric value. Example: LN(A) ### LOG ```LOG(baseNumeric, numeric) -> DOUBLE``` Calculates the logarithm with specified base. Argument and base must be positive numeric values. Base cannot be equal to 1. 
The default base is e (natural logarithm); in the PostgreSQL mode the default base is base 10. In MSSQLServer mode the optional base is specified after the argument. Single-argument variant of LOG function is deprecated, use LN or LOG10 instead. Example: LOG(2, A) ### LOG10 ```LOG10(numeric) -> DOUBLE``` Calculates the base 10 logarithm. Argument must be a positive numeric value. Example: LOG10(A) ### RADIANS ```RADIANS(numeric) -> DOUBLE``` See also Java Math.toRadians. Example: RADIANS(A) ### SQRT ```SQRT(numeric) -> DOUBLE``` See also Java Math.sqrt. Example: SQRT(A) ### PI ```PI() -> DOUBLE``` See also Java Math.PI. Example: PI() ### POWER ```POWER(numeric, numeric) -> DOUBLE``` See also Java Math.pow. Example: POWER(A, B) ### RAND / RANDOM ```RAND | RANDOM([ int ]) -> DOUBLE``` Calling the function without a parameter returns the next pseudo-random number. Calling it with a parameter seeds the session's random number generator. This method returns a double between 0 (including) and 1 (excluding). Example: RAND() ### ROUND ```ROUND(numeric[, digitsInt]) -> NUMERIC (same type)``` Rounds to a number of fractional digits. This method returns value of the same type as argument, but with adjusted precision and scale, if applicable. Example: ROUND(N, 2) ### SIGN ```SIGN(numeric) -> INT``` Returns -1 if the value is smaller than 0, 0 if zero or NaN, and otherwise 1. Example: SIGN(N) ### TRUNC ```TRUNC | TRUNCATE(numeric[, digitsInt]) -> NUMERIC (same type)``` When a numeric argument is specified, truncates it to a number of digits (to the next value closer to 0) and returns value of the same type as argument, but with adjusted precision and scale, if applicable. Example: TRUNC(N, 2) ### TRIM_SCALE ```TRIM_SCALE(numeric) -> NUMERIC (same type)``` Reduce the scale of a number by removing trailing zeroes. The scale is adjusted accordingly. Example: TRIM_SCALE(N) ## Time and Date Functions ### CURRENT_DATE ```CURRENT_DATE [()] -> DATE``` Returns the current date.
These functions return the same value within a transaction (default) or within a command depending on database mode. Example: CURRENT_DATE ### CURRENT_TIME ```CURRENT_TIME [()] -> TIME``` Returns the current time with system time zone. The actual maximum available precision depends on operating system and JVM and can be 3 (milliseconds) or higher. Higher precision is not available before Java 9. Example: CURRENT_TIME ### CURRENT_TIMESTAMP / NOW ```CURRENT_TIMESTAMP[()] | NOW() -> TIMESTAMP``` Returns the current timestamp with system time zone. The actual maximum available precision depends on operating system and JVM and can be 3 (milliseconds) or higher. Higher precision is not available before Java 9. Example: CURRENT_TIMESTAMP ### DATEADD / TIMESTAMPADD ```DATEADD | TIMESTAMPADD(dateAndTime, addIntLong, datetimeFieldString) -> type(dateAndTime)``` Adds units to a date-time value. The datetimeFieldString indicates the unit. Use negative values to subtract units. addIntLong may be a long value when manipulating milliseconds, microseconds, or nanoseconds otherwise its range is restricted to int. This method returns a value with the same type as specified value if unit is compatible with this value. If specified field is a HOUR, MINUTE, SECOND, MILLISECOND, etc and value is a DATE value DATEADD returns combined TIMESTAMP. Fields DAY, MONTH, YEAR, WEEK, etc are not allowed for TIME values. Example: DATEADD(CREATED, 1, 'MONTH') ### DATEDIFF ```DATEDIFF(aDateAndTime, bDateAndTime, datetimeFieldString) -> LONG``` Returns the number of crossed unit boundaries between two date-time values. The datetimeField indicates the unit. Example: DATEDIFF(T1.CREATED, T2.CREATED, 'MONTH') ### DATE_TRUNC ```DATE_TRUNC(dateAndTime, datetimeFieldString) -> dateAndTime (same type)``` Truncates the specified date-time value to the specified field. Example: DATE_TRUNC(CREATED, 'DAY') ### DAYNAME ```DAYNAME(dateAndTime) -> STRING``` Returns the name of the day (in English). 
Example: DAYNAME(CREATED) ### DAY_OF_MONTH ```DAY_OF_MONTH(dateAndTime) -> INT``` Returns the day of the month (1-31). Example: DAY_OF_MONTH(CREATED) ### DAY_OF_WEEK ```DAY_OF_WEEK(dateAndTime) -> INT``` Returns the day of the week (1-7) (Monday-Sunday), locale-specific. Example: DAY_OF_WEEK(CREATED) ### DAY_OF_YEAR ```DAY_OF_YEAR(dateAndTime) -> INT``` Returns the day of the year (1-366). Example: DAY_OF_YEAR(CREATED) ### EXTRACT ```EXTRACT(datetimeField FROM dateAndTime) -> INT | NUMERIC``` Returns a value of the specific time unit from a date/time value. This method returns a numeric value with EPOCH field and an int for all other fields. The following are valid field names for EXTRACT: - `CENTURY`: The century; for interval values, the year field divided by 100 - `DAY`: The day of the month (1-31); for interval values, the number of days - `DECADE`: The year field divided by 10 - `DOW` or `DAYOFWEEK`: The day of the week as Sunday (0) to Saturday (6) - `DOY`: The day of the year (1-365/366) - `EPOCH`: For timestamp values, the number of seconds since 1970-01-01 00:00:00; for interval values, the total number of seconds - `HOUR`: The hour field (0-23) - `ISODOW`: The day of the week as Monday (1) to Sunday (7), matching ISO 8601 - `ISOYEAR`: The ISO 8601 week-numbering year - `MICROSECONDS`: The seconds field, including fractional parts, multiplied by 1,000,000 - `MILLENNIUM`: The millennium; for interval values, the year field divided by 1000 - `MILLISECONDS`: The seconds field, including fractional parts, multiplied by 1,000 - `MINUTE`: The minutes field (0-59) - `MONTH`: The number of the month within the year (1-12); for interval values, the number of months modulo 12 (0-11) - `QUARTER`: The quarter of the year (1-4) that the date is in - `SECOND`: The seconds field, including any fractional seconds - `WEEK`: The number of the ISO 8601 week-numbering week of the year (1-53) - `YEAR`: The year field The EXTRACT function supports all four DateTime literal 
types: - `DATE`: For extracting date components from a date literal ```sql EXTRACT(YEAR FROM DATE '2025-05-21') ``` - `TIME`: For extracting time components from a time literal ```sql EXTRACT(HOUR FROM TIME '17:57:40') ``` - `TIMESTAMP`: For extracting date and time components from a timestamp literal ```sql EXTRACT(YEAR FROM TIMESTAMP '2025-05-21T17:57:40') ``` - `TIMESTAMP WITH TIMEZONE`: For extracting components from a timestamp with timezone literal ```sql EXTRACT(HOUR FROM TIMESTAMPTZ '2025-05-21T17:57:40+08:00') ``` Examples: ```sql EXTRACT(YEAR FROM TIMESTAMP '2001-02-16 20:38:40') EXTRACT(HOUR FROM TIMESTAMP '2001-02-16 20:38:40') EXTRACT(DOW FROM TIMESTAMP '2001-02-16 20:38:40') EXTRACT(YEAR FROM eventTime) EXTRACT(HOUR FROM eventTime) EXTRACT(DOW FROM eventTime) ``` ### FORMATDATETIME ```FORMATDATETIME(dateAndTime, formatString) -> STRING``` Formats a date, time or timestamp as a string. The most important format characters are: y year, M month, d day, H hour, m minute, s second. For details of the format, see java.time.format.DateTimeFormatter. Example: CALL FORMATDATETIME(CREATED, 'yyyy-MM-dd HH:mm:ss') ### HOUR ```HOUR(dateAndTime) -> INT``` Returns the hour (0-23) from a date/time value. Example: HOUR(CREATED) ### MINUTE ```MINUTE(dateAndTime) -> INT``` Returns the minute (0-59) from a date/time value. This function is deprecated, use EXTRACT instead of it. Example: MINUTE(CREATED) ### MONTH ```MONTH(dateAndTime) -> INT``` Returns the month (1-12) from a date/time value. This function is deprecated, use EXTRACT instead of it. Example: MONTH(CREATED) ### MONTHNAME ```MONTHNAME(dateAndTime) -> STRING``` Returns the name of the month (in English). Example: MONTHNAME(CREATED) ### IS_DATE ```IS_DATE(string, formatString) -> BOOLEAN``` Validates whether a string can be parsed as a date/time value using the specified format pattern. 
**Supported Format Patterns:** DateTime Formats: - `yyyy-MM-dd HH:mm:ss` - Standard datetime format - `yyyy-MM-dd HH:mm:ss.SSS` - Datetime with milliseconds - `yyyy-MM-dd'T'HH:mm:ss` - ISO 8601 datetime format - `yyyy-MM-dd'T'HH:mm:ss.SSS` - ISO 8601 datetime with milliseconds - `yyyy/MM/dd HH:mm:ss` - Datetime with slash separator - `yyyy/MM/dd HH:mm:ss.SSS` - Datetime with slash separator and milliseconds - `yyyyMMddHHmmss` - Compact datetime format Date Formats: - `yyyy-MM-dd` - ISO 8601 date format - `yyyy/MM/dd` - Date with slash separator - `yyyyMMdd` - Compact date format Time Formats: - `HH:mm:ss` - Standard time format - `HH:mm:ss.SSS` - Time with milliseconds - `HHmmss` - Compact time format Example: ```sql CALL IS_DATE('2021-04-08 13:34:45', 'yyyy-MM-dd HH:mm:ss') -- Returns true CALL IS_DATE('2021/04/08', 'yyyy/MM/dd') -- Returns true CALL IS_DATE('20210408', 'yyyyMMdd') -- Returns true -- Consistent with TO_DATE SELECT CASE WHEN IS_DATE(date_string, 'yyyy-MM-dd HH:mm:ss') THEN TO_DATE(date_string, 'yyyy-MM-dd HH:mm:ss') ELSE NULL END as parsed_date ``` ### PARSEDATETIME / TO_DATE ```PARSEDATETIME | TO_DATE(string, formatString) -> TIMESTAMP | DATE | TIME``` Parses a string into a date/time value using the specified format pattern. 
**Supported Format Patterns:** DateTime Formats (returns TIMESTAMP): - `yyyy-MM-dd HH:mm:ss` - Standard datetime format - `yyyy-MM-dd HH:mm:ss.SSS` - Datetime with milliseconds - `yyyy-MM-dd'T'HH:mm:ss` - ISO 8601 datetime format - `yyyy-MM-dd'T'HH:mm:ss.SSS` - ISO 8601 datetime with milliseconds - `yyyy/MM/dd HH:mm:ss` - Datetime with slash separator - `yyyy/MM/dd HH:mm:ss.SSS` - Datetime with slash separator and milliseconds - `yyyyMMddHHmmss` - Compact datetime format Date Formats (returns DATE): - `yyyy-MM-dd` - ISO 8601 date format - `yyyy/MM/dd` - Date with slash separator - `yyyyMMdd` - Compact date format Time Formats (returns TIME): - `HH:mm:ss` - Standard time format - `HH:mm:ss.SSS` - Time with milliseconds - `HHmmss` - Compact time format **Note:** When using single quotes (`'`) in format patterns (e.g., for ISO 8601 'T' separator), they must be escaped as `''` in SQL. Examples: ```sql -- DateTime examples CALL PARSEDATETIME('2021-04-08 13:34:45', 'yyyy-MM-dd HH:mm:ss') CALL TO_DATE('2021-04-08T13:34:45', 'yyyy-MM-dd''T''HH:mm:ss') CALL PARSEDATETIME('2024-06-15 14:30:45.123', 'yyyy-MM-dd HH:mm:ss.SSS') CALL PARSEDATETIME('2021/04/08 13:34:45', 'yyyy/MM/dd HH:mm:ss') CALL PARSEDATETIME('20210408133445', 'yyyyMMddHHmmss') -- Date examples CALL TO_DATE('2021-04-08', 'yyyy-MM-dd') CALL TO_DATE('2021/04/08', 'yyyy/MM/dd') CALL TO_DATE('20210408', 'yyyyMMdd') -- Time examples CALL PARSEDATETIME('14:30:45', 'HH:mm:ss') CALL PARSEDATETIME('14:30:45.123', 'HH:mm:ss.SSS') CALL PARSEDATETIME('143045', 'HHmmss') ``` ### QUARTER ```QUARTER(dateAndTime) -> INT``` Returns the quarter (1-4) from a date/time value. Example: QUARTER(CREATED) ### SECOND ```SECOND(dateAndTime) -> INT``` Returns the second (0-59) from a date/time value. This function is deprecated, use EXTRACT instead of it. Example: SECOND(CREATED) ### WEEK ```WEEK(dateAndTime) -> INT``` Returns the week (1-53) from a date/time value. This function uses the current system locale. 
Example: WEEK(CREATED) ### YEAR ```YEAR(dateAndTime) -> INT``` Returns the year from a date/time value. Example: YEAR(CREATED) ### FROM_UNIXTIME ```FROM_UNIXTIME(unixtime, formatString, timeZone) -> STRING``` Convert the number of seconds from the UNIX epoch (1970-01-01 00:00:00 UTC) to a string representing the timestamp of that moment. The most important format characters are: y year, M month, d day, H hour, m minute, s second. For details of the format, see `java.time.format.DateTimeFormatter`. `timeZone` is optional, default value is system's time zone. `timezone` value can be a `UTC+ timezone offset`, for example, `UTC+8` represents the Asia/Shanghai time zone, see https://en.wikipedia.org/wiki/List_of_tz_database_time_zones . Example: // use default zone CALL FROM_UNIXTIME(1672502400, 'yyyy-MM-dd HH:mm:ss') or // use given zone CALL FROM_UNIXTIME(1672502400, 'yyyy-MM-dd HH:mm:ss','UTC+6') ### AT TIME ZONE ```dateAndTime AT TIME ZONE 'timeZone' -> TIMESTAMP_TZ``` Convert a timestamp value to a TIMESTAMP WITH TIME ZONE value in the specified time zone. `timeZone` value can be a `UTC+ timezone offset`, for example, `+08:00` represents the Asia/Shanghai time zone, see https://en.wikipedia.org/wiki/List_of_tz_database_time_zones . Example: local_date_time AT TIME ZONE '+09:00' offset_date_time AT TIME ZONE 'Pacific/Honolulu' ## System Functions ### CAST ```CAST(value as dataType) -> dataType``` Converts a value to another data type. Supported data types: STRING | VARCHAR, TINYINT, SMALLINT, INT | INTEGER, LONG | BIGINT, BYTE, FLOAT, DOUBLE, DECIMAL(p,s), TIMESTAMP, DATE, TIME, BYTES, BOOLEAN Example: * CAST(NAME AS INT) * CAST(FLAG AS BOOLEAN) NOTE: Converts a value to a BOOLEAN data type according to the following rules: 1. If the value can be interpreted as a boolean string (`'true'` or `'false'`), it returns the corresponding boolean value. 2. If the value can be interpreted as a numeric value (`1` or `0`), it returns `true` for `1` and `false` for `0`. 3. 
If the value cannot be interpreted according to the above rules, it throws a `TransformException`. ### TRY_CAST ```TRY_CAST(value as dataType) -> dataType | NULL``` This function is similar to CAST, but when the conversion fails, it returns NULL instead of throwing an exception. Supported data types: STRING | VARCHAR, TINYINT, SMALLINT, INT | INTEGER, LONG | BIGINT, BYTE, FLOAT, DOUBLE, DECIMAL(p,s), TIMESTAMP, DATE, TIME, BYTES Example: TRY_CAST(NAME AS INT) ### COALESCE ```COALESCE(aValue, bValue [,...]) -> type(of first non-null arg)``` Returns the first value that is not null. If subsequent arguments have different data types from the first argument, they will be automatically converted to the type of the first argument. Example: COALESCE(A, B, C) Example with type conversion: ``` -- If A is a string field and B is an integer field -- B will be converted to string when A is null SELECT COALESCE(A, B) as result FROM my_table ``` ### IFNULL ```IFNULL(aValue, bValue) -> type(common of args)``` Returns the first value that is not null. If subsequent arguments have different data types from the first argument, they will be automatically converted to the type of the first argument. Example: IFNULL(A, B) ### NULLIF ```NULLIF(aValue, bValue) -> type(aValue) | NULL``` Returns NULL if 'a' is equal to 'b', otherwise 'a'. Example: NULLIF(A, B) ### MULTI_IF ```MULTI_IF(condition1, value1, condition2, value2,... conditionN, valueN, bValue) -> type(of values)``` returns the first value for which the corresponding condition is true. If all conditions are false, it returns the last value. Example: MULTI_IF(A > 1, 'A', B > 1, 'B', C > 1, 'C', 'D') ### CASE WHEN ```CASE WHEN THEN [WHEN...] [ELSE ] END -> type(of result expressions)``` Returns different values based on conditions. 
``` select case when c_string in ('c_string') then 1 else 0 end as c_string_1, case when c_string not in ('c_string') then 1 else 0 end as c_string_0, case when c_tinyint = 117 and TO_CHAR(c_boolean) = 'true' then 1 else 0 end as c_tinyint_boolean_1, case when c_tinyint != 117 and TO_CHAR(c_boolean) = 'true' then 1 else 0 end as c_tinyint_boolean_0, case when c_tinyint != 117 or TO_CHAR(c_boolean) = 'true' then 1 else 0 end as c_tinyint_boolean_or_1, case when c_int > 1 and c_bigint > 1 and c_float > 1 and c_double > 1 and c_decimal > 1 then 1 else 0 end as c_number_1, case when c_tinyint <> 117 then 1 else 0 end as c_number_0, case when c_boolean then 1 else 0 end as c_boolean_0 from dual ``` It is used to determine whether the condition is valid and return different values according to different judgments Example: case when c_string in ('c_string') then 1 else 0 end case when c_string in ('c_string') then true else false end ### UUID ```UUID() -> STRING``` Generate a uuid through java function. Example: select UUID() as seatunnel_uuid ### ARRAY ```ARRAY array(T, ...) -> ARRAY``` Create an array consisting of variadic elements and return it. Here, T can be either “column” or “literal”. Example: select ARRAY(1,2,3) as arrays select ARRAY('c_1',2,3.12) as arrays select ARRAY(column1,column2,column3) as arrays notes: Currently only string, double, long, int types are supported ### LATERAL VIEW #### EXPLODE ```EXPLODE(array of T) -> rows(value: T)``` ```OUTER EXPLODE(array of T) -> rows(value: T | NULL)``` Used to flatten array columns into multiple rows. It applies the EXPLODE function to an array and generates a new row for each element. EXPLODE: Converts an array column into multiple rows. No rows generated if array is NULL or empty. OUTER EXPLODE: Returns NULL when array is NULL or empty, ensuring at least one row is generated. EXPLODE(SPLIT(field_name, separator)): Splits a string into an array using the specified separator, then explodes it into rows. 
EXPLODE(ARRAY(value1, value2, ...)): Explodes a custom-defined array into multiple rows. Example: ``` SELECT * FROM dual LATERAL VIEW EXPLODE ( SPLIT ( NAME, ',' ) ) AS NAME LATERAL VIEW EXPLODE ( SPLIT ( pk_id, ';' ) ) AS pk_id LATERAL VIEW OUTER EXPLODE ( age ) AS age LATERAL VIEW OUTER EXPLODE ( ARRAY(1,1) ) AS num ``` ## Vector Functions ### VECTOR_DIMS ```VECTOR_DIMS(vector) -> INT``` Returns an INT value representing the number of dimensions (elements) in the vector. Example: VECTOR_DIMS(vector) ### VECTOR_NORM ```VECTOR_NORM(vector) -> DOUBLE``` Calculates the L2 norm (Euclidean norm) of a vector, which represents the length or magnitude of the vector. Example: VECTOR_NORM(vector) ### INNER_PRODUCT ```INNER_PRODUCT(vector1, vector2) -> DOUBLE``` Calculates the inner product (dot product) of two vectors, which is used to measure the similarity and projection between the vectors. Example: INNER_PRODUCT(vector1, vector2) ### COSINE_DISTANCE ```COSINE_DISTANCE(vector1, vector2) -> DOUBLE``` Returns a DOUBLE value between 0 and 1: 0: Identical vectors (completely similar) 1: Orthogonal vectors (completely dissimilar) Example: COSINE_DISTANCE(vector1, vector2) ### L1_DISTANCE ```L1_DISTANCE(vector1, vector2) -> DOUBLE``` Calculates the Manhattan (L1) distance between two vectors. Example: L1_DISTANCE(vector1, vector2) ### L2_DISTANCE ```L2_DISTANCE(vector1, vector2) -> DOUBLE``` Calculates the Euclidean (L2) distance between two vectors. Example: L2_DISTANCE(vector1, vector2) ### VECTOR_REDUCE ```VECTOR_REDUCE(vector_field, target_dimension, method)``` Generic vector dimension reduction function that supports multiple reduction methods. **Parameters:** - `vector_field`: The vector field to reduce (VECTOR type) - `target_dimension`: The target dimension (INTEGER, must be smaller than source dimension) - `method`: The reduction method (STRING): - **'TRUNCATE'**: Truncates the vector by keeping only the first N elements. 
This is the simplest and fastest dimension reduction method, but may lose important information in the truncated dimensions. - **'RANDOM_PROJECTION'**: Uses Gaussian random projection with normally distributed random matrix. This method preserves relative distances between vectors while reducing dimensionality, following the Johnson-Lindenstrauss lemma. - **'SPARSE_RANDOM_PROJECTION'**: Uses sparse random projection where matrix elements are mostly zero (±√3, 0). This is more computationally efficient than regular random projection while maintaining similar distance preservation properties. **Returns:** VECTOR type with reduced dimensions **Example:** ```sql SELECT id, VECTOR_REDUCE(embedding, 256, 'TRUNCATE') as reduced_embedding FROM table SELECT id, VECTOR_REDUCE(embedding, 128, 'RANDOM_PROJECTION') as reduced_embedding FROM table SELECT id, VECTOR_REDUCE(embedding, 64, 'SPARSE_RANDOM_PROJECTION') as reduced_embedding FROM table ``` ### VECTOR_NORMALIZE ```VECTOR_NORMALIZE(vector_field)``` Normalizes a vector to unit length (magnitude = 1). This is useful for computing cosine similarity. **Parameters:** - `vector_field`: The vector field to normalize (VECTOR type) **Returns:** VECTOR type - the normalized vector **Example:** ```sql SELECT id, VECTOR_NORMALIZE(embedding) as normalized_embedding FROM table ``` ================================================ FILE: docs/en/transforms/sql-udf.md ================================================ # SQL UDF > UDF of SQL transform plugin ## Description Use UDF SPI to extend the SQL transform functions lib. 
## UDF API ```java package org.apache.seatunnel.transform.sql.zeta; public interface ZetaUDF { /** * Function name * * @return function name */ String functionName(); /** * The type of function result * * @param argsType input arguments type * @return result type */ SeaTunnelDataType<?> resultType(List<SeaTunnelDataType<?>> argsType); /** * Evaluate * * @param args input arguments * @return result value */ Object evaluate(List<Object> args); /** * Whether current udf requires row level context. */ default boolean requiresContext() { return false; } /** * Evaluate with row level context. */ default Object evaluateWithContext(List<Object> args, ZetaUDFContext context) { return evaluate(args); } /** * Initialize udf resources. */ default void open() throws Exception {} /** * Release udf resources. */ default void close() {} } ``` `ZetaUDFContext` provides runtime row-level metadata and fields: - `getRawTableId()` - `getDatabase()` - `getSchema()` - `getTable()` - `getRowKind()` - `getAllFields()` Notes: - `database/schema/table` parsing follows `TablePath.of(tableId)` semantics. - If `tableId` is in an unsupported format, accessing `database/schema/table` throws `IllegalArgumentException`. - Existing UDFs remain backward compatible and continue using `evaluate(List<Object> args)`. ## UDF Implements Example Add these dependencies with `provided` scope to your Maven project.
**Dependency versions should match the runtime environment.** ```xml <dependencies> <dependency> <groupId>org.apache.seatunnel</groupId> <artifactId>seatunnel-transforms-v2</artifactId> <version>${seatunnel.version}</version> <scope>provided</scope> </dependency> <dependency> <groupId>org.apache.seatunnel</groupId> <artifactId>seatunnel-api</artifactId> <version>${seatunnel.version}</version> <scope>provided</scope> </dependency> <dependency> <groupId>com.google.auto.service</groupId> <artifactId>auto-service</artifactId> <version>1.0.1</version> <scope>provided</scope> </dependency> </dependencies> ``` Add a Java class that implements ZetaUDF like this: ```java @AutoService(ZetaUDF.class) public class ExampleUDF implements ZetaUDF { @Override public String functionName() { return "EXAMPLE"; } @Override public SeaTunnelDataType<?> resultType(List<SeaTunnelDataType<?>> argsType) { return BasicType.STRING_TYPE; } @Override public Object evaluate(List<Object> args) { String arg = (String) args.get(0); if (arg == null) return null; return "UDF: " + arg; } } ``` Package the UDF project and copy the jar to the path: ${SEATUNNEL_HOME}/lib. If your UDF uses a third-party library, you also need to put it in ${SEATUNNEL_HOME}/lib. If you use cluster mode, you need to put the libs in the ${SEATUNNEL_HOME}/lib folder of every node and restart the cluster. ## Context-aware & lifecycle UDF example ```java @AutoService(ZetaUDF.class) public class ContextLifecycleUdf implements ZetaUDF { private transient String prefix; @Override public String functionName() { return "CTX_LIFE"; } @Override public SeaTunnelDataType<?> resultType(List<SeaTunnelDataType<?>> argsType) { return BasicType.STRING_TYPE; } @Override public boolean requiresContext() { return true; } @Override public void open() { this.prefix = "OPENED"; } @Override public Object evaluateWithContext(List<Object> args, ZetaUDFContext context) { String arg = args.get(0) == null ?
null : String.valueOf(args.get(0)); if (arg == null) { return null; } return prefix + ":" + context.getRowKind().shortString() + ":" + arg; } @Override public void close() { this.prefix = null; } } ``` ## Example The data read from source is a table like this: | id | name | age | |----|----------|-----| | 1 | Joy Ding | 20 | | 2 | May Ding | 21 | | 3 | Kin Dom | 24 | | 4 | Joy Dom | 22 | We use the UDF in a SQL query to transform the source data like this: ``` transform { Sql { plugin_input = "fake" plugin_output = "fake1" query = "select id, example(name) as name, age from dual" } } ``` Then the data in result table `fake1` will be updated to | id | name | age | |----|---------------|-----| | 1 | UDF: Joy Ding | 20 | | 2 | UDF: May Ding | 21 | | 3 | UDF: Kin Dom | 24 | | 4 | UDF: Joy Dom | 22 | ## Changelog ### new version - Add UDF of SQL Transform Connector ================================================ FILE: docs/en/transforms/sql.md ================================================ # SQL > SQL transform plugin ## Description Use SQL to transform the given input rows. The SQL transform uses an in-memory SQL engine, so the transform task can be implemented with SQL functions and the capabilities of the SQL engine. ## Options | name | type | required | default value | |-------------------|--------|----------|---------------| | plugin_input | string | yes | - | | plugin_output | string | yes | - | | query | string | yes | - | ### plugin_input [string] The source table name; the table name in the query SQL must match this field. ### query [string] The query SQL. It is a simple SQL statement that supports basic functions and criteria filter operations. Complex SQL is not supported yet, including multi-source table/row JOIN, AGGREGATE operations, and the like. The query expression can be `select [table_name.]column_a` to query the column named `column_a`; the table name is optional.
Or use `select c_row.c_inner_row.column_b` to query the inline struct column named `column_b` within the `c_row` column and the `c_inner_row` column. **This query expression must not include a table name.** ## Example The data read from source is a table like this: | id | name | age | |----|----------|-----| | 1 | Joy Ding | 20 | | 2 | May Ding | 21 | | 3 | Kin Dom | 24 | | 4 | Joy Dom | 22 | We use a SQL query to transform the source data like this: ``` transform { Sql { plugin_input = "fake" plugin_output = "fake1" query = "select id, concat(name, '_') as name, age+1 as age from dual where id>0" } } ``` Then the data in result table `fake1` will be updated to | id | name | age | |----|-----------|-----| | 1 | Joy Ding_ | 21 | | 2 | May Ding_ | 22 | | 3 | Kin Dom_ | 25 | | 4 | Joy Dom_ | 23 | ### Struct query If your upstream data schema is like this: ```hocon source { FakeSource { plugin_output = "fake" row.num = 100 string.template = ["innerQuery"] schema = { fields { name = "string" c_date = "date" c_row = { c_inner_row = { c_inner_int = "int" c_inner_string = "string" c_inner_timestamp = "timestamp" c_map_1 = "map<string, string>" c_map_2 = "map<string, map<string,string>>" } c_string = "string" } } } } } ``` All of these queries are valid: ```sql select name, c_date, c_row, c_row.c_inner_row, c_row.c_string, c_row.c_inner_row.c_inner_int, c_row.c_inner_row.c_inner_string, c_row.c_inner_row.c_inner_timestamp, c_row.c_inner_row.c_map_1, c_row.c_inner_row.c_map_1.some_key ``` But this query is not valid: ```sql select c_row.c_inner_row.c_map_2.some_key.inner_map_key ``` The map must be the last struct level in the path; nested maps cannot be queried.
## Job Config Example ``` env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { id = "int" name = "string" age = "int" } } } } transform { Sql { plugin_input = "fake" plugin_output = "fake1" query = "select id, concat(name, '_') as name, age+1 as age from dual where id>0" } } sink { Console { plugin_input = "fake1" } } ``` ## Changelog - Support struct query ### new version - Add SQL Transform Connector ================================================ FILE: docs/en/transforms/table-filter.md ================================================ # TableFilter > TableFilter transform plugin ## Description TableFilter transform plugin for filter tables. ## Options | name | type | required | default value | Description | |:----------------:|--------|----------|---------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| | database_pattern | string | no | | Specify database filter pattern, the default value is null, which means no filtering. If you want to filter the database name, please set it to a regular expression. | | schema_pattern | string | no | | Specify schema filter pattern, the default value is null, which means no filtering. If you want to filter the schema name, please set it to a regular expression. | | table_pattern | string | no | | Specify table filter pattern, the default value is null, which means no filtering. If you want to filter the table name, please set it to a regular expression. | | pattern_mode | string | no | INCLUDE | Specify pattern mode, the default value is INCLUDE, which means include the matched table. If you want to exclude the matched table, please set it to EXCLUDE. | ## Examples ### Include filter tables Include filter tables with the name matching the regular expression `user_\d+` in the database `test`. 
```hocon transform { TableFilter { plugin_input = "source1" plugin_output = "transform_a_1" database_pattern = "test" table_pattern = "user_\\d+" } } ``` ### Exclude filter tables Exclude filter tables with the name matching the regular expression `user_\d+` in the database `test`. ```hocon transform { TableFilter { plugin_input = "source1" plugin_output = "transform_a_1" database_pattern = "test" table_pattern = "user_\\d+" pattern_mode = "EXCLUDE" } } ``` ================================================ FILE: docs/en/transforms/table-merge.md ================================================ # TableMerge > TableMerge transform plugin ## Description TableMerge transform plugin for merge sharding-tables. ## Options | name | type | required | default value | Description | |:--------:|--------|----------|---------------|---------------------------| | database | string | no | | Specify new database name | | schema | string | no | | Specify new schema name | | table | string | yes | | Specify new table name | ## Examples ### Merge sharding-tables ` ```hocon env { parallelism = 1 job.mode = "STREAMING" } source { MySQL-CDC { plugin_output = "customers_mysql_cdc" username = "root" password = "123456" table-names = ["source.user_1", "source.user_2", "source.shop"] url = "jdbc:mysql://localhost:3306/source" } } transform { TableMerge { plugin_input = "customers_mysql_cdc" plugin_output = "trans_result" table_match_regex = "source.user_.*" database = "user_db" table = "user_all" } } sink { Jdbc { plugin_input = "trans_result" driver="com.mysql.cj.jdbc.Driver" url="jdbc:mysql://localhost:3306/sink" user="myuser" password="mypwd" generate_sink_sql = true database = "${database_name}" table = "${table_name}" primary_keys = ["${primary_key}"] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ================================================ FILE: docs/en/transforms/table-rename.md ================================================ # 
TableRename > TableRename transform plugin ## Description TableRename transform plugin for rename table name. ## Options | name | type | required | default value | Description | |:-----------------------:|--------|----------|---------------|-----------------------------------------------------------------------------------------------------------------------| | convert_case | string | no | | The case conversion type. The options can be `UPPER`, `LOWER` | | prefix | string | no | | The prefix to be added to the table name | | suffix | string | no | | The suffix to be added to the table name | | replacements_with_regex | array | no | | The array of replacement rules with regex. The replacement rule is a map with `replace_from` and `replace_to` fields. | ## Examples ### Convert table name to uppercase ``` env { parallelism = 1 job.mode = "STREAMING" } source { MySQL-CDC { plugin_output = "customers_mysql_cdc" username = "root" password = "123456" table-names = ["source.user_shop", "source.user_order"] url = "jdbc:mysql://localhost:3306/source" } } transform { TableRename { plugin_input = "customers_mysql_cdc" plugin_output = "trans_result" convert_case = "UPPER" prefix = "CDC_" suffix = "_TABLE" replacements_with_regex = [ { replace_from = "user" replace_to = "U" } ] } } sink { Jdbc { plugin_input = "trans_result" driver="oracle.jdbc.OracleDriver" url="jdbc:oracle:thin:@oracle-host:1521/ORCLCDB" user="myuser" password="mypwd" generate_sink_sql = true database = "ORCLCDB" table = "${database_name}.${table_name}" primary_keys = ["${primary_key}"] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ### Convert table name to lowercase ``` env { parallelism = 1 job.mode = "STREAMING" } source { Oracle-CDC { plugin_output = "customers_oracle_cdc" url = "jdbc:oracle:thin:@localhost:1521/ORCLCDB" username = "dbzuser" password = "dbz" database-names = ["ORCLCDB"] schema-names = ["DEBEZIUM"] table-names = ["SOURCE.USER_SHOP", 
"SOURCE.USER_ORDER"] } } transform { TableRename { plugin_input = "customers_oracle_cdc" plugin_output = "trans_result" convert_case = "LOWER" prefix = "cdc_" suffix = "_table" replacements_with_regex = [ { replace_from = "USER" replace_to = "u" } ] } } sink { Jdbc { plugin_input = "trans_result" url = "jdbc:mysql://localhost:3306/test" driver = "com.mysql.cj.jdbc.Driver" user = "st_user_sink" password = "mysqlpw" generate_sink_sql = true database = "${schema_name}" table = "${table_name}" primary_keys = ["${primary_key}"] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ================================================ FILE: docs/en/transforms/transform-multi-table.md ================================================ --- sidebar_position: 2 --- # Multi-Table Transform in SeaTunnel SeaTunnel’s transform feature supports multi-table transformations, which is especially useful when the upstream plugin outputs multiple tables. This allows you to complete all necessary transformation operations within a single transform configuration. Currently, many connectors in SeaTunnel support multi-table outputs, such as `JDBCSource` and `MySQL-CDC`. All transforms can be configured for multi-table transform as described below. :::tip Multi-table Transform has no limitations on Transform capabilities; any Transform configuration can be used in a multi-table Transform. The purpose of multi-table Transform is to handle multiple tables in the data stream individually and merge the Transform configurations of multiple tables into one Transform for easier management. 
::: ## Properties | Name | Type | Required | Default | Description | |----------------------------|--------|----------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | table_match_regex | String | No | .* | A regular expression to match the tables that require transformation. By default, it matches all tables. Note that this table name refers to the actual upstream table name, not `plugin_output`. | | table_transform | List | No | - | You can use a list in `table_transform` to specify rules for individual tables. If a transformation rule is configured for a specific table in `table_transform`, the outer rules will not apply to that table. The rules in `table_transform` take precedence. | | table_transform.table_path | String | No | - | When configuring a transformation rule for a table in `table_transform`, you need to specify the table path using the `table_path` field. The table path should include `databaseName[.schemaName].tableName`. | ## Matching Logic Suppose we read five tables from upstream: `test.abc`, `test.abcd`, `test.xyz`, `test.xyzxyz`, and `test.www`. They share the same structure, each having three fields: `id`, `name`, and `age`. | id | name | age | Now, let's say we want to copy the data from these five tables using the Copy transform with the following specific requirements: - For tables `test.abc` and `test.abcd`, we need to copy the `name` field to a new field `name1`. - For `test.xyz`, we want to copy the `name` field to `name2`. - For `test.xyzxyz`, we want to copy the `name` field to `name3`. - For `test.www`, no changes are needed. 
We can configure this as follows: ```hocon transform { Copy { plugin_input = "fake" // Optional dataset name to read from plugin_output = "fake1" // Optional dataset name for output table_match_regex = "test.a.*" // 1. Matches tables needing transformation, here matching `test.abc` and `test.abcd` src_field = "name" // Source field dest_field = "name1" // Destination field table_transform = [{ table_path = "test.xyz" // 2. Specifies the table name for transformation src_field = "name" // Source field dest_field = "name2" // Destination field }, { table_path = "test.xyzxyz" src_field = "name" dest_field = "name3" }] } } ``` ### Explanation 1. With the regular expression and corresponding Copy transform options, we match tables `test.abc` and `test.abcd` and copy the `name` field to `name1`. 2. Using the `table_transform` configuration, we specify that for table `test.xyz`, the `name` field should be copied to `name2`. This allows us to handle transformations for multiple tables within a single transform configuration. For each table, the priority of configuration is: `table_transform` > `table_match_regex`. If no rules match a table, no transformation will be applied. Below are the transform configurations for each table: - **test.abc** and **test.abcd** ```hocon transform { Copy { src_field = "name" dest_field = "name1" } } ``` Output structure: | id | name | age | name1 | - **test.xyz** ```hocon transform { Copy { src_field = "name" dest_field = "name2" } } ``` Output structure: | id | name | age | name2 | - **test.xyzxyz** ```hocon transform { Copy { src_field = "name" dest_field = "name3" } } ``` Output structure: | id | name | age | name3 | - **test.www** ```hocon transform { // No transformation needed } ``` Output structure: | id | name | age | In this example, we used the Copy transform, but all transforms in SeaTunnel support multi-table transformations, and you can configure them similarly within the corresponding transform block. 
================================================ FILE: docs/sidebars.js ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // @ts-check /** @type {import('@docusaurus/plugin-content-docs').SidebarsConfig} */ const sidebars = { "docs": [ { "type": "category", "label": "Introduction", "items": [ "introduction/about", "introduction/how-it-works", { "type": "category", "label": "Concepts", "items": [ "introduction/concepts/config", "introduction/concepts/connector-v2-features", "introduction/concepts/schema-feature" ] }, { "type": "category", "label": "Configuration", "items": [ "introduction/configuration/JobEnvConfig", "introduction/configuration/sql-config", "introduction/configuration/config-encryption-decryption", "introduction/configuration/metalake", "introduction/configuration/sink-options-placeholders", "introduction/configuration/schema-evolution", "introduction/configuration/speed-limit" ] } ] }, { "type": "category", "label": "Architecture", "items": [ "architecture/overview", "architecture/design-philosophy", { "type": "category", "label": "API Design", "items": [ "architecture/api-design/source-architecture", "architecture/api-design/sink-architecture", 
"architecture/api-design/catalog-table", "architecture/api-design/translation-layer" ] }, { "type": "category", "label": "Engine", "items": [ "architecture/engine/engine-architecture", "architecture/engine/dag-execution", "architecture/engine/resource-management" ] }, { "type": "category", "label": "Fault Tolerance", "items": [ "architecture/fault-tolerance/checkpoint-mechanism", "architecture/fault-tolerance/exactly-once" ] }, { "type": "category", "label": "Features", "items": [ "architecture/features/multi-table" ] } ] }, { "type": "category", "label": "Getting Started", "items": [ { "type": "category", "label": "Locally", "items": [ "getting-started/locally/deployment", "getting-started/locally/quick-start-seatunnel-engine", "getting-started/locally/quick-start-flink", "getting-started/locally/quick-start-spark" ] }, { "type": "category", "label": "Docker", "items": [ "getting-started/docker/docker" ] }, { "type": "category", "label": "Kubernetes", "items": [ "getting-started/kubernetes/kubernetes", "getting-started/kubernetes/helm" ] } ] }, { "type": "category", "label": "Connectors", "items": [ "connectors/connector-isolated-dependency", { "type": "category", "label": "Source", "link": { "type": "generated-index", "title": "Source Connectors", "description": "List all source connectors supported by Apache SeaTunnel.", "slug": "/connectors/source", "keywords": ["source"], "image": "/img/favicon.ico" }, "items": [ { "type": "autogenerated", "dirName": "connectors/source" } ] }, { "type": "category", "label": "Sink", "link": { "type": "generated-index", "title": "Sink Connectors", "description": "List all sink connectors supported by Apache SeaTunnel.", "slug": "/connectors/sink", "keywords": ["sink"], "image": "/img/favicon.ico" }, "items": [ { "type": "autogenerated", "dirName": "connectors/sink" } ] }, { "type": "category", "label": "Formats", "link": { "type": "generated-index", "title": "Formats", "description": "List some special formats supported by 
Apache SeaTunnel.", "slug": "/connectors/formats", "keywords": ["formats"], "image": "/img/favicon.ico" }, "items": [ { "type": "autogenerated", "dirName": "connectors/formats" } ] }, { "type": "category", "label": "Common Options", "items": [ "connectors/common-options/source-common-options", "connectors/common-options/sink-common-options" ] }, { "type": "category", "label": "Changelog", "link": { "type": "generated-index", "title": "Connector Changelog", "description": "Changelog for all connectors supported by Apache SeaTunnel.", "slug": "/connectors/changelog", "keywords": ["changelog"], "image": "/img/favicon.ico" }, "items": [ { "type": "autogenerated", "dirName": "connectors/changelog" } ] } ] }, { "type": "category", "label": "Transforms", "link": { "type": "generated-index", "title": "Transforms", "description": "List all transforms supported by Apache SeaTunnel.", "slug": "/transforms", "keywords": ["transforms"], "image": "/img/favicon.ico" }, "items": [ { "type": "category", "label": "Common Options", "items": [ "transforms/common-options/common-options" ] }, "transforms/copy", "transforms/data-validator", "transforms/define-sink-type", "transforms/dynamic-compile", "transforms/embedding", "transforms/field-mapper", "transforms/field-rename", "transforms/filter", "transforms/filter-rowkind", "transforms/jsonpath", "transforms/llm", "transforms/metadata", "transforms/regexextract", "transforms/replace", "transforms/rowkind-extractor", "transforms/split", "transforms/sql", "transforms/sql-functions", "transforms/sql-udf", "transforms/table-filter", "transforms/table-merge", "transforms/table-rename", "transforms/transform-multi-table" ] }, { "type": "category", "label": "Engines", "items": [ "engines/overview", "engines/event-listener", { "type": "category", "label": "SeaTunnel Engine (Zeta)", "items": [ "engines/zeta/about", { "type": "category", "label": "Deployment", "items": [ "engines/zeta/download-seatunnel", "engines/zeta/deployment", 
"engines/zeta/local-mode-deployment", "engines/zeta/hybrid-cluster-deployment", "engines/zeta/separated-cluster-deployment" ] }, "engines/zeta/checkpoint-storage", "engines/zeta/engine-jar-storage-mode", "engines/zeta/tcp", "engines/zeta/resource-isolation", { "type": "category", "label": "REST API", "items": [ "engines/zeta/rest-api-v1", "engines/zeta/rest-api-v2", "engines/zeta/security" ] }, "engines/zeta/user-command", "engines/zeta/logging", "engines/zeta/telemetry", "engines/zeta/web-ui", "engines/zeta/slot-allocation-strategy", "engines/zeta/tuning-guide" ] }, { "type": "category", "label": "Command", "items": [ "engines/command/usage", "engines/command/connector-check" ] }, "engines/flink", "engines/spark" ] }, { "type": "category", "label": "Tools", "items": [ "tools/overview", "tools/seatunnel-skill", "tools/seatunnel-mcp", "tools/x2seatunnel" ] }, { "type": "category", "label": "Developer", "items": [ "developer/setup", "developer/coding-guide", "developer/how-to-create-your-connector", "developer/contribute-plugin", "developer/contribute-transform-v2-guide", "developer/docs-format-specification", "developer/new-license" ] }, "faq" ] }; module.exports = sidebars; ================================================ FILE: docs/zh/architecture/api-design/catalog-table.md ================================================ --- sidebar_position: 4 title: CatalogTable 和元数据管理 --- # CatalogTable 和元数据管理 ## 1. 概述 ### 1.1 问题背景 数据集成需要显式的模式管理: - **模式定义**: 如何定义和验证表模式? - **模式传播**: 如何在数据源(source) → 转换器(transform) → 目标端(sink)之间传递模式? - **模式演化**: 如何处理运行时 DDL 变更(添加/删除列)? - **类型映射**: 如何在不同数据源之间映射类型? - **元数据完整性**: 如何捕获完整的表元数据(约束、分区)? ### 1.2 设计目标 SeaTunnel 的元数据管理旨在: 1. **类型安全**: 在作业提交时进行显式模式验证 2. **完整性**: 捕获所有表元数据(列、约束、分区、选项) 3. **支持演化**: 处理运行时模式变更(DDL 同步) 4. **引擎独立**: 模式表示独立于执行引擎 5. **易用性**: 用于模式创建和转换的简单 API ## 2. 
核心概念 ### 2.1 CatalogTable 包含所有元数据的表的完整表示。 CatalogTable 是 SeaTunnel 对“表及其元数据”的统一表示,通常包含: - **tableId**: 表标识(可定位到 catalog/database/schema/table) - **tableSchema**: 模式定义(列、主键、约束等) - **options**: 连接器/表级选项(如实际表名、topic、format 等) - **partitionKeys**: 分区键(可选) - **comment/catalogName**: 注释与归属 catalog 信息(可选) **关键组件**: - `TableIdentifier`: 唯一表标识(catalog.database[.schema].table) - `TableSchema`: 包含列、主键、约束的模式 - `options`: 连接器特定设置(例如 Kafka 主题、JDBC 表名) - `partitionKeys`: 分区表的分区列 ### 2.2 TableSchema 包含列和约束的模式定义。 TableSchema 关注“表有哪些列,以及这些列有哪些约束”: - **columns**: 列定义列表(顺序敏感) - **primaryKey**: 主键定义(可选) - **constraintKeys**: 唯一键/外键等约束(可选) ### 2.3 Column 包含类型和约束的列定义。 Column 通常由以下信息构成: - **name**: 列名 - **dataType**: SeaTunnelDataType 统一类型 - **nullable/defaultValue**: 空值与默认值语义 - **comment/options**: 备注与连接器/列级扩展选项 ### 2.4 SeaTunnelDataType 跨连接器的统一类型系统。 **基本类型**(示例): - 数值: TINYINT/SMALLINT/INT/BIGINT/FLOAT/DOUBLE/DECIMAL(precision, scale) - 字符串: STRING/CHAR(length)/VARCHAR(length) - 二进制: BYTES - 日期/时间: DATE/TIME/TIMESTAMP - 布尔: BOOLEAN **复杂类型**(示例): - ARRAY(elementType) - MAP(keyType, valueType) - ROW(fields) ## 3. 模式创建 ### 3.1 构建器模式 推荐的构建步骤: 1. 明确 TableIdentifier(作业内唯一定位) 2. 通过 TableSchema.Builder 按顺序定义 columns 3. 若需要去重/更新语义,定义 primaryKey 4. 写入 options(连接器侧的物理映射信息) 5. 如为分区表,补充分区键 partitionKeys ### 3.2 列构建器 列定义需要尽量显式: - name/dataType 是必选 - nullable/defaultValue 决定写入与 DDL 的语义 - comment/options 用于补充连接器侧能力(例如精度、编码、额外属性) ### 3.3 主键和约束 约束表达要点: - primaryKey/uniqueKey 是“语义约束”,用于: - 转换/下游写入侧的幂等键选择 - schema 兼容性校验 - 部分连接器的 DDL 自动生成 - 外键等约束在跨系统同步时常受限于目标端能力与时序一致性,通常需要在“可用性/一致性”之间做权衡 ## 4. 
模式传播 ### 4.1 数据源 → 转换器 → 目标端流程 ``` ┌──────────────┐ │数据源(source) │ │ │ │ 生产 │ │ CatalogTable │ └──────┬───────┘ │ ▼ (输入模式) ┌──────────────┐ │ 转换器 │ │ │ │ 修改 │ │ CatalogTable │ └──────┬───────┘ │ ▼ (输出模式) ┌──────────────┐ │ 目标端 │ │ │ │ 验证 │ │ CatalogTable │ └──────────────┘ ``` ### 4.2 数据源模式生产 数据 Source 读取端的职责: - 从外部系统读取元数据(列、类型、主键/唯一键、分区、注释等) - 将外部类型映射为 SeaTunnelDataType - 产出 CatalogTable,作为作业的“输入契约” 常见失败模式: - 元数据读取失败(权限/网络/超时) - 类型无法映射(外部类型超出 SeaTunnel 统一类型系统) - schema 漂移(运行中 DDL)导致“生产的 CatalogTable”与真实数据不一致 ### 4.3 转换器模式转换 转换器端的职责: - 根据转换逻辑(表达式/字段选择/重命名等)计算输出 schema - 保证输出 CatalogTable 可被下游 sink 验证与消费 常见风险: - schema 推断不精确(例如 UDF、动态字段) - 类型提升/缩窄导致的精度或溢出问题 - 字段重命名/删除导致下游找不到列 ### 4.4 目标端模式验证 目标端侧的职责: - 获取输入 CatalogTable(来自上游) - 获取目标端的真实表/索引元数据(或根据配置选择 auto-create) - 做兼容性校验: - 列是否存在/是否允许自动新增 - 类型是否兼容(是否允许安全扩展) - 约束/主键是否满足写入语义(尤其是 upsert/exactly-once) 推荐策略: - 早期失败:在作业启动阶段就完成校验,避免运行中才暴露不可写入 - 明确兼容规则:哪些类型扩展允许、哪些缩窄禁止、如何处理 nullability 变化 ## 5. 模式演化 ### 5.1 SchemaChangeEvent SchemaChangeEvent 表示 **CDC 数据源捕获到的 DDL/元数据变更**,用于在数据流中传递“表结构发生了什么变化”。 核心语义: - 变更必须能定位到具体表(TableIdentifier/TablePath 等) - 变更类型是可枚举的(如新增列、删除列、修改列、重命名、主键/约束变化等) - 变更负载以“语义化描述”为主(列名、类型、nullable、默认值等),而不是下游可直接执行的 SQL 为什么要事件化: - 对上游 CDC 而言,结构变化是数据的一部分,必须被可靠传播 - 对下游(Transform/Sink)而言,结构变化通常需要与“业务兼容性规则”共同决策(允许/禁止、自动/人工) 失败模式与建议: - 事件丢失:下游 schema 与数据不一致,建议将 schema 事件纳入 checkpoint/恢复语义(至少保证“数据与变更事件的相对顺序”可恢复) - 顺序错乱:先收到数据后收到 DDL,建议在 Source 侧保证同一表内顺序一致,或在下游做缓冲与重放 - 不可应用变更:例如删除列/缩窄类型导致不可写,建议启动阶段明确策略并在运行时可观测告警 ### 5.2 CDC 数据源模式演化 CDC Source 的职责不是“执行 DDL”,而是 **把变更识别出来并以事件形式注入数据流**。 推荐工作流: 1. 捕获上游变更(binlog/redo log/DDL log/元数据快照差异) 2. 解析为结构化事件(新增/删除/修改列等) 3. 与数据事件一同向下游发出,保证同一表内的顺序可解释 4. 
在 checkpoint/恢复时保证:不会出现“数据前进但 schema 事件回退”的不可恢复状态 常见边界: - DDL 批量发生:可能产生多个事件,应明确合并/拆分规则与顺序 - 同名列重复/大小写规则:需与 Catalog/TableIdentifier 规范对齐 - DDL 解析失败:建议降级为“停止作业 + 明确报错”,或按配置选择“跳过变更 + 记录告警”(默认不推荐) ### 5.3 转换器模式演化映射 Transform 侧需要回答的问题是:**上游 schema 变化,在经过转换逻辑后,等价的下游变化是什么?** 典型规则: - 字段选择:如果下游不再保留该列,则“新增列事件”可被忽略;但“删除列事件”可能仍需要传播以便下游校验 - 字段重命名:需要把事件中的列名同步映射 - 类型转换:需要把“上游类型变化”映射为“下游类型变化”(例如 cast、精度变化) - 表达式生成列:上游新增列不一定影响下游,但下游可能新增派生列(属于转换器内部 schema 变化) 失败模式: - 无法判定影响:例如 UDF 返回动态字段,建议显式配置输出 schema 或选择“禁止自动演化” - 不可逆转换:例如精度缩窄/字符串解析失败,建议在演化阶段就拒绝或要求人工介入 ### 5.4 目标端模式演化应用 Sink 侧的职责是 **对变更做兼容性决策并落地到目标系统**(如果启用自动演化)。 推荐处理流程: 1. 获取目标端当前表/索引元数据(可能来自 Catalog、JDBC 元数据、Hive Metastore 等) 2. 按策略判断是否允许该类变更(如自动建表、自动新增列、是否允许 drop/rename) 3. 将“语义事件”转换成目标系统的 DDL/元数据 API 调用 4. 将变更落地动作纳入可恢复语义: - 如果 sink 支持 2PC/事务,则尽量在 commit 阶段与数据提交协同 - 如果目标端 DDL 不能事务化,至少保证幂等与可重试(例如“列已存在”视为成功) 失败模式与建议: - DDL 执行失败:目标端权限/锁冲突/存储限制,建议快速失败并输出明确告警,避免 silent skip - 并发变更:多个并行 writer 同时尝试演化,建议统一到单点/串行执行(或使用外部锁) - 演化与写入竞争:写入在 DDL 未生效时到达,建议在应用变更后再放行数据,或使用缓冲/重试 ## 6. 类型映射 ### 6.1 JDBC 类型映射 JDBC 类型映射的目标是把“目标系统类型”规范化为 SeaTunnel 内部类型(SeaTunnelDataType),从而让上游/下游对齐 schema 语义。 映射原则: - 尽量保持语义而非字面:例如 `VARCHAR`/`LONGVARCHAR` 最终都可能落到 `STRING` - 保留关键约束:长度、精度、scale、时区(如果目标系统支持) - 明确不可映射类型的策略:快速失败 vs 降级为 `STRING/BYTES`(默认建议失败) 兼容性与风险: - 精度相关:`DECIMAL(p,s)` 的 `p/s` 需要完整保留,否则可能出现截断/溢出 - 时间相关:`TIMESTAMP`/`TIMESTAMP WITH TIME ZONE` 的语义差异需要明确 - 二进制相关:`BINARY/VARBINARY` 建议映射为 `BYTES`,不要静默转字符串 ### 6.2 Kafka (Avro) 类型映射 Avro/Protobuf/JSON Schema 等“消息协议”通常是嵌套结构,映射时需要同时处理: - 基础类型:int/long/string/bytes/bool 等 - 复合类型:array/map/record(对应 SeaTunnel 的 ARRAY/MAP/ROW) - 兼容性规则:新增字段、字段默认值、union/nullability 推荐策略: - 将 `record` 映射为 `ROW`,并保持字段顺序与名字稳定 - 对 nullable:显式表达(而不是隐式 union) - 对 schema registry:把 schema 版本作为可观测信息输出,便于排障与回滚 ## 7. 
分区表 ### 7.1 分区定义 分区信息是 CatalogTable 的一部分:它把“表 schema”与“物理分布/组织方式”连接起来。 分区键的典型用途: - 让 Source 能按分区裁剪(partition pruning),减少扫描范围 - 让 Sink 能按分区写入,提高写入性能并避免热点 - 让下游表管理系统(Hive/Iceberg/Hudi)正确理解数据布局 ### 7.2 分区感知数据源 Source 侧的关键是:从外部元数据系统读取“分区键定义”并写入 Produced CatalogTable。 推荐能力: - 支持分区过滤条件(按时间/范围),并明确过滤是在“枚举 split”阶段完成 - 分区元数据缺失时快速失败,避免静默全表扫描 ### 7.3 分区感知目标端 Sink 侧的关键是:把输入行映射到正确分区并以目标系统要求的方式提交。 常见失败模式: - 分区键缺失/为空:需要明确处理策略(拒绝、写入默认分区、或降级为非分区写入) - 分区字段类型不匹配:建议在启动阶段做 schema 校验 - 并发写入同分区:需要考虑文件/小文件合并、提交冲突与幂等 ## 8. 最佳实践 ### 8.1 模式定义 **优先使用显式模式**: - 推荐:在配置或作业定义阶段显式给出 schema(字段名、类型、nullable、精度等) - 不推荐:完全依赖运行时推断(尤其是“取第一行推断”),容易在脏数据或字段漂移时产生不可恢复的问题 **选择合适类型**: - 推荐:金额/计数等使用 `DECIMAL(p,s)`/`BIGINT` 等精确类型;时间使用 `DATE/TIME/TIMESTAMP` - 不推荐:将所有字段降级为 `STRING`,会把错误推迟到下游并放大数据质量成本 ### 8.2 模式验证 **早期验证**(快速失败): - Source:在 open/prepare 阶段确定 Produced CatalogTable,并完成“字段存在性/类型合法性/可投影性”等验证 - Sink:在作业启动阶段完成“输入 schema 与目标表 schema”的兼容性校验,避免运行中才暴露不可写入 ### 8.3 类型兼容性 **类型扩展(通常安全)**: - `INT → BIGINT` - `FLOAT → DOUBLE` - `VARCHAR(10) → VARCHAR(20)` **类型缩窄(通常不安全)**: - `BIGINT → INT`(溢出风险) - `DOUBLE → FLOAT`(精度损失) - `VARCHAR(20) → VARCHAR(10)`(截断风险) ## 9. 配置 ### 9.1 模式覆盖 ```hocon source { Jdbc { url = "..." query = "SELECT * FROM users" # 覆盖推断的模式 schema { fields { id = "BIGINT" name = "STRING" age = "INT" } } } } ``` ### 9.2 模式演化控制 在 **CDC 场景**下,SeaTunnel 的模式演化通常由 **CDC Source 侧开关**控制:在 CDC 源启用 `schema-changes.enabled = true` 后,运行时 DDL/元数据变更会随数据流传播;下游 Sink 是否能自动应用变更取决于连接器是否支持 schema evolution。 下面给出一个“CDC → JDBC Sink”的最小可用示例(参数以各连接器文档为准): ```hocon source { MySQL-CDC { url = "..." table-names = ["db.table"] # 启用 CDC 模式变更事件(SchemaChangeEvent)传播 schema-changes.enabled = true } } sink { Jdbc { url = "..." # 让 JDBC sink 能根据上游 schema 生成/刷新写入 SQL generate_sink_sql = true # 作业启动阶段:若表不存在则创建(用于首次建表) schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" } } ``` > 说明:当前仓库中没有“schema-evolution 统一配置块”这一通用写法。 > 新增/删除/重命名列等是否自动应用由具体 Sink 实现与目标端能力决定;其中 DROP/RENAME 属于高风险操作,建议在生产环境谨慎启用并做好灰度与回滚预案。 ## 10. 
相关资源 - [source 数据源架构](source-architecture.md) - [sink 目标端架构](sink-architecture.md) - [模式演化](../../introduction/concepts/schema-evolution.md) - [模式特性](../../introduction/concepts/schema-feature.md) ================================================ FILE: docs/zh/architecture/api-design/sink-architecture.md ================================================ --- sidebar_position: 3 title: 数据写入 Sink 架构 --- # 数据写入 Sink 架构 ## 1. 概述 ### 1.1 问题背景 在分布式环境中向外部系统写入数据面临关键挑战: - **精确一次保证**:如何确保每条记录精确写入一次,而不是零次或多次? - **事务一致性**:如何在多个并行写入器之间原子性地提交写入操作? - **容错**:如何从失败中恢复而不丢失数据或产生重复? - **反压**:如何处理慢速数据 Sink而不使系统过载? - **幂等性**:如何使重试操作安全? ### 1.2 设计目标 SeaTunnel 的数据 Sink 旨在: 1. **提供可验证的一致性语义**:在外部系统支持事务/幂等提交的前提下,通过两阶段提交与检查点边界实现端到端一致性 2. **支持并行写入**:通过多个写入器实例扩展吞吐量 3. **启用全局协调**:协调分布式写入器之间的提交 4. **确保容错**:从失败中恢复而不产生数据不一致 5. **提供灵活性**:支持各种提交策略 ### 1.3 适用场景 - 事务性数据库(JDBC 与 XA 事务) - 消息队列(Kafka 与事务) - 文件系统(原子文件重命名) - 数据湖(Iceberg、Hudi、Paimon 与表事务) - 搜索引擎(Elasticsearch 与版本控制) ## 2. 架构设计 ### 2.1 整体架构 ``` ┌────────────────────────────────────────────────────────────────┐ │ 执行引擎任务侧(数据面) │ │ │ │ ┌──────────────────────────────────────────────────────┐ │ │ │ SinkWriter │ │ │ │ │ │ │ │ • 从上游接收记录 │ │ │ │ • 缓冲并写入数据 │ │ │ │ • 在 checkpoint 边界产出 commitInfo │ │ │ │ • 快照写入器状态 │ │ │ └──────────────────────────────────────────────────────┘ │ │ │ │ │ │ checkpoint 完成通知触发 │ │ ▼ │ │ ┌──────────────────────────────────────────────────────┐ │ │ │ SinkCommitter(可选) │ │ │ │ │ │ │ │ • 使 prepare 的变更对外可见 │ │ │ │ • 失败可重试,要求幂等 │ │ │ └──────────────────────────────────────────────────────┘ │ │ │ └────────────────────────────────────────────────────────────────┘ │ │ (可选:聚合提交任务,单实例) ▼ ┌────────────────────────────────────────────────────────────────┐ │ 执行引擎协调侧(控制面) │ │ │ │ ┌──────────────────────────────────────────────────────┐ │ │ │ SinkAggregatedCommitter(可选)│ │ │ │ │ │ │ │ • 聚合多个 writer 的 commitInfo │ │ │ │ • 执行一次全局提交(单线程语义) │ │ └──────────────────────────────────────────────────────┘ │ │ │ 
└────────────────────────────────────────────────────────────────┘ │ ▼ 外部数据系统 (数据库 / 文件 / 消息队列) ``` ### 2.2 核心组件 #### SeaTunnelSink(工厂接口) 作为创建写入器和提交器的工厂的顶层接口。 **契约要点(概念级)**: - 创建 writer:在工作节点(Task)侧创建 `SinkWriter`,负责接收记录并写入 - 恢复 writer:在 failover 后用 checkpoint 中的 writerState 恢复未完成写入 - 创建 committer(可选):当数据 Sink 需要两阶段提交时使用。它负责在 checkpoint 成功后提交 `prepareCommit(checkpointId)` 产生的提交信息;运行位置取决于执行引擎实现(例如在 SeaTunnel Engine 中由 Sink 任务在 `notifyCheckpointComplete` 回调中触发) - 创建 aggregated committer(可选):当外部系统需要“全局单点提交”(如表级提交/单次元数据提交)时使用。该提交器按单线程语义执行,通常与 committer 二选一;如果同时提供两者,需要确保语义不会重复提交/发生冲突 - 描述写入 schema:通过 `CatalogTable` 明确输入字段、投影与类型约束 这组工厂方法的核心目的是把“写入(数据面)”与“提交(控制面)”解耦,使得 checkpoint 成为全局一致性边界。 **关键设计点**: - 两阶段提交扩展点:写入器(必需)+(committer 或 aggregated committer,按需求选择) - committer 与 aggregated committer 在很多场景下应视为互斥选项:前者提交每个 writer 的变更,后者先聚合再做一次全局提交 - 写入器始终是必需的(执行实际的数据写入) ### 2.3 交互流程 #### 正常写入流程(带两阶段提交) ```mermaid sequenceDiagram participant CP as 框架(Checkpoint/回调) participant Writer1 as SinkWriter 1 participant Writer2 as SinkWriter 2 participant Committer as SinkCommitter participant Sink as 数据 Sink Writer1->>Writer1: write(record) Writer2->>Writer2: write(record) CP->>Writer1: triggerBarrier(checkpointId) CP->>Writer2: triggerBarrier(checkpointId) Writer1->>Writer1: prepareCommit(checkpointId) Writer1->>CP: ack(commitInfo1) Writer2->>Writer2: prepareCommit(checkpointId) Writer2->>CP: ack(commitInfo2) CP->>CP: 所有写入器已确认 CP->>CP: 持久化检查点 Note over CP,Committer: checkpoint 成功后,框架触发提交(触发点/运行位置取决于执行引擎实现) CP->>Committer: commit([commitInfo1, commitInfo2]) Committer->>Sink: 提交 writer1 的变更 Committer->>Sink: 提交 writer2 的变更 Committer->>CP: ack() CP->>Writer1: notifyCheckpointComplete(checkpointId) CP->>Writer2: notifyCheckpointComplete(checkpointId) ``` #### 失败和重试流程 ```mermaid sequenceDiagram participant CP as 框架(Checkpoint/回调) participant Writer as SinkWriter participant Committer as SinkCommitter participant Sink as 数据 Sink Note over Writer: 写入进行中(事务/临时文件) CP->>Writer: 
triggerBarrier(checkpointId) Writer->>Writer: prepareCommit(checkpointId) Writer->>CP: ack(commitInfo) alt Checkpoint 成功 Note over CP,Committer: checkpoint 成功后,框架/引擎会触发提交(触发点/运行位置取决于执行引擎实现) CP->>Committer: commit([commitInfo]) Committer->>Sink: 提交变更(幂等) Committer->>CP: ack() CP->>Writer: notifyCheckpointComplete(checkpointId) else Checkpoint 失败/中止 CP->>Writer: notifyCheckpointAborted(checkpointId) Note over Writer,Committer: 引擎可选调用 commit/abort 相关回调进行清理;\n务必保证 commit 幂等,避免只依赖 abort 完成回滚 end Note over Committer: commit 失败由框架重试\n必须保证幂等 ``` **核心职责**: - `write(element)`:接收上游记录并写入外部系统的“临时/事务内”区域(避免对外可见) - `prepareCommit(checkpointId)`:在 checkpoint 边界生成提交信息(commitInfo),要求“无副作用”(不让数据对外可见) - `snapshotState(checkpointId)`:把“已写入但未提交”的可恢复状态写入 checkpoint(事务句柄、文件清单、位点等) - `abortPrepare()`:用于回滚 `prepareCommit` 阶段产生的副作用(是否会被调用取决于执行引擎/实现路径) - `notifyCheckpointAborted()`:checkpoint 失败/中止回调(若 writer 或运行时实现了 CheckpointListener,可在此做清理) - `notifyCheckpointComplete()`:checkpoint 成功且提交完成后做清理(释放事务、删除临时文件/状态等) **关键要求**: - `prepareCommit(...)` 必须无副作用;真正让数据对外可见的动作应发生在 committer 的 `commit()` 阶段 - `snapshotState()` 必须覆盖所有“已写入但未提交”的中间结果,否则恢复会丢数据或重复写 - 清理路径必须可重试且幂等:同一 checkpoint 的 abort/cleanup 可能被调用多次 **典型实现形态(不绑定具体源码)**: - 事务型数据 Sink :writer 在事务内写入,prepare 阶段产出事务句柄/提交 token,commit 阶段统一提交 - 文件型数据 Sink :writer 写临时文件并产出“文件清单/元数据”,commit 阶段做原子 rename/元数据提交 ### 3.2 SinkCommitter 接口 提交器由执行引擎在 checkpoint 成功后触发执行,用于使本次 checkpoint 对应的“准备写入”对外可见(运行位置取决于具体执行引擎实现)。 **契约要点**: - `commit(commitInfos)`:对一批提交信息执行提交;必须支持重试,因此要求幂等 - 返回值语义:返回“仍需重试/未完成”的提交信息集合(框架会在后续 checkpoint 或恢复路径中重试) - `abort(commitInfos)`(可选):放弃提交并做资源清理(例如回滚事务、删除临时文件) **关键要求**: - `commit()` **必须**是幂等的(使用相同的 commitInfo 调用两次应该是安全的) - 返回**失败的** commitInfos 列表(将被重试) - 应优雅地处理部分失败 **实现提示**: - 需要明确幂等键(例如事务 id、文件清单版本、外部系统的去重 key) - 需要能区分“可重试失败”(网络抖动)与“不可重试失败”(权限/数据非法),避免无意义重试 ### 3.3 SinkAggregatedCommitter 接口 聚合提交器为所有写入器执行单个全局提交。 **契约要点**: - `combine(commitInfos)`:把多个 writer 的提交信息聚合成“全局一次提交”所需的元数据 - 
`commit(aggregatedCommitInfos)`:对聚合后的信息做全局提交;同样必须幂等 - `restoreCommit(...)`:恢复聚合提交器状态,确保 failover 后仍可完成/重试“全局提交” **使用场景**: - Hive 表提交(所有分区的单个 COMMIT TRANSACTION) - Iceberg 表提交(单个表快照) - 全局索引更新(为所有写入更新一次索引) **实现示例(语义级,以 Hive 为例)**: - `combine`: Sink 总所有 writer 产生的文件/分区元数据,形成一次表级提交所需的“全量变更集” - `commit`:对外部 metastore/表事务执行一次全局原子提交;失败后需要可重试且不重复(幂等) ## 4. 设计考量 ### 4.1 设计权衡 #### 两阶段提交 **优点**: - 强一致性保证(精确一次) - 自动失败恢复 - 准备和提交之间的清晰分离 **缺点**: - 增加延迟(数据仅在提交后可见) - 需要数据 Sink 中的事务支持 - 提交信息的额外状态 - 更复杂的实现 **何时使用**: - 金融交易、计费、审计日志 - 外部系统支持事务/幂等提交,并且业务需要端到端精确一次的场景 **何时不使用**: - 至少一次可接受(日志、指标) - 数据 Sink 不支持事务 - 需要超低延迟 #### 两层提交 vs 聚合提交 **两层(写入器 → 提交器)**: - 每个写入器的提交独立处理 - 并行提交操作 - 适用于大多数数据 Sink **聚合提交(写入器 → 聚合提交器)**: - 所有写入器的提交信息先被聚合 - 执行一次全局提交操作(单线程语义) - 适用于需要“单点表级提交/元数据提交”的外部系统(Hive、Iceberg 等) ### 4.2 性能考量 #### 批量写入 将多条记录合并为一次外部写入(JDBC batch / bulk API / multi-put)。 **好处**: - 摊销每条记录的开销 - 减少网络往返 - 更好的吞吐量 #### 异步写入 将外部 I/O 下沉到后台线程/异步客户端,以降低 `write()` 的尾延迟。但需要明确: - 如果采用异步写入,`prepareCommit(...)` 需要等待所有“已接收记录”的异步写入完成,才能生成可靠的 commitInfo - 需要有背压/限流策略,避免异步积压导致 OOM #### 连接池 对 JDBC/HTTP 等短连接成本高的外部系统,优先使用连接池/长连接以减少握手与认证开销。 ### 4.3 幂等性模式 #### 1. 自然幂等性(Upsert) 利用外部系统提供的 Upsert/Merge 语义,使“重复提交同一业务键”不会产生重复数据。 #### 2. 去重键 为每条写入生成可重复的幂等键(业务主键、事件 id、事务 id),并让外部系统/协议基于该键实现去重。 #### 3. 外部去重表 在外部系统维护“已提交记录表/去重索引”,提交前先检查是否已提交;这种方式通用但会引入额外写放大与一致性成本。 ## 5. 最佳实践 ### 5.1 使用建议 **1. 选择适当的提交级别** - 仅 writer:适合至少一次(数据写入立即可见,恢复会重放,需外部幂等) - writer + committer:适合两阶段提交(checkpoint 边界产出 commitInfo,并在 checkpoint 成功后触发 commit;触发位置取决于执行引擎实现) - writer + aggregated committer:适合表级事务/全局单点提交(先聚合多个 writer 的 commitInfo,再执行一次全局提交) **2. 正确的状态管理** - 状态里只放“恢复必需信息”(事务句柄/临时文件清单/最后一致性偏移量等),避免把大批数据放进状态 - 恢复时要能把状态回放到 writer 内部,并确保 prepare/commit 的幂等性仍成立 **3. 资源管理** - 明确资源生命周期:writer/committer 的 `close()` 必须可重复调用且不抛出不可恢复异常 - 尽量做到“按创建逆序关闭”,并确保失败时也能释放外部资源(连接/事务/临时文件) ### 5.2 常见陷阱 **1. prepareCommit(...) 中的副作用** - `prepareCommit(...)` 只能生成“提交所需的凭据/元数据”,不能让数据对外可见 - 一旦在 prepare 阶段产生外部副作用,failover 重放会导致重复写入 **2. 
非幂等提交** - `commit()` 需要支持相同 commitInfo 的重复调用(网络抖动/主节点重启会发生) - 优先依赖外部系统的幂等语义(upsert/merge/幂等事务 id),否则需要自建去重机制 **3. 大状态** - 避免把大量缓冲记录放进 checkpoint 状态,状态越大越容易导致 checkpoint 超时与恢复变慢 - 把大数据留在外部系统(临时文件/事务日志),状态里只保留引用与必要元数据 ### 5.3 调试技巧 **1. 启用 XA 事务日志** - 记录关键生命周期事件:事务开始/prepare/commit/rollback、checkpointId、writerIndex - 避免记录敏感数据(凭据/明文 SQL/用户数据),以可追踪的事务 id 为主 **2. 跟踪提交进度** - 输出/采集提交指标:提交耗时、失败率、重试次数、单次提交大小 - 重点关注“提交堆积”与“commitInfo 重试风暴”,它们通常意味着幂等设计或外部系统稳定性问题 **3. 测试失败场景** - 覆盖典型故障:writer 崩溃、committer 崩溃、commit 超时、重复提交、checkpoint 超时 - 验证点:不丢数据、不重复可见(或重复可见但幂等)、恢复后可继续推进 checkpoint ## 6. 相关资源 - [架构概览](../overview.md) - [设计理念](../design-philosophy.md) - [数据源架构](source-architecture.md) - [检查点机制](../fault-tolerance/checkpoint-mechanism.md) - [精确一次语义](../fault-tolerance/exactly-once.md) ## 7. 参考资料 ### 示例连接器 - **简单数据 Sink**:ConsoleSink(输出到标准输出) - **文件数据 Sink**:FileSink(原子文件重命名) - **数据库数据 Sink**:JdbcSink(XA 事务) - **流式数据 Sink**:KafkaSink(Kafka 事务) - **表数据 Sink**:IcebergSink(表提交) ### 进一步阅读 - [两阶段提交协议](https://en.wikipedia.org/wiki/Two-phase_commit_protocol) - [XA 事务](https://www.oracle.com/java/technologies/xa-transactions.html) - [Kafka 事务](https://kafka.apache.org/documentation/#semantics) - [Iceberg 表格式](https://iceberg.apache.org/spec/) ================================================ FILE: docs/zh/architecture/api-design/source-architecture.md ================================================ --- sidebar_position: 2 title: 数据读取端 Source 架构 --- # 数据 Source 端架构 ## 1. 概述 ### 1.1 问题背景 分布式系统中的数据源读取端面临几个挑战: - **并行度**:如何从单个数据源并行读取数据? - **容错**:失败后如何从中断处恢复? - **动态分配**:如何处理工作节点失败并重新分配工作? - **有界 vs 无界**:如何统一批处理和流式数据源? - **反压**:如何处理下游处理缓慢的情况? ### 1.2 设计目标 SeaTunnel 的 Source 端读取 API 旨在: 1. **启用并行读取**:通过基于分片的并行度支持可扩展性 2. **确保容错**:检查点分片状态以实现精确一次处理 3. **分离协调与执行**:枚举器(主节点)和读取器(工作节点)分离 4. **支持动态分配**:在失败或不平衡时重新分配分片 5.
**统一批处理和流处理**:有界和无界数据源的单一 API ### 1.3 适用场景 - 基于文件的数据源(本地文件、HDFS、S3、OSS)等 - 数据库数据源(MySQL、PostgreSQL、Oracle、JDBC 兼容)等 - 消息队列数据源(Kafka、Pulsar、RabbitMQ)等 - CDC 数据源(MySQL CDC、PostgreSQL CDC、Oracle CDC)等 - 流式数据源(Socket、HTTP、自定义协议)等 ## 2. 架构设计 ### 2.1 整体架构 ``` ┌──────────────────────────────────────────────────────────────┐ │ 协调端(master/coordinator 侧) │ │ │ │ ┌────────────────────────────────────────────────────┐ │ │ │ SourceSplitEnumerator │ │ │ │ │ │ │ │ • 在 run() 中发现/生成分片(实现自定义) │ │ │ │ • 分配分片给读取器 │ │ │ │ • 处理读取器注册 │ │ │ │ • 处理分片请求 │ │ │ │ • 从失败的读取器回收分片 │ │ │ │ • 快照枚举器状态 │ │ │ │ • 发送/接收自定义事件 │ │ │ └────────────────────────────────────────────────────┘ │ │ │ │ └────────────────────────────┼───────────────────────────────────┘ │ (分片分配) ▼ ┌──────────────────────────────────────────────────────────────┐ │ TaskExecutionService(工作节点侧) │ │ │ │ ┌────────────────────────────────────────────────────┐ │ │ │ SourceReader │ │ │ │ │ │ │ │ • 接收分配的分片 │ │ │ │ • 从分片读取数据 │ │ │ │ • 向下游发送记录 │ │ │ │ • 快照读取器状态(分片进度) │ │ │ │ • 处理分片完成 │ │ │ │ • 发送/接收自定义事件 │ │ │ └────────────────────────────────────────────────────┘ │ │ │ │ └────────────────────────────┼─────────────────────────────────┘ │ ▼ SeaTunnelRow (到转换/数据 Sink ) ``` ### 2.2 核心组件 #### SeaTunnelSource(工厂接口) 作为创建读取器和枚举器的工厂的顶层接口。 本节仅保留核心契约说明,完整签名以源码为准: **关键契约**: - `getBoundedness()`:声明 BOUNDED/UNBOUNDED - `createReader()`:创建运行在工作节点侧的 `SourceReader` - `createEnumerator()` / `restoreEnumerator()`:创建/恢复运行在主节点侧的 `SourceSplitEnumerator` - `getProducedCatalogTables()`:声明输出的表元数据(`CatalogTable` 列表,支持多表/模式信息) - `getSplitSerializer()` / `getEnumeratorStateSerializer()`:split/枚举器状态序列化器(用于网络传输与 checkpoint) #### SourceSplit(最小可序列化单元) 表示数据的可分区单元。 **核心约束**: - **可独立处理**:split 表达一个可被单个 reader 独立读取的范围(例如文件片段、分区、主键范围)。 - **可序列化传输**:split 需要能在主节点与工作节点之间传递。 - **可重分配**:reader 失败时,未完成 split 必须可回收并分配给其他 reader。 **实现示例**: - 文件类:`(filePath, startOffset, length)` 或 “单文件一个 split” - JDBC 类:`(queryRange / shardKeyRange / partition)` - Kafka 类:`(topic, partition, 
startOffset)` **设计说明**: - 分片必须可序列化以进行网络传输 - 分片状态(例如,当前偏移量)单独存储在读取器状态中 - 分片可以重新分配给不同的读取器 ### 2.3 交互流程 #### 初始启动流程 ```mermaid sequenceDiagram participant Coord as 框架(协调端) participant Enum as SourceSplitEnumerator participant Worker as TaskExecutionService participant Reader as SourceReader Coord->>Enum: createEnumerator(context) Enum->>Enum: open() Enum->>Enum: run()\n(内部完成分片发现/生成) Worker->>Reader: createReader(context) Coord->>Enum: registerReader(subtaskId) Reader->>Reader: context.sendSplitRequest() Enum->>Enum: handleSplitRequest(subtaskId) Enum->>Reader: assignSplit(splits) Reader->>Reader: addSplits(splits) Reader->>Reader: pollNext(collector) Reader->>Worker: collect(record) ``` #### 检查点流程 ```mermaid sequenceDiagram participant CP as CheckpointCoordinator participant Enum as SourceSplitEnumerator participant Reader as SourceReader CP->>Reader: triggerBarrier(checkpointId) Reader->>Reader: snapshotState(checkpointId) Reader->>CP: ack(readerState) CP->>Enum: snapshotState(checkpointId) Enum->>Enum: 快照枚举器状态 Enum->>CP: ack(enumeratorState) CP->>CP: 收到所有确认 CP->>CP: 持久化检查点 ``` #### 失败恢复流程 ```mermaid sequenceDiagram participant Coord as 框架(协调端) participant Enum as SourceSplitEnumerator participant OldReader as 失败的读取器 participant NewReader as 新读取器 OldReader->>OldReader: [失败] Coord->>Enum: addSplitsBack(失败读取器的分片) Enum->>Enum: 标记分片为待处理 Coord->>NewReader: 在新工作节点上部署 NewReader->>NewReader: restoreState(checkpointedState) Coord->>Enum: registerReader(subtaskId) Enum->>NewReader: assignSplit(恢复的分片) NewReader->>NewReader: 从检查点偏移量恢复 ``` ## 3. 
关键实现 ### 3.1 SourceSplitEnumerator 接口 枚举器在主节点侧运行并协调分片分配。 **关键契约(摘要)**: - `run()`:枚举/发现分片并驱动分配逻辑 - `registerReader(subtaskId)`:注册 reader(由引擎调用) - `handleSplitRequest(subtaskId)`:处理 reader 请求分片 - `addSplitsBack(splits, subtaskId)`:reader 失败时回收未完成分片 - `snapshotState(checkpointId)`:快照枚举器状态(注意与 `run()` 的并发调用约束) **关键职责**: - **分片发现**:从数据源生成分片(文件、分区、分片) - **分配策略**:决定哪些分片分配给哪些读取器 - **动态处理**:处理读取器注册、分片请求、失败 - **状态管理**:快照剩余分片和分配状态 **典型实现思路(伪代码示意)**: ``` on run(): pendingSplits += newlyDiscoveredSplits # 分片发现/生成逻辑由实现决定 on handleSplitRequest(subtaskId): if pendingSplits not empty: assignSplit(subtaskId, nextSplit) else: signalNoMoreSplits(subtaskId) on addSplitsBack(splits): pendingSplits += splits ``` ### 3.2 SourceReader 接口 读取器在工作节点上运行并执行实际的数据读取。 **关键契约(摘要)**: - `pollNext(output)`:拉取下一批数据(建议非阻塞/可限时) - `addSplits(splits)`:接收枚举器分配的 splits - `snapshotState(checkpointId)`:返回 split checkpoint state(实际接口返回 `List`) - `handleNoMoreSplits()`:收到无更多 split 的信号 - `CheckpointListener` 回调:由框架触发 checkpoint 完成/中止通知 **关键职责**: - **数据读取**:从分配的分片拉取记录 - **进度跟踪**:跟踪每个分片内的偏移量/位置 - **状态管理**:快照分片进度以进行恢复 - **分片管理**:处理分片分配、完成和删除 **典型实现思路(伪代码示意)**: ``` pollNext(output): if no active split: request split if queue empty else activate next split read batch records from active split into output snapshotState(checkpointId): return remaining/unconsumed splits (and progress via split内部状态或外部offset映射) ``` ### 3.3 SourceEvent(自定义通信) 允许枚举器和读取器交换自定义消息。 **核心约束**:事件需可序列化,用于 `SourceReader` 与 `SourceSplitEnumerator` 之间的自定义通信。 **使用场景**: - 动态分区发现(Kafka、HDFS) - 运行时配置更改 - 自定义协调逻辑 ## 4. 
设计考量 ### 4.1 设计权衡 #### 枚举器-读取器分离 **优点**: - 清晰分离协调(主节点)和执行(工作节点) - 枚举器可以在读取器不知情的情况下重新分配分片 - 集中协调简化分片分配逻辑 - 容错:枚举器和读取器独立失败 **缺点**: - 额外的网络通信(分片分配消息) - 连接器开发人员的 API 更复杂 - 如果枚举器速度慢,可能成为瓶颈 **缓解措施**: - 异步分片分配 - 批量分片请求/分配 - 延迟分片发现 #### 分片粒度 **粗粒度分片**(少量大分片): - **优点**:较少的协调开销 - **缺点**:负载均衡差,恢复时间长 **细粒度分片**(许多小分片): - **优点**:更好的负载均衡,更快的恢复 - **缺点**:更高的协调开销 **经验建议(仅供参考)**:按数据源特性与作业目标在“负载均衡/协调开销/恢复耗时”之间权衡分片粒度;不要在文档里把某个固定大小当作必然最佳值。 ### 4.2 性能考量 #### 批量读取 建议批量读取而不是逐条读取,以摊销 I/O 与序列化开销。 **好处**: - 摊销每条记录的开销 - 更好的 CPU 缓存利用率 - 减少锁竞争 #### 非阻塞轮询 建议在无可用数据时快速返回,由框架按调度节奏再次调用,避免阻塞工作线程。 **好处**: - 避免阻塞工作线程 - 启用反压处理 - 更好的资源利用率 #### 连接池 数据库类 Source 建议使用连接池并控制并发连接数,避免对源端造成压垮式压力。 ### 4.3 可扩展性 #### 自定义分片分配策略 自定义分配策略应基于可观测信号(负载、数据局部性、split 大小差异)并确保失败回收路径可用。 典型策略包括:按 split 大小做负载均衡、按数据局部性优先分配、对热点 reader 做节流等。 #### 动态分片发现 动态分片发现通常用于“分区会随时间变化”的数据源(如 Kafka、目录新增文件等)。推荐的设计方式是: 1. **周期性发现**:枚举器按固定周期扫描新分区/新文件,并将其转换为新的 split。 2. **增量分配**:新 split 作为增量加入待分配队列,由分配策略按负载分发给 reader。 3. **一致性边界**:对“发现时点”与“开始消费时点”的关系做明确约束(例如:从发现时刻开始消费;或支持从指定 offset/时间戳消费)。 4. **与 checkpoint 的关系**:必须确保“新 split 的出现”在故障恢复后可重放(通过枚举器状态快照或外部可重复发现的元数据源实现)。 ## 5. 最佳实践 ### 5.1 使用建议 **1. 分片大小** - 文件:按文件系统与下游吞吐能力合理切分(例如按 block/文件/分区等天然边界) - 数据库:按分片键范围/分页区间/分区等可独立读取的边界切分 - 消息队列:通常使用原生分区(如 Kafka 分区)作为 split 边界 **2. 状态管理** - 保持分片状态小(每个分片 < 1MB) - 使用偏移量/位置而不是缓冲数据 - 高效序列化(Kryo、Protobuf) **3. 错误处理** 建议将错误分为两类并采用不同策略: - **瞬态错误**(网络抖动、临时超时、可重试的限流):允许有限次数重试,并使用退避策略(exponential backoff + jitter),同时把重试次数/最后错误输出到指标与日志。 - **致命错误**(配置错误、权限不足、协议不兼容、数据不可解析且无法跳过):应快速失败并把异常向框架上抛,触发作业失败或按作业级策略处理。 注意事项: - 避免在工作线程里进行长时间 sleep;如果必须退避,优先采用非阻塞式调度或由框架驱动下一次 poll。 - 对“可跳过的坏数据”要显式配置并记录(计数、采样、落盘/死信),默认不建议静默吞掉。 **4. 资源管理** 资源管理建议: - 对所有外部资源(连接、游标/ResultSet、文件句柄、线程池、缓冲区)建立“创建-使用-关闭”的明确生命周期,并保证 close 在异常路径也能执行。 - 优先使用连接池并设置上限,避免并发 reader 放大源端压力。 - 释放顺序建议与依赖关系一致(先游标/会话,后连接/池)。 ### 5.2 常见陷阱 **1. 阻塞 pollNext()** 反例:在 `pollNext()` 中无限期阻塞(例如等待队列/网络直到有数据),会占用工作线程并破坏框架调度。 推荐: - 使用非阻塞或有超时的轮询,没数据时快速返回,让框架按节奏再次调用。 - 把“等待数据”的职责交给外部组件(如有界队列 + 生产线程),但 reader 侧仍应遵循非阻塞/可中断原则。 **2. 
大状态** 反例:把整段数据缓冲进 checkpoint state,会导致状态膨胀、checkpoint 变慢、恢复时间不可控。 推荐: - 状态只保存“可重放位置”(offset、游标位置、文件 path+position、分区+时间戳等)。 - 把缓存留在内存并可丢弃,让恢复依赖可重复读取(replay)而不是依赖大状态。 **3. 忘记请求分片** 反例:当本地没有可读 split 时直接返回,且没有向框架请求更多 split,会导致 reader 长期空转。 推荐: - 当待处理 split 为空时,主动触发 split request(或进入“等待分片”的可调度状态)。 - 同时输出指标(例如 pending split 数、空轮询次数),便于发现枚举器未分配/分配失衡问题。 ### 5.3 调试技巧 **1. 启用调试日志** 建议输出“可定位”的调试日志(并可按配置开关): - 当前 split 标识、消费位置(offset/position)、批大小 - 上次 checkpoint 的 id/时间 - 最近一次错误类型与重试次数 **2. 跟踪指标** 建议最少暴露以下指标,便于容量规划与排障: - 吞吐:records/s、bytes/s - 延迟:端到端 lag(按时间戳/offset) - backlog:待处理 split 数、每个 split 的剩余量 - 可靠性:重试次数、失败次数、坏数据计数 **3. 测试分片重新分配** 建议用“故障注入”的方式验证 split 回收与再分配: - reader 异常退出/超时心跳 -> enumerator 回收其已分配但未完成的 splits - 新 reader 加入 -> 能重新领取并从正确位置继续消费 - 验证点:无重复消费(或重复可被幂等吸收)、无数据丢失、恢复耗时可接受 ## 6. 相关资源 - [架构概览](../overview.md) - [设计理念](../design-philosophy.md) - [数据 Sink 架构](sink-architecture.md) - [检查点机制](../fault-tolerance/checkpoint-mechanism.md) - [如何创建您的连接器](../../developer/how-to-create-your-connector.md) ## 7. 参考资料 ### 示例连接器 - **简单数据源**:FakeSource(生成测试数据) - **文件数据源**:FileSource(本地/HDFS/S3 文件) - **数据库数据源**:JdbcSource(JDBC 兼容数据库) - **流式数据源**:KafkaSource(Apache Kafka) - **CDC 数据源**:MySQLCDCSource(MySQL binlog) ### 进一步阅读 - Apache Flink FLIP-27:["Refactored Source API"](https://cwiki.apache.org/confluence/display/FLINK/FLIP-27%3A+Refactor+Source+Interface) - Kafka Consumer:[Consumer Groups and Partition Assignment](https://kafka.apache.org/documentation/#consumerconfigs) ================================================ FILE: docs/zh/architecture/api-design/translation-layer.md ================================================ --- sidebar_position: 1 title: 转换层 --- # 转换层架构 ## 1. 概述 ### 1.1 问题背景 SeaTunnel 提供统一的连接器 API,但作业需要在不同的执行引擎上运行: - **引擎多样性**: Flink、Spark、SeaTunnel Engine (Zeta) 具有不同的 API - **代码重复**: 没有转换,每个连接器需要 3 个实现 - **维护负担**: Bug 修复需要在所有实现中进行更改 - **API 演化**: 引擎 API 变更会破坏连接器 - **用户体验**: 用户希望跨引擎的一致行为 ### 1.2 设计目标 SeaTunnel 的转换层旨在: 1. **实现可移植性**: 相同的连接器可在任何引擎上运行 2. 
**隐藏复杂性**: 连接器开发者只需学习 SeaTunnel API 3. **保持保真度**: 跨引擎保留语义保证 4. **最小化开销**: 尽量降低转换对吞吐/延迟的影响(取决于 connector、类型转换与引擎实现) 5. **支持演化**: 将连接器与引擎 API 变更隔离 ### 1.3 架构概览 ``` ┌──────────────────────────────────────────────────────────────┐ │ SeaTunnel API 层 │ │ (引擎独立的连接器接口) │ │ │ │ SeaTunnelSource SeaTunnelSink SeaTunnelTransform │ └──────────────────────────────────────────────────────────────┘ │ │ 转换层 ┌─────────────┼─────────────┐ ▼ ▼ ▼ ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ │ Flink 适配器 │ │ Spark 适配器 │ │ Zeta (原生) │ │ │ │ │ │ │ │ FlinkSource │ │ SparkSource │ │ 直接 │ │ FlinkSink │ │ SparkSink │ │ 执行 │ └──────────────────┘ └──────────────────┘ └──────────────────┘ │ │ │ ▼ ▼ ▼ ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ │ Apache Flink │ │ Apache Spark │ │ SeaTunnel Engine │ │ 运行时 │ │ 运行时 │ │ (Zeta) │ └──────────────────┘ └──────────────────┘ └──────────────────┘ ``` ## 2. Flink 转换层 ### 2.1 FlinkSource 适配器 将 `SeaTunnelSource` 适配到 Flink 的 `Source` 接口。 **适配点(语义级)**: - **有界/无界语义**:把 SeaTunnel 的 boundedness 映射到 Flink 的 `Boundedness` - **Reader 创建**:把 Flink `SourceReaderContext` 适配为 SeaTunnel reader context,并用 wrapper 把 SeaTunnel reader 包装成 Flink reader - **Enumerator 创建**:把 Flink `SplitEnumeratorContext` 适配为 SeaTunnel enumerator context,并包装成 Flink enumerator - **序列化器**:把 SeaTunnel 的 split/state 序列化器适配到 Flink 的 `SimpleVersionedSerializer` ### 2.2 FlinkSourceReader 适配器 **适配点(语义级)**: - `start/open`:把 Flink 的 reader 生命周期委托给 SeaTunnel reader - `pollNext`:把 Flink `ReaderOutput` 适配为 SeaTunnel collector,并映射“有无数据可读”的返回语义 - `addSplits`:把 Flink 的 split wrapper 解包为 SeaTunnel split 再下发 - `snapshotState`:把 SeaTunnel reader 的快照结果包装为 Flink 侧可序列化的 split/state - `notifyCheckpointComplete`:把 checkpoint 完成通知下沉到 SeaTunnel reader(用于清理/提交等) ### 2.3 FlinkSourceEnumerator 适配器 **适配点(语义级)**: - 生命周期:Flink enumerator 的 `start` 驱动 SeaTunnel enumerator 的 open/run - 分片请求:Flink 的 split request 透传给 SeaTunnel enumerator 的分片分配逻辑 - 分片回退:把回退 split 解包并回交给 SeaTunnel enumerator - 
状态快照:把 enumerator state 包装成 Flink 可持久化的 wrapper,以参与 checkpoint ### 2.4 上下文适配器 **FlinkSourceReaderContext**: - 下标与并行度:把 Flink 的 subtask index 映射为 SeaTunnel reader 的 index - 事件通道:把 SeaTunnel 的 SourceEvent 包装后发送到 Flink 的 coordinator/event channel - 分片请求:Flink 会在运行时自动触发 split request,SeaTunnel 侧通常不需要显式触发 **FlinkSourceSplitEnumeratorContext**: - 并行度/注册 reader:把 Flink 的 runtime 信息暴露给 SeaTunnel enumerator - 分片分配:把 SeaTunnel split 包装为 Flink split 并通过 Flink 的 assignment API 下发 - no-more-splits:在有界场景下通知 reader 结束 - 事件下发:把 SeaTunnel event 包装为 Flink event 并发送给指定 reader ### 2.5 FlinkSink 适配器 **适配点(语义级)**: - writer:把 Flink `InitContext` 适配为 SeaTunnel writer context 并创建 SeaTunnel `SinkWriter` - committer/global committer:把 SeaTunnel 的两阶段提交组件包装为 Flink 的 committer 体系 - serializer:把 SeaTunnel 的 commitInfo / writerState 序列化器适配为 Flink `SimpleVersionedSerializer` ### 2.6 FlinkSinkWriter 适配器 **适配点(语义级)**: - `write`:把 Flink sink writer 的写入请求委托给 SeaTunnel `SinkWriter.write` - `prepareCommit`:把 SeaTunnel `prepareCommit()` 的可选 commitInfo 映射为 Flink 的 committable 列表 - `snapshotState`:直接使用 SeaTunnel writer 的快照结果参与 Flink checkpoint - `close`:委托关闭,确保释放外部资源 ## 3. 
Spark 转换层 ### 3.1 SparkSource 适配器 将 `SeaTunnelSource` 适配到 Spark 的数据源接口(Spark 2.4 与 Spark 3.x 使用的 DataSource API 形态不同,具体以对应版本适配模块实现为准)。 **适配点(语义级)**: - `readSchema`:把 SeaTunnel `CatalogTable/TableSchema` 映射为 Spark `StructType` - `planInputPartitions`:在 Spark 的批处理模型下,通常一次性生成全部 splits,并为每个 split 构造一个 `InputPartition` Spark 的执行模型偏“批式规划”,因此枚举器的职责更像是“规划阶段生成分片集合”,而不是长期运行的调度器。 ### 3.2 SparkInputPartition **适配点(语义级)**: - 每个 `InputPartition` 绑定一个 SeaTunnel split - `createPartitionReader` 创建 SeaTunnel reader,注入该 split,并把输出转换为 Spark `InternalRow` ### 3.3 SparkPartitionReader **适配点(语义级)**: - 初始化:创建并打开 SeaTunnel reader,下发 split - 读取循环:从 SeaTunnel reader 拉取记录并转换为 Spark `InternalRow`(必要时使用缓冲队列适配 pull-based API) - 资源释放:关闭 reader 并释放外部资源 ### 3.4 SparkSink 适配器 **适配点(语义级)**: - writer factory:在 executor 侧创建写入器实例并接收 Spark `InternalRow` - commit coordinator:当目标端存在提交器时启用 Spark 的提交协调路径 - commit/abort:把 Spark 的提交消息转换为 SeaTunnel 的 commitInfo 列表,并交由 SeaTunnel `SinkCommitter` 执行(要求幂等/可重试) ## 4. 序列化适配器 ### 4.1 FlinkSimpleVersionedSerializer **适配点(语义级)**: - 版本:将 SeaTunnel serializer 的版本号透传到 Flink 侧 - 序列化/反序列化:直接委托给 SeaTunnel serializer,以保证跨引擎一致的状态编码 ## 5. 类型转换 ### 5.1 Spark 类型转换 **适配点(语义级)**: - Schema:将 SeaTunnel `TableSchema` 映射为 Spark `StructType` - DataType:按 `SqlType` 做一一映射(整数/浮点/decimal/string/boolean/date/timestamp/bytes/array/map 等) - 兼容性:当引擎侧类型更细分时(例如 timestamp 语义差异),以 SeaTunnel 的“最小公分母”语义为准,并允许通过配置选择具体映射策略 ## 6. 性能考虑 ### 6.1 转换开销 转换层带来的开销主要来自上下文包装、类型转换、序列化/反序列化等。实际开销高度依赖具体 connector 的 I/O 特性与数据类型分布,因此本文不提供固定比例或吞吐数字,避免与真实环境产生偏差。 ### 6.2 优化技术 **批量类型转换**: - 优先批量转换(向量化/批处理)以摊销 per-row 转换成本 - 在不改变语义的前提下减少对象创建与复制(降低 GC 压力) **避免不必要的包装**: - 优先复用已有序列化能力,避免重复 wrapper 造成的额外拷贝 - 在必须 wrapper 时采用惰性策略:仅在 checkpoint/网络传输时做包装 ## 7. 
限制和解决方法 ### 7.1 引擎特定功能 **问题**: 某些引擎功能在 SeaTunnel 中没有等效项。 **示例**: Flink 的 `WatermarkStrategy` Flink 的 watermark/事件时间语义属于引擎特性,SeaTunnel 的连接器 API 默认不直接暴露该能力。 **解决方法**: 提供引擎特定配置 ```hocon source { Kafka { # SeaTunnel 配置 topic = "my_topic" # 引擎特定配置(仅用于 Flink) flink.watermark.strategy = "bounded-out-of-orderness" flink.watermark.max-out-of-orderness = "5s" } } ``` ### 7.2 类型系统差异 **问题**: 类型系统不完全对齐。 **示例**: Spark 有 `TimestampType`,Flink 有 `LocalZonedTimestampType` 和 `TimestampType`。 **解决方法**: 使用最小公分母 SeaTunnel 侧使用统一抽象类型;转换层根据引擎能力与用户配置决定映射到哪一种引擎类型。 ## 8. 最佳实践 ### 8.1 连接器开发 **应该做的**: - 仅实现 SeaTunnel API - 在多个引擎上测试 - 使用 SeaTunnel 类型 **不应该做的**: - 在连接器代码中引用引擎特定 API - 假设特定引擎行为 - 使用引擎特定优化 ### 8.2 测试 **在所有引擎上测试**: - 建议使用参数化/矩阵测试:同一套连接器用例在 Flink/Spark/Zeta 上跑 - 覆盖语义一致性:exactly-once、checkpoint 恢复、schema 兼容、分片重新分配等 ## 9. 相关资源 - [数据 Source 架构](../api-design/source-architecture.md) - [目标端 Sink 架构](../api-design/sink-architecture.md) - [设计理念](../design-philosophy.md) ================================================ FILE: docs/zh/architecture/design-philosophy.md ================================================ --- sidebar_position: 2 title: 设计理念 --- # SeaTunnel 设计理念 ## 1. 概述 本文档阐述了塑造 SeaTunnel 架构的核心设计原则、理念和权衡。理解这些原则有助于贡献者做出一致的设计决策,并帮助用户了解系统的优势和局限性。 ## 2. 
核心设计原则 ### 2.1 引擎独立性 **原则**:将连接器逻辑与执行引擎解耦。 **动机**: - 数据同步专用引擎 Zeta 出现之前,用户可能已有 Flink 或 Spark 集群 - 不同引擎适用于不同场景(批处理 vs 流处理、资源约束) - 连接器开发人员不应需要理解多个引擎 API **实现**: - 统一的 SeaTunnel API 层抽象引擎特定细节 - 转换层将 SeaTunnel API 适配到引擎特定 API - 连接器逻辑尽量与执行引擎解耦;在转换层支持的前提下,同一套连接器实现可复用到不同引擎(具体可用性以连接器能力与引擎支持为准) **权衡**: - **优点**:最大化可重用性 - 复用连接器逻辑,减少引擎适配重复开发 - **优点**:更简单的连接器开发 - 只需学习单一 API - **缺点**:无法利用引擎特定的优化 - **缺点**:额外的转换开销 - **缓解措施**:转换层轻薄且优化;大部分开销在 I/O 而非转换 **示例**:连接器仅实现 SeaTunnel API 的抽象(Source/Sink/Transform),不同执行引擎通过转换层完成适配;因此连接器逻辑与引擎 API 变更解耦。 ### 2.2 协调与执行分离 **原则**:将控制逻辑(协调)与数据处理(执行)分离。 **动机**: - 协调逻辑是单线程且轻量级的 - 执行逻辑是并行且资源密集的 - 容错需要为每个部分独立管理状态 **实现原理**: **协调层(Master 侧)**: - 运行位置:主节点,维护全局视图 - 核心职责:资源发现、工作分配、故障检测、状态协调 - 运行特点:单线程、轻量级、不处理实际数据 - 维护状态:分配计划、待处理工作单元、全局进度 **执行层(Worker 侧)**: - 运行位置:工作节点,独立并行执行 - 核心职责:本地数据处理、进度汇报、参与检查点 - 运行特点:多线程、资源密集、处理大量数据 - 维护状态:本地处理进度、缓冲数据、执行上下文 **通信机制**: - 协调层 → 执行层:通过事件分发工作(如:分配新的数据分片) - 执行层 → 协调层:通过消息汇报进度(如:完成分片、请求新工作) - 检查点时:各自快照自己的状态,互不干扰 **权衡**: - **优点**:清晰的关注点分离 - **优点**:枚举器可以在失败时重新分配分片 - **优点**:提交器实现全局事务协调 - **缺点**:额外的通信开销 - **缺点**:连接器开发人员的 API 更复杂 - **缓解措施**:合理的默认值;简单连接器可以使用简单的枚举器/提交器 **示例**: - 主节点侧:负责“发现/生成工作单元(split)+ 分配 + 回收 + 快照状态”。 - 工作节点侧:负责“执行读取/写入 + 汇报进度 + 参与 checkpoint”。 这样设计的关键原因是:容错需要区分“控制状态”(分配/待处理 split)和“执行进度”(每个 split 的 offset/position),才能在失败后做到精准恢复与快速重分配。 ### 2.3 基于分片的并行度 **原则**:将数据源划分为可独立处理的分片。 **动机**: - 实现无需紧密协调的并行处理 - 支持动态负载均衡和故障恢复 - 提供检查点粒度(每个分片的进度) **实现**: - 数据源划分为分片(文件块、DB 分区、Kafka 分区等) - 枚举器延迟或急切地生成分片 - 读取器独立处理分片 - 未处理的分片可以在失败时重新分配 **权衡**: - **优点**:出色的可扩展性 - 添加工作节点以处理更多分片 - **优点**:细粒度故障恢复 - 仅需要重新处理失败的分片 - **优点**:动态负载均衡 - 将更多分片分配给空闲的工作节点 - **缺点**:某些数据源的分片生成开销 - **缺点**:需要跟踪每个分片的状态 - **缓解措施**:延迟分片生成;分片状态轻量级 **示例**: - 数据库场景:split 通常表达“分片键范围/分页区间/分区”一类可独立读取的范围。 - 文件场景:split 通常表达“文件 + 起始偏移 + 长度”或“单文件”。 这里不展示具体结构体代码,重点在于 split 的边界:必须能被独立处理、可序列化传输、可在失败后重新分配。 ### 2.4 通过两阶段提交实现精确一次语义 **原则**:保证端到端精确一次数据传递。 **动机**: - 数据集成不能丢失或重复数据 - 失败可能在任何时候发生(网络、进程崩溃) - 外部系统需要事务保证 **实现原理**: 两阶段提交协议将数据写入过程分为两个独立阶段: 1. 
**准备阶段(Prepare Phase)**: - 时机:在检查点屏障到达时触发 - 动作:写入端生成"可提交但未提交"的凭证(如事务 ID、临时文件路径) - 约束:不对外部系统产生可见副作用(数据对外不可见) - 状态:凭证信息随检查点一起持久化 2. **提交阶段(Commit Phase)**: - 时机:检查点完整成功后 - 动作:协调端使用凭证信息原子性地提交变更(如提交事务、移动文件) - 效果:数据对外部系统可见 - 保证:幂等性,重复提交不产生副作用 3. **中止处理(Abort Handling)**: - 时机:检查点失败或超时 - 动作:清理准备阶段产生的临时资源(如回滚事务、删除临时文件) - 效果:保证不会产生部分写入或不一致状态 **权衡**: - **优点**:强一致性保证 - **优点**:自动从失败中恢复 - **缺点**:需要数据 Sink 中的事务支持(或幂等操作) - **缺点**:增加延迟(数据仅在提交后可见) - **缺点**:提交信息的额外状态 - **缓解措施**:可选特性;非事务性数据 Sink 可使用至少一次模式 **示例**:典型的 Exactly-Once 落地方式是“写入端先生成可提交凭证(commit info),checkpoint 成功后再由协调端执行最终提交”。这样做的原因是:把副作用(对外部系统的可见变更)延后到 checkpoint 成功之后,避免失败重启时产生重复可见写入。 ### 2.5 模式作为一等公民 **原则**:将模式视为通过管道传播的显式、类型化的元数据。 **动机**: - 数据集成需要模式转换和验证 - 模式演化(DDL 变更)必须显式处理 - 类型不匹配应该尽早捕获 **实现**: - `CatalogTable` 封装完整的表元数据 - `TableSchema` 定义结构(列、主键、约束) - 模式通过数据源 → 转换 → 数据 Sink 传播 - `SchemaChangeEvent` 表示 DDL 变更(ADD/DROP/MODIFY 列) **权衡**: - **优点**:类型安全 - 在作业提交时验证模式 - **优点**:模式演化 - 在运行时处理 DDL 变更 - **优点**:更好的错误消息 - 尽早检测模式不匹配 - **缺点**:无模式数据源的额外复杂性 - **缺点**:某些数据源的模式发现开销 - **缓解措施**:模式推断助手;可选的模式覆盖 **示例**:数据源产出“显式模式”(列、主键、约束、分区、选项等),转换对模式进行验证与映射,数据 Sink 在接收端再次校验。这样做的原因是:把“类型不匹配/缺列/主键冲突”等问题尽早暴露在提交阶段,而不是让它们在运行时以隐式的脏数据形式出现。 ### 2.6 具有类加载器隔离的插件架构 **原则**:连接器是动态加载的插件,具有隔离的依赖。 **动机**: - 避免依赖冲突(例如,多个 JDBC 驱动程序版本) - 实现热插拔连接器,无需重新构建核心 - 减少核心分发大小 **实现**: - 用于连接器发现的 Java SPI - 每个连接器具有隔离的类加载器 - 遮蔽插件依赖以避免冲突 - 用于实例化的工厂模式 **权衡**: - **优点**:依赖隔离 - 无版本冲突 - **优点**:更小的核心分发 - **优点**:易于添加第三方连接器 - **缺点**:类加载器复杂性 - **缺点**:某些共享库(如 Guava)可能存在问题 - **缓解措施**:谨慎遮蔽;核心中的共享通用库 **示例**: ``` seatunnel-engine/lib/ # 核心库 connector-jdbc/lib/ # JDBC 驱动程序(隔离) connector-kafka/lib/ # Kafka 客户端(隔离) # 每个连接器由单独的 ClassLoader 加载 ConnectorClassLoader(connector-jdbc) -> 加载 mysql-connector-java-8.0.26.jar ConnectorClassLoader(connector-kafka) -> 加载 kafka-clients-3.0.0.jar ``` ### 2.7 具有检查点存储抽象的状态管理 **原则**:将状态管理与存储实现解耦。 **动机**: - 不同部署需要不同的存储(HDFS、S3、本地、OSS) - 状态大小差异很大(KB 到 TB) - 存储耐久性和性能要求不同 **实现**: - 可插拔 checkpoint storage(例如 localfile/hdfs 等,取决于插件与配置) - 状态的可插拔序列化 - 增量检查点支持 - 
自动状态清理 **权衡**: - **优点**:灵活性 - 根据部署选择存储 - **优点**:增量检查点减少开销 - **缺点**:存储性能影响检查点延迟 - **缺点**:生产环境需要分布式文件系统 - **缓解措施**:异步检查点上传;可配置间隔 ### 2.8 多表同步 **原则**:支持在单个作业中同步多个表。 **动机**: - 数据库迁移通常涉及数百个表 - 为每个表创建一个作业浪费资源 - 模式演化必须应用于所有表 **实现**: - `MultiTableSource` / `MultiTableSink` 包装单个表数据源/Sink - `TablePath` 将记录路由到正确的表 - 按表传播模式变更 - 支持副本以提高吞吐量 **权衡**: - **优点**:资源效率 - 一个作业而不是数百个 - **优点**:跨表一致快照 - **优点**:集中监控 - **缺点**:一个表失败可能影响其他表 - **缺点**:更复杂的错误处理 - **缓解措施**:可配置的错误容忍度;按表的指标 ## 3. 架构权衡 ### 3.1 简单性 vs 性能 **选择**:优先考虑简单性和正确性而非极端性能优化。 **理由**: - 数据集成是 I/O 密集型的,而非 CPU 密集型 - 正确的语义(精确一次)比原始速度更关键 - 简单的代码易于维护和调试 **证据**: - 网络和磁盘 I/O 主导处理时间(> 90%) - 转换层开销可以忽略不计(< 1%) - 代码可读性优先(例如,清晰的状态机,无微观优化) ### 3.2 灵活性 vs 易用性 **选择**:提供合理的默认值,同时允许高级定制。 **理由**: - 大多数用户想要简单的配置 - 高级用户需要细粒度控制 - 两种需求可以通过分层 API 满足 **实现**: - 常见情况的高级配置(例如,`jdbc://host:port/db`) - 专家的低级选项(例如,连接池调优) - 合理的默认值(并行度、检查点间隔、缓冲区大小) ### 3.3 通用性 vs 专业化 **选择**:通用 API 与专业化实现。 **理由**: - 统一的 API 简化了学习和使用 - 不同的数据源具有独特的特征(有界 vs 无界、可分片性) - 专业化发生在连接器实现中,而非 API 中 **示例**: - `SourceSplitEnumerator` 足够通用,可用于文件、数据库和消息队列 - 文件连接器使用基于文件的分片 - Kafka 连接器使用基于分区的分片 - JDBC 连接器使用基于查询的分片 ### 3.4 强一致性 vs 延迟 **选择**:提供精确一次(高延迟)和至少一次(低延迟)模式。 **理由**: - 某些应用需要强一致性(金融、计费) - 其他应用可以容忍重复以获得更低延迟(日志、指标) - 让用户根据需求选择 **配置**: ```hocon env { checkpoint.mode = "EXACTLY_ONCE" # 或 "AT_LEAST_ONCE" checkpoint.interval = 60000 # 毫秒 } ``` ## 4. 从 V1 到 V2 的演进 ### 4.1 V1 的局限性 SeaTunnel V1(2.3.0 之前)存在重大架构局限性: 1. **引擎特定连接器**:Spark 和 Flink 的单独实现 2. **无统一 API**:无抽象层,与引擎紧密耦合 3. **有限的容错**:完全依赖引擎检查点 4. **无模式管理**:模式隐式,无演化支持 5. **仅单表**:不支持多表同步 ### 4.2 V2 改进 SeaTunnel V2(2.3.0+)重新设计了架构: | 方面 | V1 | V2 | |-----|----|----| | **API** | 引擎特定 | 统一的 SeaTunnel API | | **连接器** | 重复代码 | 单一实现 | | **容错** | 依赖引擎 | 显式检查点协议 | | **模式** | 隐式 | 显式 CatalogTable | | **多表** | 不支持 | 原生支持 | | **引擎支持** | Spark、Flink | Spark、Flink、Zeta | | **精确一次** | 部分 | 端到端 2PC | ### 4.3 迁移路径 V1 和 V2 连接器共存但使用不同的 API: - V1 连接器:`seatunnel-connectors/`(已弃用) - V2 连接器:`seatunnel-connectors-v2/`(推荐) V2 是未来;V1 处于维护模式。 ## 5. 
关键设计决策 ### 5.1 为什么分离枚举器和读取器? **替代方案**:单个组件同时处理分片生成和读取。 **决策**:分离组件。 **理由**: - 分片生成是协调逻辑(应在主节点上运行) - 数据读取是执行逻辑(应在工作节点上运行) - 一方的失败不应影响另一方 - 允许在不重启读取器的情况下重新分配分片 ### 5.2 为什么三级数据 Sink 提交(写入器 → 提交器 → 聚合提交器)? **替代方案**:两级(写入器 → 提交器)或直接写入器提交。 **决策**:可选的三级提交。 **理由**: - **写入器**:并行、有状态、每个任务 - **提交器**:并行、无状态、聚合每个写入器的提交 - **聚合提交器**:单线程、有状态、全局协调器 许多数据 Sink 只需要写入器 + 提交器;聚合提交器用于复杂情况(例如,需要单一全局操作的 Hive 表提交)。 ### 5.3 为什么 LogicalDag → PhysicalPlan 分离? **替代方案**:直接从配置生成物理执行计划。 **决策**:两阶段规划。 **理由**: - LogicalDag 表示用户意图(可移植、引擎独立) - PhysicalPlan 表示执行策略(引擎特定、优化) - 分离实现: - 跨引擎可移植性(相同的 LogicalDag,不同的 PhysicalPlan) - 优化阶段(融合、分片重新分配) - 测试(单独验证逻辑计划) ### 5.4 为什么基于管道的执行? **替代方案**:单一全局任务图。 **决策**:作业划分为管道。 **理由**: - 每个管道独立的检查点协调 - 更清晰的失败边界 - 更容易推理数据流 - 支持复杂的 DAG(多个数据源/Sink) ### 5.5 为什么不使用引擎原生检查点? **替代方案**:完全依赖 Flink/Spark 检查点机制。 **决策**:显式 SeaTunnel 检查点协议。 **理由**: - 引擎独立性 - 需要跨引擎的一致语义 - 否则 Zeta 引擎将没有检查点机制 - 对精确一次语义有更多的控制 - 统一的监控和可观测性 但是,对于 Flink 转换,SeaTunnel 检查点与 Flink 检查点对齐以避免重复。 ## 6. 经验教训 ### 6.1 成功之处 1. **引擎独立性**:通过成功添加 Zeta 引擎而无需 API 更改得到验证 2. **基于分片的并行度**:扩展到 1000+ 并行任务 3. **显式模式**:尽早捕获许多错误,实现模式演化 4. **两阶段提交**:可靠的精确一次语义 ### 6.2 可以改进之处 1. **API 复杂性**:枚举器/提交器增加了简单连接器的学习曲线 2. **类加载器问题**:遮蔽依赖偶尔冲突 3. **检查点延迟**:大状态导致检查点延迟 4. **文档差距**:架构文档落后于代码 ### 6.3 如果重新开始 1. **简化 API**:为简单的数据源/Sink 提供更高级的抽象 2. **异步 I/O 支持**:非阻塞连接器的一等异步 API 3. **内置指标**:API 中的标准化指标收集 4. **模式注册表集成**:与外部模式注册表更紧密的集成 ## 7. 结论 SeaTunnel 的架构反映了竞争关注点之间的仔细权衡: - 引擎独立性 vs 引擎特定优化 - 简单性 vs 灵活性 - 一致性 vs 延迟 - 通用性 vs 专业化 V2 重新设计解决了 V1 的主要局限性,同时建立了长期演进的原则。理解这些设计理念有助于贡献者做出一致的决策,并帮助用户了解 SeaTunnel 的优势和适用场景。 ## 8.
参考资料 - [架构概览](overview.md) - [数据 Source 架构](api-design/source-architecture.md) - [数据 Sink 架构](api-design/sink-architecture.md) - [检查点机制](fault-tolerance/checkpoint-mechanism.md) ### 学术论文 - Chandy-Lamport:["Distributed Snapshots: Determining Global States of Distributed Systems"](https://lamport.azurewebsites.net/pubs/chandy.pdf) - Flink:["Apache Flink: Stream and Batch Processing in a Single Engine"](https://asterios.katsifodimos.com/assets/publications/flink-deb.pdf) ================================================ FILE: docs/zh/architecture/engine/dag-execution.md ================================================ --- sidebar_position: 2 title: DAG 执行模型 --- # DAG 执行模型 ## 1. 概述 ### 1.1 问题背景 分布式数据处理需要将用户意图转换为可执行的分布式任务: - **抽象层次**: 如何分离逻辑意图与物理执行? - **优化**: 如何优化任务放置和数据混洗? - **流水线**: 如何执行具有多个数据 Source/Sink 的复杂 DAG? - **并行度**: 如何确定任务并行度和分布? - **故障隔离**: 如何将故障影响限制在受影响的组件内? ### 1.2 设计目标 SeaTunnel 的 DAG 执行模型旨在: 1. **关注点分离**: 逻辑规划(用户意图) vs 物理执行(运行时细节) 2. **支持优化**: 任务融合、流水线分割、资源分配 3. **支持复杂拓扑**: 多个数据源、目标端、分支、连接 4. **促进容错**: 清晰的故障边界与独立检查点 5. **最大化并行度**: 高效并行执行,最少协调开销 ### 1.3 执行模型概览 ``` 用户配置 (HOCON) │ ▼ ┌─────────────────────┐ │ LogicalDag │ 逻辑计划 (做什么) │ • LogicalVertex │ - 数据 Source/tranform 转换器/Sink 目标端动作 │ • LogicalEdge │ - 数据依赖关系 │ • Parallelism │ - 逻辑并行度 └─────────────────────┘ │ (计划生成) ▼ ┌─────────────────────┐ │ PhysicalPlan │ 物理计划 (如何执行) │ • SubPlan[] │ - 多个流水线 │ • Resources │ - 资源需求 │ • Scheduling │ - 部署策略 └─────────────────────┘ │ (流水线分割) ▼ ┌─────────────────────┐ │ SubPlan (Pipeline) │ 独立执行单元 │ • PhysicalVertex[] │ - 并行任务实例 │ • CheckpointCoord │ - 独立检查点 │ • PipelineLocation │ - 唯一标识符 └─────────────────────┘ │ (任务部署) ▼ ┌─────────────────────┐ │ PhysicalVertex │ 已部署任务组 │ • TaskGroup │ - 共址任务(融合) │ • SlotProfile │ - 分配的资源槽位 │ • ExecutionState │ - 运行状态 └─────────────────────┘ │ (执行) ▼ ┌─────────────────────┐ │ SeaTunnelTask │ 实际执行 │ • Source/Transform │ - 数据处理 │ • /Sink Logic │ - 状态管理 └─────────────────────┘ ``` ## 2. 
LogicalDag: 用户意图 ### 2.1 结构 LogicalDag 以引擎无关的方式表示用户的作业配置。 LogicalDag 的核心组成: - **logicalVertexMap**: 顶点集合(每个顶点对应一个 Source/Transform/Sink 动作) - **edges**: 边集合(描述数据流依赖关系) - **jobConfig**: 作业级配置(例如并行度默认值、容错/资源/运行参数) ### 2.2 LogicalVertex 表示单个动作(数据 Source/转换器/Sink 目标端)及其并行度。 一个 LogicalVertex 通常包含: - **vertexId**: 顶点唯一标识 - **action**: 动作类型(SourceAction / TransformChainAction / SinkAction) - **parallelism**: 并行实例数量(若未显式配置,可能由引擎推断) **动作类型**: - **SourceAction**: 封装 `SeaTunnelSource`,生产 `CatalogTable` - **TransformChainAction**: `SeaTunnelTransform` 链,转换模式 - **SinkAction**: 封装 `SeaTunnelSink`,消费 `CatalogTable` **示例**: 来自配置的直观映射关系: - Vertex 1: JDBC Source,parallelism=4 - Vertex 2: SQL Transform,parallelism=8 - Vertex 3: Elasticsearch Sink,parallelism=2 ### 2.3 LogicalEdge 表示动作之间的数据流。 一条 LogicalEdge 通常只需要描述: - **inputVertexId**: 上游顶点 - **targetVertexId**: 下游顶点 **示例**: 典型线性拓扑中的边: - JDBC Source(1) → SQL Transform(2) - SQL Transform(2) → Elasticsearch Sink(3) ### 2.4 LogicalDag 创建 从用户配置构建: LogicalDag 在作业提交/启动阶段由作业执行环境解析配置生成(可能发生在客户端或服务端),随后作为作业不可变信息的一部分交由 JobMaster 管理执行。 **过程**: 1. 解析 HOCON 配置(source、transform、sink 部分) 2. 为每个配置的组件创建 `Action` 对象 3. 从配置结构推断数据流 4. 验证模式兼容性 5. 构建 `LogicalDag` 对象 **示例配置 → LogicalDag**: ```hocon env { parallelism = 4 } source { JDBC { url = "jdbc:mysql://..." query = "SELECT * FROM orders" } } transform { Sql { query = "SELECT order_id, SUM(amount) FROM this GROUP BY order_id" } } sink { Elasticsearch { hosts = ["es-host:9200"] index = "orders_summary" } } ``` 生成的 LogicalDag: ``` Vertex 1 (JDBC 数据源, parallelism=4) │ ▼ Vertex 2 (SQL 转换器, parallelism=4) │ ▼ Vertex 3 (Elasticsearch 目标端, parallelism=4) ``` ## 3. 
PhysicalPlan: 执行策略 ### 3.1 结构 PhysicalPlan 描述如何在分布式工作节点上执行 LogicalDag。 PhysicalPlan 的核心信息通常包括: - **pipelineList(SubPlans)**: 由 LogicalDag 切分得到的多个流水线(独立执行单元) - **jobImmutableInformation**: 作业不可变信息(例如作业 ID、提交参数、依赖等) - **running state store**: 分布式状态存储(用于运行态状态、时间戳、元信息等) - **jobEndFuture**: 作业完成信号(用于协调退出、回收资源、返回结果) ### 3.2 流水线分割 LogicalDag 在生成 ExecutionPlan 时会被组织为一个或多个**流水线**(Pipeline/SubPlan)。以当前实现为准,主要规则是: 1. **按连通性拆分**:DAG 中互不相连的子图会被拆成不同流水线。 2. **遇到多输入顶点时拆分**:当存在“多输入顶点”(某个顶点有多个上游输入,例如 UNION多流汇聚)时,当前实现会沿每条 source→…→sink 的路径拆成多条线性流水线,并对共享顶点做克隆,以降低多输入拓扑在同一流水线内的协调复杂度。 说明: - 如果仅存在“一个 source 分叉到多个 sink”(多输出/分支),但没有任何多输入顶点,当前实现通常不会仅因为多个 sink 就拆分流水线;该分支拓扑仍可能在同一流水线内执行。 - 更细粒度的切分(例如按并行度/可协调能力)在代码中仍保留 TODO,后续可能演进。 **示例 1: 简单线性流水线**: ```hocon source { JDBC { } } transform { Sql { } } sink { Elasticsearch { } } ``` 生成: **1 个流水线** ``` 流水线 1: [JDBC 数据源] → [SQL 转换器] → [Elasticsearch 目标端] ``` **示例 2: 多个数据源**: ```hocon source { JDBC { plugin_output = "orders" } Kafka { plugin_output = "events" } } transform { Sql { query = "SELECT * FROM orders UNION SELECT * FROM events" } } sink { Elasticsearch { } } ``` 生成: **2 个流水线** ``` 流水线 1: [JDBC 数据源] → [SQL 转换器] → [Elasticsearch 目标端] 流水线 2: [Kafka 数据源] → [SQL 转换器] → [Elasticsearch 目标端] ``` **示例 3: 多个目标端**: ```hocon source { MySQL-CDC { } } sink { Elasticsearch { plugin_input = "MySQL-CDC" } JDBC { plugin_input = "MySQL-CDC" } } ``` 生成: **通常为 1 个流水线(包含分支)** ``` 流水线 1: [MySQL-CDC 数据源] → [Elasticsearch 目标端] └──────→ [JDBC 目标端] ``` ### 3.3 PhysicalPlan 生成 PhysicalPlan 通常由 JobMaster 在拿到 LogicalDag 后生成,并结合 ResourceManager 做资源申请与放置。 **步骤**: 1. **分析 LogicalDag**: 识别数据源、目标端和依赖关系 2. **分割为流水线**: 为每个流水线创建 SubPlan 3. **生成 PhysicalVertices**: 为每个动作创建并行实例 4. **分配资源**: 从 ResourceManager 请求槽位 5. **分配任务**: 将 PhysicalVertices 映射到槽位 6. **创建协调器**: 为每个流水线设置 CheckpointCoordinator ## 4. 
SubPlan (流水线) ### 4.1 结构 SubPlan 表示一个独立执行的流水线。 SubPlan(流水线)通常包含: - **pipelineId/pipelineLocation**: 流水线的唯一标识 - **physicalVertexList**: 此流水线中的并行任务实例列表 - **coordinatorVertexList**: 协调器类任务(如 split enumerator、聚合提交等单实例协调任务) - **checkpointCoordinator**: 本流水线的检查点协调器(独立协调域) - **pipelineStatus**: 执行状态(如 CREATED/RUNNING/FAILED/FINISHED) ### 4.2 PhysicalVertex 列表 每个并行度为 N 的 LogicalVertex 生成 N 个 PhysicalVertices。 **示例**: ``` LogicalVertex: JDBC 数据源 (parallelism = 4) ↓ PhysicalVertices: - PhysicalVertex (子任务 0, 槽位 1) - PhysicalVertex (子任务 1, 槽位 2) - PhysicalVertex (子任务 2, 槽位 3) - PhysicalVertex (子任务 3, 槽位 4) ``` ### 4.3 协调器顶点 用于协调任务的特殊顶点: - **SourceSplitEnumerator**: 通常以单实例运行,分配分片给读取器(部署位置由引擎调度决定) - **SinkAggregatedCommitter**: 当 Sink 提供 aggregated committer 时,通常以单实例运行用于全局提交协调(部署位置由引擎调度决定) 说明:`SinkCommitter` 的触发方式取决于引擎实现,并不一定体现为独立的协调器顶点;例如在 SeaTunnel Engine 中,committer 可能在 Sink 任务的 checkpoint 回调中被触发。 **示例**: ``` JDBC → Transform → Elasticsearch 的 SubPlan: physicalVertexList: - JdbcSourceTask (4 个实例) - TransformTask (4 个实例) - ElasticsearchSinkTask (4 个实例) coordinatorVertexList: - JdbcSourceSplitEnumerator (1 个实例) - ElasticsearchSinkAggregatedCommitter (1 个实例,可选) ``` ### 4.4 独立检查点 每个流水线都有自己的 `CheckpointCoordinator`: **优势**: - 独立的检查点间隔 - 隔离的故障域 - 减少协调开销 - 简化屏障对齐 **示例**: ``` 流水线 1 (JDBC → ES): CheckpointCoordinator 按作业配置的间隔触发 仅管理 JDBC 和 ES 任务的检查点 流水线 2 (Kafka → JDBC): CheckpointCoordinator 按作业配置的间隔触发 仅管理 Kafka 和 JDBC 任务的检查点 ``` ## 5. PhysicalVertex: 已部署任务 ### 5.1 结构 PhysicalVertex 表示已部署的任务实例。 PhysicalVertex 关注“一个并行任务实例如何被部署与运行”: - **taskGroupLocation**: 任务实例定位信息(含并行子任务序号等) - **taskGroup**: 任务融合后的执行单元(见下节) - **slotProfile**: 该实例被分配到的槽位(资源容量与位置) - **currentExecutionState**: 当前执行状态(CREATED/RUNNING/FAILED 等) - **pluginJarsUrls**: 插件依赖(用于类加载隔离) ### 5.2 TaskGroup: 任务融合 多个任务可以融合到单个 `TaskGroup` 以提高效率。 TaskGroup 的关键点: - 将一段可融合的线性算子链(Source/Transform/Sink 的某些组合)放在同一执行单元内 - 通过共享线程/队列/内存通道减少跨算子序列化与网络开销 - 以并行度为单位生成多个 TaskGroup 实例(通常与上游并行度对齐) **融合条件**: 1. 相同并行度 2. 顺序依赖(A → B) 3. 
不需要数据混洗 **示例(带融合)**: ``` LogicalDag: Source (parallelism=4) → Transform (parallelism=4) → Sink (parallelism=4) 不融合: 12 个独立任务(4 + 4 + 4) Source → Transform 和 Transform → Sink 有网络开销 融合后: 4 个 TaskGroups,每个包含: [SourceTask → TransformTask → SinkTask] (单线程,共享内存) ``` **优势**: - 减少网络序列化/反序列化 - 更好的 CPU 缓存局部性 - 更低的内存占用 - 简化部署 ### 5.3 槽位分配 每个 PhysicalVertex 被分配一个 `SlotProfile`: SlotProfile 表达“这个任务实例运行在哪里、能用多少资源”。具体字段与语义见资源管理文档。 **分配过程**: 1. JobMaster 从 ResourceManager 请求槽位 2. ResourceManager 根据分配策略选择工作节点(例如 RANDOM / SLOT_RATIO / SYSTEM_LOAD) 3. ResourceManager 分配槽位并返回 SlotProfiles 4. JobMaster 将 SlotProfiles 分配给 PhysicalVertices 5. JobMaster 通过 `DeployTaskOperation` 部署任务 ## 6. 任务部署和执行 ### 6.1 部署流程 ```mermaid sequenceDiagram participant JM as JobMaster participant RM as ResourceManager participant Worker as Worker Node participant Task as SeaTunnelTask JM->>JM: Generate PhysicalPlan JM->>RM: applyResources(resourceProfiles) RM->>RM: Allocate slots RM-->>JM: Return SlotProfiles JM->>JM: Assign slots to PhysicalVertices loop For each PhysicalVertex JM->>Worker: DeployTaskOperation(taskGroup) Worker->>Task: Create SeaTunnelTask Task->>Task: INIT → WAITING_RESTORE Task->>JM: Report ready end JM->>Worker: Start execution Worker->>Task: READY_START → STARTING → RUNNING ``` ### 6.2 任务执行 每个 `SeaTunnelTask` 执行其分配的动作: **SourceSeaTunnelTask**: 执行要点: - 持续从 SourceReader 拉取/接收数据并发出记录 - 在检查点触发时生成并传播 barrier(屏障),参与流水线级的一致性快照 **TransformSeaTunnelTask**: 执行要点: - 从上游通道读取记录 - 应用 transform 逻辑并输出到下游通道 - 若 transform 有状态,需要参与 checkpoint 的状态快照与恢复 **SinkSeaTunnelTask**: 执行要点: - 持续消费上游记录并调用 sinkWriter 写入目标端 - 在 barrier 到达时切换到“快照边界”:准备提交信息(prepareCommit(checkpointId))、持久化 writer 状态并将提交信息交给 committer - 在 checkpoint 成功后由 committer 进行最终提交;失败时由恢复流程回滚/重试(取决于 sink 语义) ## 7. 
优化策略 ### 7.1 任务融合 **何时融合**: - 相同并行度 - 顺序算子(无分支) - 无混洗边界 **何时不融合**: - 不同并行度(例如 source=4, sink=8) - 分支 DAG(一个数据源,多个目标端) - 需要混洗(例如 GROUP BY、JOIN) 说明:任务融合的具体策略与可配置项以当前引擎实现为准,文档不在此绑定某个固定的配置开关,避免与实际版本不一致。 ### 7.2 并行度推断 并行度以配置为准: - 若连接器显式配置了 `parallelism`,则使用连接器配置。 - 否则使用 `env.parallelism`(默认值为 1)。 - 某些连接器/引擎可能会根据外部系统分区数等信息做额外推断,但这是实现细节,不能在架构文档里写成固定规则。 **示例**: ```hocon source { JDBC { parallelism = 4 } # 显式 } transform { Sql { } # 推断: 4 (来自数据源) } sink { Elasticsearch { } # 推断: 4 (来自转换器) } ``` ### 7.3 资源分配 **槽位计算**: ``` 所需槽位 = 所有任务并行度之和 示例: Source (parallelism=4) + Transform (parallelism=4) + Sink (parallelism=2) = 需要 10 个槽位 融合后: TaskGroup (parallelism=4, fusion[Source+Transform]) + Sink (parallelism=2) = 需要 6 个槽位 ``` 说明:资源画像/槽位资源的具体字段、单位与配置路径以引擎侧配置与实现为准;文档不在此给出不存在或不稳定的配置项示例。 ## 8. 故障处理 ### 8.1 任务故障 **检测**: - 任务抛出异常 - 心跳超时 **恢复**: 1. 标记任务为 FAILED 2. 使整个流水线失败(保守策略) 3. 从最新检查点恢复 4. 重新分配资源 5. 重新部署和重启流水线 ### 8.2 流水线故障隔离 **关键见解**: 流水线故障是隔离的。 **示例**: ``` 有 2 个流水线的作业: 流水线 1: JDBC → ES (RUNNING) 流水线 2: Kafka → JDBC (FAILED) 结果: 流水线 2 从检查点重启 流水线 1 继续不受影响 ``` **优势**: - 减少爆炸半径 - 更快恢复(仅失败的流水线) - 更好的资源利用率 ## 9. 监控和可观测性 ### 9.1 关键指标 **流水线级别**: - `pipeline.status`: CREATED / RUNNING / FINISHED / FAILED - `pipeline.tasks.total`: 任务总数 - `pipeline.tasks.running`: 当前运行的任务数 - `pipeline.checkpoint.latest_id`: 最新检查点 ID - `pipeline.checkpoint.duration`: 检查点持续时间 **任务级别**: - `task.status`: 任务执行状态 - `task.records_in`: 接收的记录数 - `task.records_out`: 发出的记录数 - `task.bytes_in`: 接收的字节数 - `task.bytes_out`: 发出的字节数 ### 9.2 可视化 ``` 作业: mysql-to-es │ ├── 流水线 1 (mysql-cdc → elasticsearch) │ ├── PhysicalVertex 0 [RUNNING] @ worker-1:slot-1 │ ├── PhysicalVertex 1 [RUNNING] @ worker-2:slot-1 │ ├── PhysicalVertex 2 [RUNNING] @ worker-3:slot-1 │ └── PhysicalVertex 3 [RUNNING] @ worker-4:slot-1 │ └── 流水线 2 (mysql-cdc → jdbc) ├── PhysicalVertex 0 [RUNNING] @ worker-1:slot-2 └── PhysicalVertex 1 [RUNNING] @ worker-2:slot-2 ``` ## 10. 
最佳实践 ### 10.1 并行度配置 **经验法则**: ``` 并行度 = min( 数据分区数, 可用槽位数, 目标吞吐量 / 单任务吞吐量 ) ``` **示例**: - **JDBC 数据源**: 设置为数据库分区数(例如 8 个分区 → parallelism=8) - **Kafka 数据源**: 设置为分区数(例如 32 个分区 → parallelism=32) - **文件数据源**: 设置为文件数或文件分片数 - **CPU 密集型转换器**: 设置为 CPU 核心数 - **I/O 密集型目标端**: 根据目标系统容量设置 ### 10.2 流水线设计 **保持流水线简单**: - 优先使用线性流水线(数据源 → 转换器 → 目标端) - 尽可能避免复杂分支 - 对完全独立的工作流使用多个作业 **何时使用多个作业**: - 需要不同的检查点间隔 - 需要不同的资源需求 - 需要独立的故障域 ### 10.3 故障排除 **问题**: 任务未启动 **检查**: 1. 是否有足够的可用槽位?(`required_slots <= available_slots`) 2. 资源配置文件是否合理?(不要请求 100 个 CPU 核心) 3. 标签过滤器是否正确?(如果使用基于标签的分配) **问题**: 低吞吐量 **检查**: 1. 并行度是否太低?(增加并行度) 2. 任务融合是否被禁用?(启用以获得更好的性能) 3. 检查点间隔是否太短?(增加间隔) ## 11. 相关资源 - [引擎架构](engine-architecture.md) - [资源管理](resource-management.md) - [检查点机制](../fault-tolerance/checkpoint-mechanism.md) - [架构概述](../overview.md) ## 12. 参考资料 ### 进一步阅读 - [Google Borg Paper](https://research.google/pubs/pub43438/) - 任务调度灵感 - [Apache Flink JobGraph](https://nightlies.apache.org/flink/flink-docs-stable/docs/internals/job_scheduling/) - [Spark DAG Scheduler](https://spark.apache.org/docs/latest/job-scheduling.html) ================================================ FILE: docs/zh/architecture/engine/engine-architecture.md ================================================ --- sidebar_position: 1 title: 引擎架构 --- # SeaTunnel 引擎(Zeta)架构 ## 1. 概述 ### 1.1 问题背景 数据集成引擎必须解决基本的分布式系统挑战: - **分布式执行**:如何跨多台机器执行作业? - **资源管理**:如何高效地分配和调度任务? - **容错**:如何从工作节点/主节点失败中恢复? - **协调**:如何同步分布式任务(检查点、提交)? - **可扩展性**:如何处理不断增加的工作负载? ### 1.2 设计目标 SeaTunnel 引擎(Zeta)设计为原生执行引擎,具有: 1. **轻量级**:最小依赖、快速启动、低资源开销 2. **高性能**:针对数据同步工作负载优化 3. **容错**:基于检查点的恢复与精确一次语义 4. **资源效率**:基于槽位的资源管理与细粒度控制 5. 
**引擎独立性**:支持与 Flink/Spark 转换相同的连接器 API ### 1.3 架构对比 | 特性 | SeaTunnel Zeta | Apache Flink | Apache Spark | |---------|---------------|--------------|--------------| | **主要用例** | 数据同步、CDC | 流处理 | 批处理 + ML | | **资源模型** | 基于槽位 | 基于槽位 | 基于执行器 | | **状态后端** | 可插拔(例如 localfile/hdfs 等,取决于配置与插件) | RocksDB/堆 | 内存/磁盘 | | **检查点** | 分布式快照 | Chandy-Lamport | RDD 血统 | | **启动时间** | 取决于部署与依赖 | 取决于部署与依赖 | 取决于部署与依赖 | | **依赖** | 取决于打包与插件 | 取决于打包与插件 | 取决于打包与插件 | ## 2. 整体架构 ### 2.1 主-工架构 ``` ┌─────────────────────────────────────────────────────────────────┐ │ 主节点 │ │ │ │ ┌───────────────────────────────────────────────────────┐ │ │ │ CoordinatorService │ │ │ │ • 管理所有运行中的作业 │ │ │ │ • 作业提交和生命周期管理 │ │ │ │ • 维护作业状态(IMap) │ │ │ │ • 资源管理器工厂 │ │ │ └───────────────────────────────────────────────────────┘ │ │ │ │ ┌───────────────────────────────────────────────────────┐ │ │ │ JobMaster(每个作业一个) │ │ │ │ • 生成物理执行计划 │ │ │ │ • 从 ResourceManager 请求资源 │ │ │ │ • 将任务部署到工作节点 │ │ │ │ • 协调检查点 │ │ │ │ • 处理故障转移和恢复 │ │ │ └───────────────────────────────────────────────────────┘ │ │ │ │ │ │ │ (任务部署) │ (资源请求) │ │ ▼ ▼ │ │ ┌─────────────────┐ ┌────────────────────────────┐ │ │ │ CheckpointManager│ │ ResourceManager │ │ │ │ (每个管道) │ │ • 槽位分配 │ │ │ └─────────────────┘ │ • 工作节点注册 │ │ │ │ • 负载均衡 │ │ │ └────────────────────────────┘ │ └─────────────────────────────────────────────────────────────────┘ │ │ (Hazelcast 集群) ▼ ┌─────────────────────────────────────────────────────────────────┐ │ 工作节点 │ │ │ │ ┌───────────────────────────────────────────────────────┐ │ │ │ TaskExecutionService │ │ │ │ • 部署和执行任务 │ │ │ │ • 管理任务生命周期 │ │ │ │ • 报告心跳 │ │ │ │ • 槽位资源管理 │ │ │ └───────────────────────────────────────────────────────┘ │ │ │ │ │ ▼ │ │ ┌───────────────────────────────────────────────────────┐ │ │ │ SeaTunnelTask(每个工作节点多个) │ │ │ │ │ │ │ │ ┌─────────────────────────────────────────────┐ │ │ │ │ │ SourceFlowLifeCycle │ │ │ │ │ │ • SourceReader │ │ │ │ │ │ • SeaTunnelSourceCollector │ │ │ │ │ 
└─────────────────────────────────────────────┘ │ │ │ │ │ │ │ │ │ ▼ │ │ │ │ ┌─────────────────────────────────────────────┐ │ │ │ │ │ TransformFlowLifeCycle │ │ │ │ │ │ • 转换链 │ │ │ │ │ └─────────────────────────────────────────────┘ │ │ │ │ │ │ │ │ │ ▼ │ │ │ │ ┌─────────────────────────────────────────────┐ │ │ │ │ │ SinkFlowLifeCycle │ │ │ │ │ │ • SinkWriter │ │ │ │ │ └─────────────────────────────────────────────┘ │ │ │ └───────────────────────────────────────────────────────┘ │ └─────────────────────────────────────────────────────────────────┘ ``` ### 2.2 核心组件 #### CoordinatorService 管理集群中所有作业的中心化服务。 **职责**: - 接受作业提交 - 为每个作业创建 JobMaster - 在分布式 IMap 中维护作业状态 - 提供作业查询和管理 API - 处理作业生命周期事件 **关键数据结构**: - 运行中作业元信息:作业基本信息、当前状态、状态变更时间戳(分布式存储,支持多节点一致读取) - 已完成作业历史:用于查询与审计的作业快照(通常包含最终状态与关键元数据) #### JobMaster 管理单个作业执行生命周期。 **职责**: - 解析配置 → 生成 LogicalDag - 从 LogicalDag 生成 PhysicalPlan - 从 ResourceManager 请求资源(槽位) - 将任务部署到工作节点 - 协调管道检查点 - 处理任务失败并重新调度 **生命周期**: ``` Created → Initialized → Scheduled → Running → Finished/Failed/Canceled ``` **关键操作**: 1. `init()`:生成物理计划,创建检查点协调器 2. `run()`:请求资源,部署任务,启动执行 3. `handleFailure()`:重启失败的任务,从检查点恢复 #### ResourceManager 管理工作节点资源和槽位分配。 **职责**: - 跟踪工作节点注册和心跳 - 维护工作节点资源配置(CPU、内存) - 基于策略分配槽位(随机、槽位比率、基于负载) - 任务完成后释放槽位 - 处理工作节点失败 **槽位分配策略**: - Random:在可用工作节点中随机选择 - SlotRatio:优先选择拥有更多可用槽位的工作节点 - SystemLoad:优先选择 CPU/内存使用率较低的工作节点 ## 3. 
DAG 执行模型 ### 3.1 执行计划转换 ``` 用户配置(HOCON) │ ▼ ┌───────────────┐ │ LogicalDag │ • 逻辑顶点(数据源/转换/数据 Sink ) │ │ • 逻辑边(数据流) │ │ • 并行度(每个顶点) └───────────────┘ │ (JobMaster.generatePhysicalPlan()) ▼ ┌───────────────┐ │ PhysicalPlan │ • SubPlan 列表(管道) │ │ • JobImmutableInformation │ │ • 资源要求 └───────────────┘ │ ▼ ┌───────────────┐ │ SubPlan │ • 管道(独立执行单元) │ (Pipeline) │ • PhysicalVertex 列表 │ │ • CheckpointCoordinator └───────────────┘ │ ▼ ┌───────────────┐ │PhysicalVertex │ • TaskGroup(共存任务) │ │ • 分配的 SlotProfile │ │ • ExecutionState └───────────────┘ │ ▼ ┌───────────────┐ │ TaskGroup │ • 多个 SeaTunnelTask 实例 │ │ • 共享网络缓冲区 │ │ • 线程池 └───────────────┘ │ ▼ ┌───────────────┐ │ SeaTunnelTask │ • 单个任务执行 │ │ • 数据源/转换/数据 Sink 生命周期 │ │ • 任务状态机 └───────────────┘ ``` ### 3.2 LogicalDag 以引擎独立的方式表示用户意图。 **核心元素(概念级)**: - LogicalVertex:一个逻辑算子节点(Source / TransformChain / Sink),包含并行度等执行提示 - LogicalEdge:逻辑边,描述上游到下游的数据流向 - JobConfig:作业级配置(并行度、容错、资源、插件等) **创建**: 由 `JobConfig`/用户配置构建:解析配置 → 生成顶点/边 → 生成可执行提示(并行度、资源等)。 ### 3.3 PhysicalPlan 表示带资源分配的实际执行计划。 **核心结构(概念级)**: - PhysicalPlan:由多个 `SubPlan`(管道)组成,并携带作业不可变元信息与终态结果句柄 - SubPlan(Pipeline):一个独立执行单元,包含本管道的任务顶点集合,以及本管道的 checkpoint 协调器 - PhysicalVertex:一个可调度的并行实例,绑定到具体槽位/工作节点,并维护自身执行状态 **生成**: 由 JobMaster 完成: 1. 将 LogicalDag 切分为管道 2. 为每个顶点生成并行实例(PhysicalVertex)并计算资源需求 3. 为每个管道创建独立的 checkpoint 协调器 ### 3.4 管道执行 作业被划分为**管道**(SubPlan)以便独立执行: **示例**: ```hocon # 多数据源/Sink 配置 env { ... } source { MySQL-CDC { table = "orders" } Kafka { topic = "events" } } transform { Sql { query = "SELECT * FROM orders JOIN events ON ..." } } sink { Elasticsearch { index = "orders" } JDBC { table = "events" } } ``` **生成的管道**: ``` 管道 1: MySQL-CDC → 转换 → Elasticsearch 管道 2: Kafka → 转换 → JDBC ``` **好处**: - 独立的检查点协调 - 隔离的失败域 - 并行管道执行 ### 3.5 任务融合 多个操作可以融合到单个 TaskGroup 中以提高效率: ``` 无融合: [数据源任务] → 网络 → [转换任务] → 网络 → [数据 Sink 任务] 有融合: [TaskGroup: 数据源 → 转换 → 数据 Sink ](单线程,无网络) ``` **融合条件**: - 相同的并行度 - 顺序依赖 - 不需要 shuffle ## 4. 
任务生命周期 ### 4.1 任务状态机 ``` [Created] │ ▼ [INIT] ────────────────────────────────────┐ │ │ ▼ │ [WAITING_RESTORE](如果恢复中) │ │ │ ▼ │ [READY_START] │ │ │ ▼ │ [STARTING] ──────────────┐ │ │ │ │ ▼ ▼ ▼ [RUNNING] ──────────> [FAILED] ─────> (重启) │ ▼ [PREPARE_CLOSE] │ ▼ [CLOSED] │ ▼ [CANCELED](如果作业取消) ``` **状态转换**: 1. **CREATED → INIT**:任务已创建,初始化资源 2. **INIT → WAITING_RESTORE**:从检查点恢复 3. **WAITING_RESTORE → READY_START**:状态已恢复 4. **READY_START → STARTING**:打开数据源/转换/数据 Sink 5. **STARTING → RUNNING**:数据处理已启动 6. **RUNNING → PREPARE_CLOSE**:正常完成 7. **PREPARE_CLOSE → CLOSED**:资源已清理 8. **RUNNING → FAILED**:发生异常 ### 4.2 SeaTunnelTask 执行 **执行骨架(语义级)**: 1. `init`:初始化运行时资源 2. `restoreState`:如果处于恢复路径,加载 checkpoint 状态 3. `open`:打开 Source/Transform/Sink 生命周期 4. 主循环:处理数据 + 处理 checkpoint 屏障/控制消息 5. `close`:正常结束时清理资源;异常时进入失败处理与上报 **任务类型**: - **SourceSeaTunnelTask**:运行 SourceReader,发送数据 - **SinkSeaTunnelTask**:运行 SinkWriter,消费数据 - **TransformSeaTunnelTask**:运行转换链 ### 4.3 FlowLifeCycle 管理 每个任务通过 FlowLifeCycle 管理组件生命周期: **生命周期语义**: - `open`:初始化 reader/transform chain/writer 等组件 - `collect`:数据驱动的执行入口(source poll、transform 处理、sink write) - `close`:释放资源并保证幂等(可被重复调用) ## 5. 检查点协调 ### 5.1 CheckpointCoordinator(每个管道) 每个管道都有独立的检查点协调器。 **职责**: - 定期触发检查点 - 将检查点屏障注入数据流 - 收集任务确认 - 持久化完成的检查点 - 清理旧检查点 **关键数据结构**: - checkpointId 生成器:单调递增生成 checkpointId - pendingCheckpoints:进行中的 checkpoint 集合(等待 task ACK) - completed checkpoints:最近成功的 checkpoint 列表(用于恢复与保留策略) - checkpointStorage:外部持久化后端 **检查点流程**: 1. 协调器触发检查点(定期或手动) 2. 向管道中所有数据源任务发送屏障 3. 屏障通过数据流传播 4. 每个任务在收到屏障时快照状态 5. 任务向协调器发送 ACK 6. 协调器等待所有 ACK 7. 创建 CompletedCheckpoint,持久化到存储 ### 5.2 检查点屏障 与数据一起流动的特殊控制消息: **屏障字段(概念级)**: - checkpointId:本次 checkpoint 的唯一标识 - timestamp:触发时间 - type:checkpoint/savepoint 等类型标识 **屏障对齐**: - 具有多个输入的任务在快照前等待来自所有输入的屏障 - 确保分布式任务之间的一致性快照 ## 6. 
资源管理 ### 6.1 槽位模型 **SlotProfile**: - slotId:槽位标识 - worker:所属工作节点 - resourceProfile:CPU/内存等资源画像 **WorkerProfile**: - address:工作节点地址 - total/available:总资源与可用资源 - assigned/unassigned:已分配与未分配槽位 ### 6.2 资源分配流程 ```mermaid sequenceDiagram participant JM as JobMaster participant RM as ResourceManager participant Worker as 工作节点 JM->>RM: applyResources(jobId, resourceProfiles) RM->>RM: 选择工作节点(策略) RM->>RM: 分配槽位 RM->>JM: 返回槽位配置 JM->>Worker: 部署任务(DeployTaskOperation) Worker->>Worker: 创建 SeaTunnelTask Worker->>JM: ACK JM->>JM: 任务运行中 ``` ### 6.3 基于标签的槽位过滤 将任务分配到特定工作节点组: ```hocon env { # 作业级 worker 标签过滤(key/value 全量匹配) tag_filter = { zone = "db-zone" } } ``` **用途**: - 数据局部性(分配到靠近数据源的工作节点) - 资源隔离(ML 转换使用 GPU 工作节点) - 多租户(不同团队使用不同的工作节点池) 说明:`tag_filter` 对整个作业/流水线生效;worker 的标签来源于集群成员属性(key/value),由集群部署侧配置与维护。 ## 7. 失败处理 ### 7.1 任务失败 **检测**: - 任务向 JobMaster 报告异常 - JobMaster 监控任务心跳 - 超时触发失败检测 **恢复**: 1. 标记任务为 FAILED 2. 释放任务的槽位 3. 检索最新的成功检查点 4. 使用恢复的状态重启任务 5. 重新分配分片(对于数据源任务) ### 7.2 工作节点失败 **检测**: - ResourceManager 监控工作节点心跳 - Hazelcast 集群检测成员移除 **恢复**: 1. 标记失败工作节点上的所有任务为 FAILED 2. 触发作业故障转移 3. 从最新检查点恢复 4. 在健康的工作节点上重新分配槽位 5. 重新部署任务 ### 7.3 主节点失败 **高可用性**: - 多个主节点(Hazelcast 集群) - 作业状态存储在分布式 IMap 中(已复制) - 新主节点从 IMap 状态接管 **恢复**: 1. 检测主节点失败(Hazelcast) 2. 选举新主节点 3. 新主节点从 IMap 读取作业状态 4. 重新连接到工作节点 5. 恢复检查点协调 ## 8. 设计考量 ### 8.1 为什么基于管道的执行? **替代方案**:单一全局 DAG 执行 **决策**:划分为管道 **好处**: - 独立的检查点协调(较少的协调开销) - 清晰的失败边界(一个管道失败,其他继续) - 更容易推理数据流 - 支持复杂的 DAG(多数据源/Sink ) **缺点**: - 无法跨管道边界融合任务 - 管道之间潜在的数据序列化 ### 8.2 为什么使用 Hazelcast 进行协调? **替代方案**:Zookeeper、etcd、自定义 Raft 实现 **决策**:Hazelcast IMDG **好处**: - 内存分布式数据结构(低延迟) - 内置集群管理和失败检测 - 易于嵌入(无外部依赖) - 熟悉的 API(Java Collections) **缺点**: - 大状态的内存开销 - 作为协调工具,不如 Zookeeper 经过充分测试 ### 8.3 性能优化 **1. 任务融合**: - 减少网络开销 - 改善 CPU 缓存局部性 - 降低序列化成本 **2. 异步检查点**: - 检查点上传不阻塞数据处理 - 跨任务并行检查点 **3. 增量检查点**: - 仅上传更改的状态(未来增强) **4. 零拷贝数据传输**: - 共存任务之间的共享内存 - 避免不必要的序列化 ## 9. 
相关资源 - [架构概览](../overview.md) - [设计理念](../design-philosophy.md) - [检查点机制](../fault-tolerance/checkpoint-mechanism.md) - [资源管理](resource-management.md) - [DAG 执行](dag-execution.md) ## 10. 参考资料 ### 进一步阅读 - [Hazelcast IMDG](https://docs.hazelcast.com/imdg/latest/) - [Google Borg 论文](https://research.google/pubs/pub43438/) - 资源管理的灵感来源 - [Apache Flink 架构](https://flink.apache.org/flink-architecture.html) ================================================ FILE: docs/zh/architecture/engine/resource-management.md ================================================ --- sidebar_position: 3 title: 资源管理 --- # 资源管理 ## 1. 概述 ### 1.1 问题背景 分布式执行引擎必须高效管理计算资源: - **资源分配**: 如何公平高效地将任务分配给工作节点? - **负载均衡**: 如何在工作节点之间均匀分布工作负载? - **资源隔离**: 如何防止作业之间的资源争用? - **动态扩缩容**: 如何在不中断作业的情况下添加/删除工作节点? - **异构资源**: 如何处理具有不同能力的工作节点? ### 1.2 设计目标 SeaTunnel 的资源管理系统旨在: 1. **细粒度控制**: 基于槽位的分配实现精确资源管理 2. **灵活策略**: 针对不同场景的多种分配策略 3. **基于标签的过滤**: 将任务分配给特定的工作节点组 4. **高可用性**: 容忍工作节点故障并自动重新分配 5. **可观测性**: 实时跟踪资源使用和可用性 ### 1.3 架构概览 ``` ┌──────────────────────────────────────────────────────────────┐ │ JobMaster │ │ │ │ ┌────────────────────────────────────────────────────┐ │ │ │ 请求资源 │ │ │ │ • 计算所需槽位 │ │ │ │ • (可选)表达资源需求(以当前引擎实现为准) │ │ │ │ • 应用标签过滤器(可选) │ │ │ └────────────────────────────────────────────────────┘ │ └──────────────────────────────┬───────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────┐ │ ResourceManager │ │ │ │ ┌────────────────────────────────────────────────────┐ │ │ │ 工作节点注册表 │ │ │ │ • WorkerProfile (每个工作节点) │ │ │ │ - 总资源 │ │ │ │ - 可用资源 │ │ │ │ - 已分配槽位 │ │ │ │ - 未分配槽位 │ │ │ └────────────────────────────────────────────────────┘ │ │ │ │ ┌────────────────────────────────────────────────────┐ │ │ │ 分配策略 │ │ │ │ • RandomStrategy / SlotRatioStrategy / SystemLoadStrategy│ │ │ └────────────────────────────────────────────────────┘ │ │ │ │ ┌────────────────────────────────────────────────────┐ │ │ │ 槽位管理 │ │ │ │ • 分配槽位 │ │ │ │ • 释放槽位 │ │ │ │ • 跟踪槽位使用 │ │ │ 
└────────────────────────────────────────────────────┘ │ └──────────────────────────────┬───────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────┐ │ 工作节点 │ │ │ │ Worker 1 Worker 2 Worker N │ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │ │ Slot 1 │ │ Slot 1 │ │ Slot 1 │ │ │ │ Slot 2 │ │ Slot 2 │ │ Slot 2 │ │ │ │ ... │ │ ... │ │ ... │ │ │ └──────────┘ └──────────┘ └──────────┘ │ └──────────────────────────────────────────────────────────────┘ ``` ## 2. 核心概念 ### 2.1 槽位(Slot) **槽位**是资源分配的基本单位。 一个槽位通常由以下信息描述: - **slotID**: 槽位唯一标识 - **worker**: 槽位所在工作节点地址 - **resourceProfile**: 槽位可提供的资源容量(CPU/内存等) **关键属性**: - **粒度化**: 每个槽位可以托管一个或多个任务(任务融合) - **类型化**: 槽位具有资源配置文件(CPU、内存) - **有状态**: 槽位跟踪分配状态(已分配/未分配) **示例**: - slotID = 1001 - worker = worker-1:5801 - resourceProfile = cpu.cores / heapMemory.bytes(字段以引擎实现为准) ### 2.2 ResourceProfile 描述资源需求或容量。 一个资源配置文件(ResourceProfile)通常包括: - **cpu.cores**: CPU 核心数(当前实现为整数 core) - **heap-memory.bytes**: JVM 堆内存(字节) 说明:当前资源调度在很多场景下以“slot 是否可用”为主要约束;ResourceProfile 作为扩展点存在,但是否支持按 CPU/内存精细调度取决于具体版本实现。 **用途**: - **任务需求**: 引擎在申请槽位时携带资源需求(当前实现常为默认/空需求,更多能力视版本而定) - **槽位容量**: 每个槽位公布其可用资源 - **匹配**: ResourceManager 将任务需求与槽位容量匹配 ### 2.3 WorkerProfile 表示工作节点的资源和槽位清单。 工作节点画像(WorkerProfile)通常包含: - **address**: 工作节点地址 - **totalResourceProfile**: 节点总资源(常由槽位资源汇总得到) - **availableResourceProfile**: 当前可用资源 - **assignedSlots/unassignedSlots**: 已分配/未分配槽位清单 - **tags**: 节点标签(用于过滤、隔离、数据局部性) **生命周期**: 1. **注册**: 工作节点启动时向 ResourceManager 注册 2. **心跳**: 工作节点定期发送心跳及更新的资源信息 3. **分配**: ResourceManager 从未分配池中分配槽位 4. **释放**: 完成的任务释放槽位,将其移回未分配池 5. **注销**: 工作节点离开集群(优雅或故障) ## 3. 
ResourceManager ### 3.1 接口 ResourceManager 对外暴露的关键能力可以概括为: - **applyResources(jobId, resourceProfiles, tagFilters)**: 为作业申请一组满足资源需求的槽位;当资源不足时返回失败(例如抛出 NoEnoughResourceException 或以失败的 Future 表达) - **releaseResources(jobId, slots)**: 作业完成/失败后释放槽位,回收至可分配池 - **heartbeat(workerProfile)**: 接收工作节点心跳并更新其资源/槽位信息 - **memberRemoved(event)**: 处理成员移除事件(故障或优雅下线),触发资源回收与作业侧重调度 ### 3.2 实现: AbstractResourceManager 典型实现会维护以下状态与策略: - **registerWorker**: 已注册工作节点到 WorkerProfile 的映射(由心跳持续刷新) - **slotAllocationStrategy**: 选择 worker 的分配策略(随机/比例/系统负载等) - **故障检测**: 结合 worker 心跳上报与 Hazelcast 成员事件判定节点失联(具体阈值以配置/实现为准) 申请资源的关键流程: 1. 根据 tagFilters 过滤候选工作节点 2. 针对每个 ResourceProfile 需求,使用策略选择一个满足容量约束的未分配槽位 3. 将槽位从“未分配池”标记为“已分配”,并同步更新 WorkerProfile 4. 返回分配结果;如任一需求无法满足,则整体失败并由 JobMaster 决定重试/降级 释放资源的关键流程: 1. 将 slots 标记为未分配并回收到可分配池 2. 更新工作节点可用资源与槽位统计 ## 4. 槽位分配策略 ### 4.1 RandomStrategy 随机选择具有可用槽位的工作节点。 核心思路: 1. 过滤出“资源满足 requiredProfile 且存在未分配槽位”的工作节点集合 2. 在集合中随机选择一个工作节点 3. 从该节点的未分配槽位中挑选一个满足容量约束的槽位返回 **优点**: - 简单快速 - 无协调开销 - 适用于同构集群 **缺点**: - 无负载均衡 - 可能造成热点 ### 4.2 SlotRatioStrategy 优先选择可用槽位比率更高的工作节点。 核心思路: 1. 过滤出资源满足 requiredProfile 的工作节点 2. 计算并选择“可用槽位比率 = unassigned / (assigned + unassigned)”最高的节点 3. 从该节点的未分配槽位中选择一个满足容量约束的槽位 **优点**: - 更好的负载均衡 - 均匀分布任务 - 防止工作节点过载 **缺点**: - 计算稍多 - 可能不考虑实际 CPU/内存负载 ### 4.3 SystemLoadStrategy 选择系统负载(CPU/内存使用)最低的工作节点。 核心思路: 1. 基于心跳上报的资源使用情况计算节点负载(例如 CPU/内存利用率的加权) 2. 在满足 requiredProfile 的候选节点中选择负载最低者 3. 从该节点挑选一个满足容量约束的未分配槽位 负载计算的关键在于: - 依赖指标的时效性与稳定性(过旧会导致误判,过抖会导致分配抖动) - 需要明确权重与采样窗口,避免频繁迁移/重分配 **优点**: - 考虑实际资源使用 - 最适合异构集群 - 优化集群利用率 **缺点**: - 需要实时指标 - 计算成本更高 - 如果负载快速变化可能抖动 ## 5. 
基于标签的槽位过滤 ### 5.1 用例 **数据局部性**: ```hocon env { # 作业级 worker 标签过滤(key/value 全量匹配) tag_filter = { zone = "us-west-1" } } ``` **资源专业化**: ```hocon env { tag_filter = { resource = "gpu" } } ``` **多租户**: ```hocon env { job.name = "tenant-a-job" tag_filter = { tenant = "a" } } ``` ### 5.2 TagFilter TagFilter 可以视为一个简单的键值匹配条件: - key/value 需要同时匹配工作节点的 attributes(标签由集群部署侧维护) - 多个 TagFilter 之间通常按“与(AND)”组合:任一不匹配则该节点被过滤 **过滤过程**: 过滤过程通常为: 1. 枚举所有已注册工作节点 2. 对每个节点依次校验 filters;全部匹配则保留 3. 得到候选节点集合,交给槽位分配策略继续挑选 ## 6. 资源分配流程 ### 6.1 正常分配 ```mermaid sequenceDiagram participant JM as JobMaster participant RM as ResourceManager participant Worker as Worker Node JM->>JM: Generate PhysicalPlan JM->>JM: Calculate required resources JM->>RM: applyResources(profiles, tags) RM->>RM: Filter workers by tags RM->>RM: Select workers by strategy RM->>RM: Allocate slots RM-->>JM: Return SlotProfiles JM->>JM: Assign slots to PhysicalVertices loop For each task JM->>Worker: DeployTaskOperation(task, slot) Worker->>Worker: Execute task in slot Worker-->>JM: ACK end ``` ### 6.2 资源不足 ```mermaid sequenceDiagram participant JM as JobMaster participant RM as ResourceManager JM->>RM: applyResources(100 slots) RM->>RM: Check available slots Note over RM: Only 50 slots available RM-->>JM: NoEnoughResourceException JM->>JM: Retry with backoff Note over JM: Wait for resources to free up JM->>RM: applyResources(100 slots) RM-->>JM: Success (after resources freed) ``` ### 6.3 资源释放 ```mermaid sequenceDiagram participant Task as SeaTunnelTask participant JM as JobMaster participant RM as ResourceManager Task->>Task: Task completes/fails Task->>JM: Task finished JM->>RM: releaseResources(slots) RM->>RM: Mark slots as unassigned RM->>RM: Update WorkerProfile Note over RM: Slots available for
    new allocations ``` ## 7. 故障处理 ### 7.1 工作节点故障 **检测**: - worker 心跳/资源上报异常或停止(阈值以配置/实现为准) - Hazelcast 成员移除事件 **恢复**: ResourceManager 侧的典型处理步骤: 1. 从注册表中移除失联/下线的工作节点 2. 识别该节点上“已分配”的槽位集合(即可能承载了正在运行的任务) 3. 将槽位丢失事件通知到对应的 JobMaster(或由 Coordinator 统一转发) 4. 由作业侧触发 failover:标记任务失败、从检查点恢复、重新申请新槽位并重新部署 **JobMaster 响应**: 1. 标记失败槽位上的任务为 FAILED 2. 从最新检查点恢复 3. 从 ResourceManager 请求新槽位 4. 重新部署任务 ### 7.2 ResourceManager 故障 **高可用性**: - ResourceManager 本身是无状态的(工作节点注册表从心跳重建) - 新的 ResourceManager 实例在主节点故障转移时启动 - 工作节点通过心跳机制重新注册 **恢复**: 恢复要点: - ResourceManager 需要能够重新建立“工作节点注册表”:工作节点通过心跳主动上报其 address、资源、槽位与标签 - ResourceManager 需要定期清理心跳超时的节点,避免将任务分配给已失联节点 - 由于注册表可由心跳重建,故障转移后的新实例可以在短时间内恢复资源视图(视心跳间隔与超时参数而定) ## 8. 配置 ### 8.1 槽位配置 ```hocon seatunnel { engine { slot-service { # 是否启用动态槽位 dynamic-slot = true # 固定槽位数(仅在 dynamic-slot = false 时生效) slot-num = 2 } } } ``` ### 8.2 资源策略 ```hocon seatunnel { engine { slot-service { # worker 选择策略(取值需能映射到 AllocateStrategy 枚举) # 选项: random / slot_ratio / system_load slot-allocate-strategy = slot_ratio } } } ``` ### 8.3 资源配置说明 资源相关的可配置项以 `config/seatunnel.yaml` 与当前引擎实现为准;在没有稳定对外能力前,不建议在文档中给出“每槽位 CPU/内存”等固定配置样例,避免与实际实现不一致。 ## 9. 监控和指标 ### 9.1 关键指标 **集群级别**: - `cluster.workers.total`: 已注册工作节点总数 - `cluster.workers.active`: 最近有心跳的工作节点 - `cluster.slots.total`: 所有工作节点的槽位总数 - `cluster.slots.available`: 未分配的槽位 - `cluster.slots.assigned`: 使用中的槽位 **每个工作节点**: - `worker.cpu.available`: 可用 CPU 核心 - `worker.memory.available`: 可用内存(MB) - `worker.slots.total`: 工作节点上的总槽位数 - `worker.slots.assigned`: 已分配的槽位 - `worker.heartbeat.last`: 最后一次心跳时间戳 **每个作业**: - `job.slots.requested`: 作业请求的槽位数 - `job.slots.allocated`: 成功分配的槽位数 - `job.resource.wait_time`: 等待资源的时间 ### 9.2 可观测性 **资源仪表板示例**: ``` 集群资源: 工作节点: 10 (全部健康) 总槽位: 20 可用槽位: 8 利用率: 60% 资源消费者排名: job-123: 6 个槽位 (mysql-cdc → elasticsearch) job-456: 4 个槽位 (kafka → jdbc) job-789: 2 个槽位 (file → s3) 工作节点分布: worker-1: 2/2 槽位 (100%) worker-2: 1/2 槽位 (50%) worker-3: 2/2 槽位 (100%) ... ``` ## 10. 
最佳实践 ### 10.1 槽位大小设置 **一般指南**: ``` 每个工作节点的槽位数 = CPU 核心数 - 1 (为操作系统保留 1 个) 示例: 8 核机器 → 6-7 个槽位 16 核机器 → 14-15 个槽位 ``` **每个槽位的内存**: ``` 堆内存 = 总内存 * 0.7 / 槽位数 示例: 32GB 机器, 6 个槽位 每个槽位的堆内存 = 32GB * 0.7 / 6 ≈ 3.7GB ``` ### 10.2 策略选择 **使用 RandomStrategy 当**: - 同构集群(所有工作节点相同) - 简单部署 - 快速分配比完美平衡更重要 **使用 SlotRatioStrategy 当**: - 需要良好的负载均衡 - 混合作业大小 - 中等集群规模(< 100 个工作节点) **使用 SystemLoadStrategy 当**: - 异构集群 - 工作节点具有不同的 CPU/内存 - 优化资源利用率至关重要 ### 10.3 标签使用 **数据局部性**: ```hocon # 按区域/可用区标记工作节点(部署侧:Hazelcast member attributes,示意) # worker-1.attributes.zone = "us-west-1a" # worker-2.attributes.zone = "us-east-1b" # 将作业分配到与数据相同的区域(作业级过滤) env { tag_filter = { zone = "us-west-1a" } } ``` **资源隔离**: ```hocon # 为关键作业分配专用工作节点(部署侧 attributes,示意) # worker-1.attributes.priority = "high" # worker-4.attributes.priority = "normal" env { job.name = "critical-job" tag_filter = { priority = "high" } } ``` ## 11. 相关资源 - [引擎架构](engine-architecture.md) - [DAG 执行](dag-execution.md) - [架构概述](../overview.md) ## 12. 参考资料 ### 进一步阅读 - [Google Borg](https://research.google/pubs/pub43438/) - 大规模集群管理 - [Apache YARN](https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html) - Hadoop 中的资源管理 - [Kubernetes](https://kubernetes.io/docs/concepts/scheduling-eviction/kube-scheduler/) - 容器编排和调度 ================================================ FILE: docs/zh/architecture/fault-tolerance/checkpoint-mechanism.md ================================================ --- sidebar_position: 1 title: 检查点机制 --- # 检查点机制 ## 1. 概述 ### 1.1 问题背景 分布式数据处理系统面临容错的关键挑战: - **状态丢失**:如何在失败时保留处理状态? - **精确一次**:如何确保每条记录被精确处理一次? - **分布式一致性**:如何在分布式任务之间创建一致性快照? - **性能**:如何在不阻塞数据处理的情况下执行检查点? - **恢复**:如何在失败后高效恢复状态? ### 1.2 设计目标 SeaTunnel 的检查点机制旨在: 1. **保证精确一次语义**:一致性状态快照 + 两阶段提交 2. **最小化开销**:尽量降低 checkpoint 对数据处理的影响(同步/异步取决于具体实现) 3. **快速恢复**:从最新成功 checkpoint 恢复(耗时取决于状态大小与存储后端) 4. **分布式协调**:协调数百个任务的检查点 5. 
**可插拔存储**:支持可插拔的 checkpoint storage(具体后端取决于引擎插件与配置) ### 1.3 理论基础 SeaTunnel 的检查点基于 **Chandy-Lamport 分布式快照算法**: **核心思想**:在数据流中插入特殊标记(屏障)。当任务收到屏障时: 1. 快照其本地状态 2. 向下游转发屏障 3. 继续处理 结果:无需暂停整个系统即可获得全局一致性快照。 **参考**:["Distributed Snapshots: Determining Global States of Distributed Systems"](https://lamport.azurewebsites.net/pubs/chandy.pdf)(Chandy & Lamport,1985) ## 2. 架构设计 ### 2.1 检查点架构 ``` ┌─────────────────────────────────────────────────────────────────┐ │ JobMaster(每个作业一个,内部按 pipeline 管理) │ │ │ │ ┌───────────────────────────────────────────────────────┐ │ │ │ CheckpointCoordinator │ │ │ │ │ │ │ │ • 触发检查点(定期/手动) │ │ │ │ • 生成检查点 ID │ │ │ │ • 跟踪待处理的检查点 │ │ │ │ • 收集任务确认 │ │ │ │ • 持久化完成的检查点 │ │ │ │ • 清理旧检查点 │ │ │ └───────────────────────────────────────────────────────┘ │ │ │ │ │ │ (触发屏障) │ │ ▼ │ └─────────────────────────────────────────────────────────────────┘ │ │ (CheckpointBarrier) ▼ ┌─────────────────────────────────────────────────────────────────┐ │ 工作节点 │ │ │ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ │ SourceTask 1 │ │ SourceTask 2 │ │ SourceTask N │ │ │ │ │ │ │ │ │ │ │ │ 1. 接收 │ │ 1. 接收 │ │ 1. 接收 │ │ │ │ 屏障 │ │ 屏障 │ │ 屏障 │ │ │ │ 2. 快照 │ │ 2. 快照 │ │ 2. 快照 │ │ │ │ 状态 │ │ 状态 │ │ 状态 │ │ │ │ 3. ACK │ │ 3. ACK │ │ 3. ACK │ │ │ │ 4. 转发 │ │ 4. 转发 │ │ 4. 转发 │ │ │ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ │ │ │ │ │ │ │ (屏障传播) │ │ │ │ ▼ ▼ ▼ │ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ │ Transform 1 │ │ Transform 2 │ │ Transform N │ │ │ │ │ │ │ │ │ │ │ │ 1. 接收 │ │ 1. 接收 │ │ 1. 接收 │ │ │ │ 屏障 │ │ 屏障 │ │ 屏障 │ │ │ │ 2. 快照 │ │ 2. 快照 │ │ 2. 快照 │ │ │ │ 状态 │ │ 状态 │ │ 状态 │ │ │ │ 3. ACK │ │ 3. ACK │ │ 3. ACK │ │ │ │ 4. 转发 │ │ 4. 转发 │ │ 4. 转发 │ │ │ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ │ │ │ │ │ │ ▼ ▼ ▼ │ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ │ SinkTask 1 │ │ SinkTask 2 │ │ SinkTask N │ │ │ │ │ │ │ │ │ │ │ │ 1. 接收 │ │ 1. 接收 │ │ 1. 接收 │ │ │ │ 屏障 │ │ 屏障 │ │ 屏障 │ │ │ │ 2. 准备 │ │ 2. 准备 │ │ 2. 
准备 │ │ │ │ 提交 │ │ 提交 │ │ 提交 │ │ │ │ 3. 快照 │ │ 3. 快照 │ │ 3. 快照 │ │ │ │ 状态 │ │ 状态 │ │ 状态 │ │ │ │ 4. ACK │ │ 4. ACK │ │ 4. ACK │ │ │ └──────────────┘ └──────────────┘ └──────────────┘ │ └─────────────────────────────────────────────────────────────────┘ │ │ (收到所有 ACK) ▼ ┌─────────────────────────────────────────────────────────────────┐ │ CheckpointStorage │ │ (例如 localfile/hdfs 等,取决于插件与配置) │ │ │ │ CompletedCheckpoint { │ │ checkpointId: 123 │ │ taskStates: { │ │ SourceTask-1: { splits: [...], offsets: [...] } │ │ SinkTask-1: { commitInfo: XidInfo(...) } │ │ ... │ │ } │ │ } │ └─────────────────────────────────────────────────────────────────┘ ``` ### 2.2 关键数据结构 #### CheckpointCoordinator **职责摘要**: - 触发 checkpoint(按 interval/并发/最小间隔约束) - 跟踪进行中的 `PendingCheckpoint`,收集各 task 的 ACK 与状态 - 将 `CompletedCheckpoint` 持久化到 `CheckpointStorage`,并维护“最近成功 checkpoint” **关键字段(概念级)**: - `checkpointIdCounter`:生成 checkpointId - `pendingCheckpoints`:进行中的 checkpoint 集合 - `checkpointStorage`:状态持久化后端 - 调度参数:`checkpointInterval` / `checkpointTimeout` / `minPauseBetweenCheckpoints` #### PendingCheckpoint 表示进行中的检查点。 **职责摘要**: - 持有本次 checkpoint 的中间态(已 ACK/未 ACK 的 task、收集到的 action 状态与统计) - 在全部 task ACK 后组装 `CompletedCheckpoint`(或触发失败/超时处理) #### CompletedCheckpoint 持久化的检查点数据。 **职责摘要**: - 表示一次成功的 checkpoint 的“可恢复快照”,可被持久化并用于作业恢复 **状态组织方式(概念级)**: - 以“算子/Action + subtask”作为索引维度收集状态 - 每个 subtask 上报一份序列化状态(可能为空,取决于算子是否有状态) ### 2.3 CheckpointStorage 检查点持久化的抽象。 **能力要求(语义级)**: - 持久化:将一次成功 checkpoint 的快照写入外部存储 - 读取:支持读取“最新成功 checkpoint”以及按 checkpointId 定位读取 - 清理:支持按保留策略删除旧 checkpoint - 一致性:写入完成前不得对外可见“半成品”,避免恢复读到不完整快照 **实现**: - `LocalFileStorage`:本地文件存储(localfile 插件) - `HdfsStorage`:基于 Hadoop FileSystem 的存储(hdfs 插件,可通过插件配置指向不同文件系统) ## 3. 检查点流程 ### 3.1 触发检查点 ```mermaid sequenceDiagram participant Timer as 定期计时器 participant Coord as CheckpointCoordinator participant Plan as CheckpointPlan Timer->>Coord: 触发(按配置 interval) Coord->>Coord: 生成 checkpointId(123) Coord->>Coord: 检查条件 Note over Coord: • 最小暂停已过?
    • 未超过最大并发?
    • 先前检查点完成? Coord->>Coord: 创建 PendingCheckpoint(123) Coord->>Plan: 获取起始任务 loop 对每个起始任务 Coord->>Task: 发送 CheckpointBarrierTriggerOperation(123) end Coord->>Coord: 启动超时计时器(按配置 timeout) ``` **触发条件**: 1. 检查点间隔已过(`checkpoint.interval` 或引擎默认值) 2. 检查点之间的最小暂停已过(`min-pause` 或引擎默认值) 3. 触发时机与并发行为以当前实现为准(文档不绑定固定“最大并发 checkpoint”配置项) ### 3.2 屏障传播 ```mermaid sequenceDiagram participant Coord as 协调器 participant Source as SourceTask participant Transform as TransformTask participant Sink as SinkTask Coord->>Source: 触发屏障(123) Source->>Source: 接收屏障 Source->>Source: snapshotState() → 分片、偏移量 Source->>Coord: ACK(state) Source->>Transform: 转发屏障(123) Transform->>Transform: 接收屏障 Transform->>Transform: snapshotState() → 转换状态 Transform->>Coord: ACK(state) Transform->>Sink: 转发屏障(123) Sink->>Sink: 接收屏障 Sink->>Sink: prepareCommit() → commitInfo Sink->>Sink: snapshotState() → 写入器状态 Sink->>Coord: ACK(commitInfo + state) Coord->>Coord: 收到所有 ACK Coord->>Coord: 创建 CompletedCheckpoint ``` **屏障流动规则**: 1. **数据源任务**:管道起点,从协调器接收屏障 2. **转换任务**:从上游接收,快照,向下游转发 3. 
**数据 Sink 任务**:管道终点,从上游接收,快照,不转发 **屏障对齐**(对于具有多个输入的任务): 当一个任务有多个上游输入时,需要在本任务处形成一致性快照边界。典型做法是: - 先到达屏障的输入先“对齐等待”(暂时停止向下游发出该输入的后续数据) - 直到所有输入都收到同一 checkpointId 的屏障,才触发本地状态快照,并继续处理 对齐带来的直接影响是:上游数据倾斜/不均衡会放大等待时间,因此需要结合并行度、分区策略与 backpressure 做调优。 ### 3.3 状态快照 每种任务类型快照不同的状态: **SourceTask**: - 快照内容:reader 的“分片分配 + 分片内进度(偏移量/游标/切分点)” - 交互行为:上报 ACK(携带状态)给协调器,并向下游转发屏障以推进全局一致性边界 **TransformTask**: - 快照内容:算子状态(无状态算子通常为空状态) - 交互行为:上报 ACK,并转发屏障 **SinkTask**: - 快照内容:writer 的内部状态(例如未刷新的 buffer、事务句柄等) - 提交准备:在 checkpoint 边界生成“可提交但未提交”的提交信息(2PC 的 prepare 阶段) - 交互行为:上报 ACK(携带 writer state + commitInfo),作为管道终点不再转发屏障 ### 3.4 检查点完成 ```mermaid sequenceDiagram participant Coord as CheckpointCoordinator participant Pending as PendingCheckpoint participant Storage as CheckpointStorage participant Tasks as 所有任务 Pending->>Pending: 所有任务已 ACK Pending->>Coord: notifyCheckpointComplete() Coord->>Coord: 创建 CompletedCheckpoint Coord->>Storage: 持久化检查点 Storage-->>Coord: 成功 Note over Coord,Tasks: 持久化成功后,框架/引擎触发提交与清理回调(触发点取决于执行引擎实现) Coord->>Tasks: notifyCheckpointComplete(123) Tasks->>Tasks: 清理资源 Coord->>Storage: 删除旧检查点 ``` **完成步骤**: 1. 所有任务已确认 2. 从 `PendingCheckpoint` 创建 `CompletedCheckpoint` 3. 将检查点持久化到存储 4. 触发数据 Sink 提交(两阶段提交) 5. 通知所有任务完成 6. 清理旧检查点(保留最后 N 个) ### 3.5 检查点超时 协调器为每个进行中的 checkpoint 启动超时计时。 **超时触发后的语义**: - 将该次 checkpoint 标记为失败并清理其进行中状态 - 作业继续运行(仍以“最近一次成功 checkpoint”作为可恢复点) - 是否触发 failover 取决于作业容错策略与失败类型(例如连续失败、关键任务不可用等) **超时处理**: - 默认超时以引擎配置为准(作业可通过 `checkpoint.timeout` 覆盖) - 如果超时,检查点失败 - 作业继续使用先前的检查点 - 下一个检查点将按计划触发 ## 4. 
恢复过程 ### 4.1 从检查点恢复 ```mermaid sequenceDiagram participant JM as JobMaster participant Storage as CheckpointStorage participant Source as SourceTask participant Sink as SinkTask JM->>Storage: getLatestCheckpoint() Storage-->>JM: CompletedCheckpoint(123) JM->>JM: 按任务提取状态 JM->>Source: 使用 NotifyTaskRestoreOperation 部署 activate Source Source->>Source: restoreState(splits, offsets) Source->>Source: 定位(seek)到检查点偏移量 Source-->>JM: 就绪 deactivate Source JM->>Sink: 使用 NotifyTaskRestoreOperation 部署 activate Sink Sink->>Sink: restoreWriter(writerState) Sink->>Sink: 恢复未提交的事务 Sink-->>JM: 就绪 deactivate Sink JM->>Source: 开始执行 JM->>Sink: 开始执行 ``` **恢复步骤**: 1. JobMaster 从存储检索最新的 `CompletedCheckpoint` 2. 为每个任务提取状态(按 ActionStateKey 和 subtaskIndex) 3. 使用包含状态的 `NotifyTaskRestoreOperation` 部署任务 4. 任务恢复状态: - **SourceReader**:恢复分片和偏移量,并定位(seek)到对应的读取位置 - **Transform**:恢复转换状态(通常为空) - **SinkWriter**:恢复写入器状态,可能有未提交的事务 5. 任务转换到 READY_START 状态 6. 作业恢复执行 **示例:JDBC 数据源恢复**: 以 JDBC 为例,恢复需要满足两点: - 能把“分片 + 进度(offset/游标)”可靠序列化到 checkpoint - 能在恢复时把读取位置回放到该进度(例如通过主键范围、游标、时间戳或 connector 支持的 offset 语义) ### 4.2 精确一次恢复 检查点恢复 + 数据 Sink 两阶段提交的组合确保精确一次: ``` 检查点 N(已完成): 数据源偏移量:[100, 200, 300] 数据 Sink 准备的提交:[XID-1, XID-2, XID-3] 数据 Sink 提交器提交 XID-1、XID-2、XID-3 ↓ [失败] 从检查点 N 恢复: 1. 恢复数据源偏移量:[100, 200, 300] 2. 数据源从偏移量 100、200、300 开始读取 3. 数据 Sink 写入器恢复状态(可能有未提交的 XID) 4. 数据 Sink 提交器重试提交 XID(幂等) 结果:记录 0-99、100-199、200-299 精确提交一次 从 100+ 开始的记录重新处理但不重复(幂等提交) ``` ## 5. 
配置和调优 ### 5.1 检查点配置 ```hocon # 作业级(env):可覆盖 interval/timeout/min-pause env { checkpoint.interval = 60000 checkpoint.timeout = 600000 min-pause = 10000 } ``` 引擎侧(`config/seatunnel.yaml`)配置 checkpoint storage(示意): ```yaml seatunnel: engine: checkpoint: storage: type: hdfs max-retained: 3 plugin-config: namespace: /tmp/seatunnel/checkpoint_snapshot ``` 说明: - BATCH 模式下如果作业 env 未配置 `checkpoint.interval`,当前实现会禁用 checkpoint(以源码实现为准)。 - checkpoint storage 主要由引擎侧配置管理;作业级配置不应假设可以随意指定 storage type/path。 ### 5.2 调优指南 **检查点间隔**: - **短间隔(10-30s)**:快速恢复,但开销更高 - **中间隔(60-120s)**:平衡(推荐) - **长间隔(300-600s)**:低开销,但恢复较慢 **权衡**: - 更短的间隔 → 更频繁的 I/O → 更高的存储成本 - 更长的间隔 → 更少的开销 → 更长的恢复时间 **经验法则**:将间隔设置为可容忍的恢复时间(数据丢失窗口)。 **检查点超时**: - 应该 >> 检查点间隔 - 取决于状态大小和存储速度 - 默认值以引擎配置为准;建议结合状态大小与存储后端能力设置 **并发行为**: - 并发 checkpoint 的能力与策略以当前实现为准;架构文档不绑定固定的“最大并发 checkpoint”配置项 **存储选择**: - **localfile**:仅测试/单机场景,无 HA - **hdfs**:生产环境常用(hdfs 插件基于 Hadoop FileSystem,可通过插件配置对接不同文件系统后端) ## 6. 性能优化 ### 6.1 异步检查点 异步 checkpoint 能降低对数据处理主路径的阻塞(是否异步、异步程度取决于具体实现): 核心思路是把“生成快照引用/拷贝(快)”与“序列化 + 上传(慢)”解耦: - 任务线程快速冻结一份一致性快照(或引用)后立即继续处理 - 后台线程异步完成序列化与外部存储写入 这样可以降低对数据处理主路径的阻塞,但也需要关注异步积压导致的内存压力。 ### 6.2 增量检查点(未来) 仅检查点更改的状态: - 完整 checkpoint:第一次需要上传全量状态 - 增量 checkpoint:后续只上传变化部分,并以链式/引用方式组织快照 **好处**: - 减少检查点时间 - 降低存储 I/O - 更快的检查点完成 **挑战**: - 更复杂的状态管理 - 需要跟踪状态变化 - 恢复需要增量链 ### 6.3 本地状态后端(未来) 在本地存储热状态,仅检查点摘要: 典型做法是把热状态存到本地(例如 RocksDB),checkpoint 时只上传“可恢复的快照引用/元数据”,从而降低远端存储压力。 ## 7. 最佳实践 ### 7.1 状态大小优化 **1. 保持状态小**: - 避免把“可重放的数据本身”放进状态(会放大 checkpoint 体积与时延) - 只保存“可定位读取位置”的最小信息(offset/游标/分片进度),把数据重放交给上游存储或 connector 的读取语义 **2. 使用高效的序列化**: - 优先使用 Protobuf、Kryo 而不是 Java 序列化 - 压缩大状态(gzip、snappy) ### 7.2 监控 **关键指标(示例,名称以实际 metrics 实现为准)**: - checkpoint_duration:从触发到完成的时间 - checkpoint_size:持久化检查点的大小 - checkpoint_failure_rate:失败检查点的比例 - checkpoint_alignment_duration:屏障对齐所花费的时间 **告警**: - 告警阈值需结合业务可接受的恢复窗口与存储后端能力制定 - 如果在 2x 间隔内没有完成检查点则告警 ### 7.3 故障排除 **问题**:检查点超时 **可能原因**: 1. 任务卡住(数据处理缓慢) 2. 大状态(序列化/上传缓慢) 3. 慢速存储(网络/磁盘 I/O) 4. 
屏障对齐缓慢(数据倾斜) **解决方案**: - 增加检查点超时 - 优化状态大小 - 使用更快的存储 - 调整并行度 **问题**:高检查点开销 **可能原因**: 1. 检查点间隔太短 2. 大状态大小 3. 慢速存储 **解决方案**: - 增加检查点间隔 - 优化状态大小 - 启用增量检查点(可用时) ## 8. 相关资源 - [架构概览](../overview.md) - [设计理念](../design-philosophy.md) - [引擎架构](../engine/engine-architecture.md) - [数据 Sink 架构](../api-design/sink-architecture.md) - [精确一次语义](exactly-once.md) ## 9. 参考资料 ### 学术论文 - Chandy, K. M., & Lamport, L. (1985). ["Distributed Snapshots: Determining Global States of Distributed Systems"](https://lamport.azurewebsites.net/pubs/chandy.pdf) - Carbone, P., et al. (2017). ["State Management in Apache Flink"](http://www.vldb.org/pvldb/vol10/p1718-carbone.pdf) ### 进一步阅读 - [Apache Flink 检查点](https://nightlies.apache.org/flink/flink-docs-stable/docs/dev/datastream/fault-tolerance/checkpointing/) - [Spark 结构化流检查点](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovering-from-failures-with-checkpointing) ================================================ FILE: docs/zh/architecture/fault-tolerance/exactly-once.md ================================================ --- sidebar_position: 2 title: 精确一次语义 --- # 精确一次语义 ## 1. 概述 ### 1.1 问题背景 分布式数据处理面临基本的交付保证挑战: - **至多一次**: 记录可能丢失(对关键数据不可接受) - **至少一次**: 记录可能重复(导致计数错误、重复收费) - **精确一次**: 每条记录恰好处理一次(理想但复杂) **实际影响**: ``` 场景: 金融交易处理 至少一次: 交易 $100 处理两次 → 用户被收费 $200 ❌ 精确一次: 交易 $100 处理一次 → 用户被收费 $100 ✅ ``` ### 1.2 设计目标 SeaTunnel 的精确一次语义旨在: 1. **端到端语义**: 在启用 checkpoint 且外部系统支持事务/幂等提交等前提下,尽量提供可验证的一致性语义(避免丢失或重复可见) 2. **透明实现**: 框架处理复杂性,用户最少配置 3. **性能效率**: 在维护保证的同时最小化开销 4. **故障弹性**: 在任务/工作节点/主节点故障时维护保证 5. **广泛适用性**: 支持事务型和非事务型目标端 ### 1.3 一致性级别 | 级别 | 保证 | 用例 | 实现 | |------|------|------|------| | **至多一次** | 无重复,可能丢失 | 非关键日志 | 无重试 | | **至少一次** | 无丢失,可能重复 | 幂等处理 | 重试但无事务 | | **精确一次** | 无丢失,无重复 | 金融、计费、审计 | 检查点 + 两阶段提交 | ## 2. 理论基础 ### 2.1 Chandy-Lamport 算法 **概念**: 无需停止整个系统的分布式快照。 **机制**: 1. 协调器向数据流注入**屏障**(标记) 2. 收到屏障后,每个算子: - 快照其本地状态 - 将屏障转发到下游 3. 
当所有算子都完成快照时,即得到一个**一致的全局快照**
CommitInfo 不能产生重复数据;典型做法是利用外部系统的事务 ID / 唯一键 / 幂等 API ## 4. 实现模式 ### 4.1 事务型目标端(XA) **典型场景**: 支持 XA/2PC 的事务型数据库等 **实现**: 实现要点: - Writer 使用 XA/事务能力将写入暂存于事务中 - 在 prepareCommit 阶段产出可被提交器识别的事务标识(CommitInfo) - Committer 在 checkpoint 完成后提交事务,并对重复 commit 做幂等处理 **优点**: - 强一致性保证 - 失败时自动回滚 **缺点**: - 需要数据库 XA 支持 - 更高延迟(2PC 开销) - 准备阶段期间锁争用 ### 4.2 幂等目标端(Upsert) **典型场景**: 支持 upsert/merge 或自然幂等写入的目标端(例如按主键覆盖写入的存储) **实现**: 实现要点: - 为每条记录选择一个确定性的幂等键(通常来自主键/业务唯一键) - 外部系统使用“按键覆盖/更新”(Upsert)语义:同一幂等键多次写入,最终只保留一个结果 - prepareCommit 只需要保证批次边界(例如 flush 缓冲),不一定需要单独的 commit 阶段 **关键**: 相同主键 → 相同文档 → 幂等更新 **优点**: - 无事务开销 - 更低延迟 **缺点**: - 需要唯一键 - 无法处理复杂事务 ### 4.3 基于日志的目标端(Kafka) **实现**: 实现要点: - 使用 Kafka 事务能力将一个 checkpoint 边界内的写入纳入同一个事务 - prepareCommit 阶段完成 flush 并产出事务标识(CommitInfo) - commit 阶段提交事务,使消息对下游消费者可见 - 对故障恢复时的重复提交,需要依赖 Kafka 事务/幂等机制保证不会产生重复可见结果 ### 4.4 文件目标端(原子重命名) **实现**: 实现要点: - Writer 将数据写入临时路径/临时文件(对外不可见) - prepareCommit 阶段封存临时文件并产出 CommitInfo(临时路径 + 目标路径) - Committer 只做“原子可见化”动作(例如原子重命名/原子移动) - 需要确认底层文件系统对 rename/move 的原子性语义;在对象存储上往往需要额外设计(否则不能直接宣称精确一次) **关键**: 原子重命名确保文件要么完全可见要么不可见。 ## 5. 故障场景和恢复 ### 5.1 检查点前任务故障 ``` 时间线: t0: 检查点 N 完成 t1: 处理记录 [1000-2000] t2: 任务失败 ❌ t3: 从检查点 N 恢复 t4: 重新处理记录 [1000-2000] 结果: ✅ 无数据丢失(记录重新处理) ✅ 无重复(故障前未提交任何内容) ``` ### 5.2 prepareCommit 后任务故障 ``` 时间线: t0: 检查点 N 进行中 t1: SinkWriter.prepareCommit(...) → XID-123 已准备 t2: 任务失败 ❌ (提交前) t3: 从检查点 N-1 恢复 t4: 重新处理记录 t5: 新的 prepareCommit(...) → XID-124 已准备 t6: 提交器提交 XID-124 结果: ✅ XID-123 从未提交(超时后自动回滚) ✅ XID-124 已提交(正确数据) ``` ### 5.3 提交期间提交器故障 ``` 时间线: t0: 检查点 N 完成 t1: 提交器开始提交 [XID-100, XID-101, XID-102] t2: 提交 XID-100 ✅ t3: 提交器失败 ❌ (XID-101, XID-102 未提交) t4: 新提交器重试 [XID-100, XID-101, XID-102] t5: 提交 XID-100 (已提交,幂等) ✅ t6: 提交 XID-101 ✅ t7: 提交 XID-102 ✅ 结果: ✅ 所有 XID 最终提交 ✅ 无重复(幂等提交) ``` ### 5.4 网络分区 ``` 时间线: t0: SinkWriter 准备 XID-200 t1: 检查点完成 t2: 提交器发送 commit(XID-200) t3: 网络分区 ⚠️ (提交成功,但 ACK 丢失) t4: 提交器重试 commit(XID-200) t5: XID-200 已提交(幂等) 结果: ✅ 数据恰好提交一次 ✅ 幂等性防止重复 ``` ## 6. 
幂等性要求 ### 6.1 为什么幂等性很重要 **问题**: 网络故障、重试和故障转移可能导致重复的提交尝试。 **解决方案**: 提交器操作必须是幂等的。 典型对比: - **非幂等提交**: 重试一次就会额外插入一份数据(产生重复) - **幂等提交**: 重试多次与提交一次效果一致(例如使用唯一键约束/Upsert/事务 ID 去重) ### 6.2 实现幂等性 **策略 1: 检查后执行** 要点: - 提交前先查询“该 CommitInfo 是否已完成提交”(通过事务表、元数据表、外部系统 API) - 已提交则直接返回成功;未提交则提交并记录结果 **策略 2: 数据库级幂等性** 要点: - 使用唯一约束/唯一索引来承载“去重键”(事务 ID / 批次 ID / checkpointId) - 将“写入去重标记”和“应用外部副作用”放在同一事务或同一原子语义内,避免部分成功导致的不一致 **策略 3: 自然幂等性(XA)** 要点: - 依赖 XA 协议本身对重复 commit 的处理语义 - 对“已提交/不存在”的错误码进行兼容处理,将其视为幂等成功 ## 7. 性能考虑 ### 7.1 检查点间隔权衡 ``` 短间隔(10-30s): ✅ 快速恢复(重新处理更少) ❌ 更高开销(频繁快照) ❌ 更多提交操作 长间隔(5-10分钟): ✅ 更低开销(快照更少) ❌ 恢复更慢(重新处理更多) ✅ 更少提交操作 ``` **建议**: 大多数工作负载 60-120 秒 ### 7.2 批量大小优化 优化思路: - 使用批量写入将外部系统交互的固定开销摊薄(例如每 1000 条 flush 一次) - 批量过大可能增加延迟与内存占用;批量过小会增加外部 I/O 次数 **影响**: 1000x 批量 → ~10x 吞吐量提升 ### 7.3 异步检查点 优化思路: - 在 barrier 到达时尽快做“轻量快照”(例如复制状态引用/增量快照元数据) - 将序列化与上传等重 I/O 工作放到异步线程执行,减少对主处理线程的阻塞 - 需要权衡:异步快照会增加内存峰值(需要暂存 snapshot),并要求正确处理并发可见性 **影响**: 快照上传时数据处理继续 ## 8. 配置 ### 8.1 启用精确一次 ```hocon env { # 检查点配置 checkpoint.interval = 60000 # 60 秒 checkpoint.timeout = 600000 # 10 分钟 # 精确一次模式(vs 至少一次) # 使用事务型目标端时这是隐式的 } ``` ### 8.2 数据源配置 **Kafka**: ```hocon source { Kafka { bootstrap.servers = "localhost:9092" topic = "my_topic" # Kafka 消费者偏移量提交 commit_on_checkpoint = true # 检查点后提交偏移量 } } ``` **JDBC**: ```hocon source { JDBC { url = "jdbc:mysql://..." # 基于查询的数据源(幂等重新处理) query = "SELECT * FROM table WHERE id >= ? AND id < ?" } } ``` ### 8.3 目标端配置 **JDBC (XA)**: ```hocon sink { JDBC { url = "jdbc:mysql://..." # 启用 XA 事务 xa_data_source_class_name = "com.mysql.cj.jdbc.MysqlXADataSource" is_exactly_once = true } } ``` **Kafka (事务)**: ```hocon sink { Kafka { bootstrap.servers = "localhost:9092" topic = "output_topic" # Kafka 事务 transaction.id = "seatunnel-kafka-sink" enable.idempotence = true } } ``` ## 9. 测试精确一次 ### 9.1 功能测试 建议的功能测试步骤: 1. 向数据源注入固定集合的记录(可重复、可计数、最好带主键) 2. 触发/等待至少一个 checkpoint 完成 3. 在关键窗口注入故障(例如 prepareCommit 之后、commit 之前;或 barrier 对齐期间) 4. 恢复后继续运行并结束作业 5. 
验证输出端:输入计数 = 输出计数,且基于主键/去重键无重复 ### 9.2 混沌测试 建议的混沌测试维度: - 随机杀任务/杀 worker/重启 master - 注入网络延迟、短暂网络分区、外部存储抖动 - 暂停/延迟 checkpoint 触发,模拟对齐与上传压力 验收标准: - 输入计数与输出计数一致 - 输出端无重复(主键/去重键唯一) - 对关键失败窗口(prepareCommit/commit)覆盖到位 ### 9.3 监控验证 ``` 要跟踪的指标: source.records_read = 1,000,000 sink.records_written = 1,000,000 sink.records_committed = 1,000,000 ✅ 所有计数匹配 → 精确一次验证 ``` ## 10. 最佳实践 ### 10.1 选择适当的目标端 **使用事务型目标端(XA)用于**: - 金融交易 - 计费系统 - 审计日志 - 关键数据 **使用幂等目标端用于**: - 高吞吐量场景 - 可接受最终一致性 - 无事务支持 ### 10.2 处理有毒记录 处理建议: - 明确“有毒记录”的判定范围(格式错误/约束冲突/不可恢复的业务异常) - 选择策略:写入死信队列(DLQ)并告警、跳过并计数、或触发失败(强一致场景) - 与精确一次语义的关系:跳过会破坏端到端“无丢失”,但可能是可接受的业务权衡;需在文档/配置中显式声明 ### 10.3 监控检查点健康 **关键指标**: - `checkpoint.duration`: 应 < 间隔的 10% - `checkpoint.failure_rate`: 应 < 1% - `checkpoint.size`: 监控随时间增长 **警报**: ``` 如果 checkpoint.duration > 300s 则告警 如果 checkpoint.failure_rate > 5% 则告警 如果在 2x 间隔内无检查点则告警 ``` ## 11. 相关资源 - [检查点机制](checkpoint-mechanism.md) - [目标端架构](../api-design/sink-architecture.md) - [数据源架构](../api-design/source-architecture.md) - [引擎架构](../engine/engine-architecture.md) ## 12. 参考资料 ### 学术论文 - Chandy & Lamport (1985): ["Distributed Snapshots"](https://lamport.azurewebsites.net/pubs/chandy.pdf) - Gray & Lamport (2006): ["Consensus on Transaction Commit"](https://lamport.azurewebsites.net/pubs/paxos-commit.pdf) - Carbone et al. (2017): ["State Management in Apache Flink"](http://www.vldb.org/pvldb/vol10/p1718-carbone.pdf) ### 进一步阅读 - [两阶段提交协议](https://en.wikipedia.org/wiki/Two-phase_commit_protocol) - [XA 事务](https://pubs.opengroup.org/onlinepubs/009680699/toc.pdf) - [Kafka 精确一次](https://www.confluent.io/blog/exactly-once-semantics-are-possible-heres-how-apache-kafka-does-it/) ================================================ FILE: docs/zh/architecture/features/multi-table.md ================================================ --- sidebar_position: 3 title: 多表同步 --- # 多表同步架构 ## 1. 概述 ### 1.1 问题背景 数据库迁移和 CDC 场景通常需要同步数百张表: - **资源效率**: 如何避免为每张表创建一个作业? - **一致快照**: 如何确保所有表从同一时间点开始? - **模式路由**: 如何将数据路由到正确的目标表? 
- **独立模式**: 如何处理每张表的不同模式? - **并行写入**: 如何最大化多表的吞吐量? ### 1.2 设计目标 SeaTunnel 的多表同步旨在: 1. **单作业,多表**: 在一个作业中同步数百张表 2. **资源效率**: 跨表共享资源 3. **模式独立**: 每张表维护自己的模式 4. **动态路由**: 根据表标识将记录路由到正确的目标端 5. **水平扩展**: 支持副本写入器以实现高吞吐量 ### 1.3 用例 **数据库迁移**: ```hocon source { MySQL-CDC { # 捕获数据库中的所有表 database-name = "my_db" table-name = ".*" # 正则表达式: 所有表 } } sink { Jdbc { # 写入 PostgreSQL url = "jdbc:postgresql://..." } } ``` **多表 CDC**: ```hocon source { MySQL-CDC { table-name = "order_.*|user_.*|product_.*" # 多个表模式 } } sink { Elasticsearch { # 每张表对应不同的索引 } } ``` ## 2. 核心抽象 ### 2.1 TablePath 用于将记录路由到表的唯一标识符。 TablePath 由三段信息组成: - **databaseName**: 数据库名 - **schemaName**: schema 名(对无 schema 的系统可为空或使用默认值) - **tableName**: 表名 它需要满足两个要求: - **可稳定序列化**: 能被序列化为唯一字符串(例如 `db.schema.table`)并在链路上传播 - **可逆**: 能从字符串/结构化字段反解析回 TablePath **示例**: - my_db.public.orders - my_db.public.users ### 2.2 SeaTunnelRow 带 TableId 记录携带表标识用于路由。 多表场景中,一条记录除了字段本身,还必须携带: - **tableId**: 表标识(通常是 TablePath 的序列化形式) - **rowKind**: 变更类型(INSERT/UPDATE/DELETE 等) 路由侧通过 tableId 还原出 TablePath,再决定写入到哪个目标表/索引。 ### 2.3 SinkIdentifier 目标端写入器的唯一标识符(表 + 副本索引)。 SinkIdentifier 的作用是把“写入目标”精确到: - **表标识**: TablePath/TableIdentifier - **副本索引**: index(用于同一张表的多 writer 副本并行写入) 示例: - (orders, 0), (orders, 1) - (users, 0), (users, 1) ## 3. MultiTableSource 架构 多表 Source 的具体实现取决于 connector(例如 CDC connector 往往以“库/表”为维度产出变更)。 为了让下游能按表路由,核心要求是: - 输出的每条 `SeaTunnelRow` 必须携带 `tableId`(通常为 `TablePath` 的序列化字符串) - 变更流场景还需要携带 `rowKind`(INSERT/UPDATE/DELETE 等),便于下游做正确语义处理 至于“内部是否维护 TablePath→Reader/Enumerator 映射、如何做多表公平调度、是否共享底层连接”等,属于 connector 自身的实现选择,文档不做强绑定描述。 ## 4. MultiTableSink 架构 ### 4.1 结构 MultiTableSink 是一个“按表路由 + 可多副本并行写入”的 Sink: - 内部维护 **TablePath → SeaTunnelSink** 的映射(每张表一个底层 sink) - 通过 **replicaNum** 为每张表创建多个 writer 副本以提升写入吞吐 - 依赖 catalogTables 提供各表 schema 信息(用于写入/类型转换/DDL 处理) - 运行时要求底层 `SinkWriter` 支持多表能力(例如实现 `SupportMultiTableSinkWriter`),以提供主键路由信息与多表资源管理能力;不满足该能力的 sink 不适用于 `MultiTableSink` ### 4.2 写入器: 带副本的多表写入 写入器的关键流程: 1. 
从输入记录中解析 TablePath(tableId) 2. 为该表选择一个 writer 副本(replicaIndex) 3. 路由到 (TablePath, replicaIndex) 对应的底层 writer 执行写入 副本选择需要兼顾两类诉求: - **顺序性/一致落点**: 对同一主键(或唯一键)相关的记录尽量路由到同一副本,降低乱序与写入冲突风险 - **吞吐量**: 在不破坏顺序性要求的前提下,尽量分散写入压力 在当前 MultiTableSinkWriter 的实现中,副本选择主要依据“主键信息是否可用”: - 有主键:对主键字段做哈希,稳定映射到某个副本 - 无主键:使用随机策略在副本间分配 这意味着“是否按 rowKind(INSERT/UPDATE/DELETE)切换策略”不是该实现的默认行为;如果需要按 rowKind 细分策略,应以 connector/实现代码为准。 在 checkpoint 边界: - prepareCommit: 汇总所有表/所有副本的 CommitInfo,并打包为多表级提交信息 - snapshotState: 快照所有 writer 状态;恢复时必须能通过 SinkIdentifier 将状态路由回正确的(表,副本) ### 4.3 提交器: 多表提交协调 提交器的核心责任是把多表提交信息“拆回每张表”,并委托给对应表的底层 committer: 1. 解析 commitInfos,将其按 TablePath 分组 2. 对每个表调用对应的 SinkCommitter.commit(tableCommitInfos) 3. 汇总失败列表并按框架约定触发重试/回滚 注意事项: - commit 必须幂等(可能被重试) - 单表提交失败的处理策略需要明确:是整体失败(保守)还是允许部分表推进(取决于端到端一致性要求) - abort/回滚相关的触发点与语义在不同执行引擎中可能不同,不能在文档层面假设一定会对每个子 sink 执行 abort;务必保证整体可重试、commit 幂等 ## 5. 副本机制 ### 5.1 为什么需要副本? **问题**: 每张表的单个写入器成为高吞吐量表的瓶颈。 **解决方案**: 每张表多个副本写入器用于并行写入。 ``` 无副本: orders 表(1000 写入/秒) → [单个写入器] → 瓶颈 有副本(replicaNum=4): orders 表(1000 写入/秒) → [写入器 0] (250 写入/秒) → [写入器 1] (250 写入/秒) → [写入器 2] (250 写入/秒) → [写入器 3] (250 写入/秒) ``` ### 5.2 副本配置 ```hocon sink { Jdbc { url = "..." # 多表配置 multi_table_sink_replica = 4 # 写入器副本数(对所有表生效) } } ``` ### 5.3 副本选择策略 **基于主键哈希(稳定路由)**: 要点: - 以主键(或业务唯一键)做哈希,将同一键稳定映射到同一副本 - 典型映射: $replica = hash(pk) \bmod replicaNum$ **随机(无主键兜底)**: 要点: - 当记录缺少主键字段信息时,无法提供稳定落点 - 使用随机分配在副本间扩散压力,但不保证同一键的顺序性 ## 6. 多表中的模式管理 ### 6.1 独立模式 每张表维护自己的 CatalogTable/Schema: - 运行时根据 TablePath 查询对应的 schema,用于类型转换与写入 - 不同表之间 schema 互不影响,避免“全局 schema”导致的兼容性冲突 ### 6.2 模式演化路由 模式演化需要被路由到“正确的表”,并应用到该表的所有 writer 副本: 1. 从 SchemaChangeEvent 中解析出 TablePath 2. 选择该表对应的 schema/元数据更新逻辑 3. 将变更广播到该表的所有副本 writer,保证后续写入使用一致的 schema ## 7. 
数据流示例 ### 7.1 完整流水线 ``` ┌──────────────────────────────────────────────────────────────┐ │ MySQL CDC 数据源 │ │ • 从 100 张表捕获变更 │ │ • 用 TablePath 标记每行 │ └──────────────────────────────┬───────────────────────────────┘ │ ▼ ┌─────────────────────────────────────┐ │ SeaTunnelRow (带 TablePath) │ │ tableId: "my_db.public.orders" │ │ fields: [1, "order-001", 99.99] │ └─────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────┐ │ MultiTableSinkWriter │ │ • 从行中提取 TablePath │ │ • 选择副本(按主键哈希或随机) │ │ • 路由到正确的写入器 │ └──────────────────────────────┬───────────────────────────────┘ │ ┌──────────────────┼──────────────────┐ ▼ ▼ ▼ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ orders │ │ users │ │ products │ │ 写入器 0 │ │ 写入器 0 │ │ 写入器 0 │ │ 写入器 1 │ │ 写入器 1 │ │ 写入器 1 │ │ 写入器 2 │ │ │ │ │ │ 写入器 3 │ │ │ │ │ └──────────────┘ └──────────────┘ └──────────────┘ │ │ │ ▼ ▼ ▼ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ PostgreSQL │ │ PostgreSQL │ │ PostgreSQL │ │ orders │ │ users │ │ products │ └──────────────┘ └──────────────┘ └──────────────┘ ``` ### 7.2 写入流程 ```mermaid sequenceDiagram participant Source as MySQL CDC participant Writer as MultiTableSinkWriter participant OrderWriter as Order 写入器 (副本 0) participant UserWriter as User 写入器 (副本 0) participant PG as PostgreSQL Source->>Writer: Row(tableId="orders", data=[...]) Writer->>Writer: Extract TablePath("orders") Writer->>Writer: Select replica (pk-hash / random) → 0 Writer->>OrderWriter: write(row) OrderWriter->>PG: write Source->>Writer: Row(tableId="users", data=[...]) Writer->>Writer: Extract TablePath("users") Writer->>Writer: Select replica (pk-hash / random) → 0 Writer->>UserWriter: write(row) UserWriter->>PG: write ``` ### 7.3 检查点流程 ```mermaid sequenceDiagram participant CP as CheckpointCoordinator participant Writer as MultiTableSinkWriter participant W1 as Order 写入器 0 participant W2 as Order 写入器 1 participant W3 as User 写入器 0 CP->>Writer: triggerBarrier(checkpointId) 
Writer->>W1: prepareCommit() W1-->>Writer: CommitInfo(orders, replica=0) Writer->>W2: prepareCommit() W2-->>Writer: CommitInfo(orders, replica=1) Writer->>W3: prepareCommit() W3-->>Writer: CommitInfo(users, replica=0) Writer->>CP: ACK([CommitInfo1, CommitInfo2, CommitInfo3]) ``` ## 8. 性能优化 ### 8.1 副本大小设置 **经验法则**: ``` replicaNum = ceil(表写入速率 / 单个写入器吞吐量) 示例: orders: 10,000 写入/秒 单个写入器: 2,500 写入/秒 replicaNum = ceil(10,000 / 2,500) = 4 ``` ### 8.2 表特定副本 优化思路: - 不同表的写入速率差异很大时,理想情况下应允许按表配置不同的副本数 - 但在当前实现中,`multi_table_sink_replica` 是对所有表生效的全局配置;如果需要“按表覆盖”,需要 connector/框架层提供额外能力 ### 8.3 批量写入 优化思路: - 为每个 (TablePath, replicaIndex) 维护独立缓冲区,避免不同表/不同副本相互干扰 - 达到 batch-size 或超时阈值时触发 flush,将外部系统交互开销摊薄 - 需要关注内存上限:多表 × 多副本 × 批次缓存会放大峰值占用 ## 9. 监控和可观测性 ### 9.1 关键指标 多表场景下建议至少具备以下维度的可观测性(具体指标命名以 connector/引擎实现为准): - 按 `tableId` 维度的写入条数/字节数/延迟 - 按(表,副本)维度的写入分布与队列堆积情况(用于判断是否存在热点) - 全局维度的表数量、writer 数量、整体吞吐与失败重试次数 ### 9.2 监控仪表板 ``` 多表作业: mysql-to-postgres 表: 100 写入器: 250 (平均每张表 2.5 个副本) 吞吐量: 50,000 记录/秒 按吞吐量排名的表: 1. orders: 15,000 记录/秒 (4 个副本) 2. events: 10,000 记录/秒 (4 个副本) 3. users: 5,000 记录/秒 (2 个副本) ... 副本分布: orders: 副本 0: 3,750 记录/秒 (25%) 副本 1: 3,800 记录/秒 (25.3%) 副本 2: 3,700 记录/秒 (24.7%) 副本 3: 3,750 记录/秒 (25%) ``` ## 10. 最佳实践 ### 10.1 表选择 **使用正则表达式模式**: ```hocon source { MySQL-CDC { # 包含特定模式 table-name = "order_.*|user_.*" } } ``` ### 10.2 副本配置 **保守开始**: ```hocon sink { Jdbc { # 从 1 个副本开始,如果出现瓶颈则增加 multi_table_sink_replica = 1 } } ``` **监控和调优**: 如果单副本写入成为瓶颈(例如写入延迟持续升高、队列堆积明显),可逐步增加 `multi_table_sink_replica` 并结合目标端能力评估收益。 ### 10.3 模式管理 **预创建目标表**: ```sql -- 更好: 预创建所有目标表 CREATE TABLE orders (...); CREATE TABLE users (...); CREATE TABLE products (...); ``` **谨慎启用自动创建**: ```hocon sink { Jdbc { # 作业启动阶段:若表不存在则创建(用于首次建表) schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" # 说明:运行时 schema 变更由 CDC source 的 `schema-changes.enabled` 控制; # 是否能自动应用新增/删除列等变更取决于 JDBC 方言与目标端能力。 } } ``` ## 13. 
相关资源 - [CatalogTable 和元数据](../api-design/catalog-table.md) - [目标端架构](../api-design/sink-architecture.md) - [DAG 执行](../engine/dag-execution.md) - [模式演化](../../introduction/concepts/schema-evolution.md) ## 14. 参考资料 如需进一步了解 Schema、Sink 语义与 DAG 执行,请从“相关资源”章节继续阅读。 ================================================ FILE: docs/zh/architecture/overview.md ================================================ --- sidebar_position: 1 title: 架构概览 --- # SeaTunnel 架构概览 ## 1. 简介 ### 1.1 设计目标 SeaTunnel 设计为分布式多模态数据集成工具,具有以下核心目标: - **引擎独立性**:将连接器逻辑尽量与执行引擎解耦;连接器可通过转换层适配到不同引擎,具体可用性以连接器能力与引擎支持为准 - **超高性能**:支持高吞吐、低延迟的大规模数据同步 - **容错性**:在启用 checkpoint 且外部系统支持事务/幂等提交等前提下,通过分布式快照与提交协议提供可验证的一致性语义 - **易用性**:提供简单的配置方式和丰富的连接器生态系统 - **可扩展性**:基于插件的架构,便于添加新的连接器和转换组件 ### 1.2 目标场景 - **批量数据同步**:异构数据源之间的大规模批量数据迁移 - **实时数据集成**:支持 CDC 的流式数据捕获和同步 - **数据湖/仓入库**:高效加载数据到数据湖(Iceberg、Hudi、Delta Lake)和数据仓库 - **多表同步**:在单个作业中同步多个表,支持模式演化 ## 2. 整体架构 SeaTunnel 采用分层架构,实现关注点分离和灵活性: ``` ┌─────────────────────────────────────────────────────────────────┐ │ 用户配置层 │ │ (HOCON 配置 / SQL) │ └─────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────┐ │ SeaTunnel API 层 │ │ (数据源 API / 数据 Sink API / 转换 API / 表 API) │ │ │ │ • SeaTunnelSource • CatalogTable │ │ • SeaTunnelSink • TableSchema │ │ • SeaTunnelTransform • SchemaChangeEvent │ └─────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────┐ │ 连接器生态系统 │ │ │ │ [Jdbc] [Kafka] [MySQL-CDC] [Elasticsearch] [Iceberg] ... 
│ │ (连接器生态) │ └─────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────┐ │ 转换层 │ │ (将 SeaTunnel API 适配到引擎特定 API) │ │ │ │ • FlinkSource/FlinkSink • SparkSource/SparkSink │ │ • 上下文适配器 • 序列化适配器 │ └─────────────────────────────────────────────────────────────────┘ │ ┌─────────────────────┼─────────────────────┐ ▼ ▼ ▼ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ SeaTunnel │ │ Apache │ │ Apache │ │ Engine (Zeta)│ │ Flink │ │ Spark │ │ │ │ │ │ │ │ • 主节点 │ │ • JobManager │ │ • Driver │ │ • 工作节点 │ │ • TaskManager│ │ • Executor │ │ • 检查点 │ │ • State │ │ • RDD/DS │ └──────────────┘ └──────────────┘ └──────────────┘ ``` ### 2.1 层级职责 | 层级 | 职责 | 核心组件 | |-----|------|---------| | **配置层** | 作业定义、参数配置 | HOCON 解析器、SQL 解析器、配置验证 | | **API 层** | 连接器的统一抽象 | 数据源/数据 Sink /转换接口、CatalogTable | | **连接器层** | 数据源/Sink 实现 | 连接器实现(JDBC、Kafka、CDC 等) | | **转换层** | 引擎特定适配 | Flink/Spark 适配器、上下文包装器 | | **引擎层** | 作业执行和资源管理 | 调度、容错、状态管理 | ## 3. 
核心组件 ### 3.1 SeaTunnel API API 层提供引擎独立的抽象: #### 数据源 Source API - **SeaTunnelSource**:创建读取器和枚举器的工厂接口 - **SourceSplitEnumerator**:主节点侧组件,负责分片生成和分配 - **SourceReader**:工作节点侧组件,负责从分片读取数据 - **SourceSplit**:表示数据分区的最小可序列化单元 **关键设计**:协调(枚举器)与执行(读取器)分离,实现高效的并行处理和容错。 #### 数据 Sink API - **SeaTunnelSink**:创建写入器和提交器的工厂接口 - **SinkWriter**:工作节点侧组件,负责写入数据 - **SinkCommitter**:多个写入器的提交操作协调器 - **SinkAggregatedCommitter**:聚合提交的全局协调器 **关键设计**:两阶段提交协议(prepareCommit → commit)在外部系统支持事务/幂等提交且启用 checkpoint 的前提下,可提供一致性语义。 #### 转换 API - **SeaTunnelTransform**:数据转换接口 - **SeaTunnelMapTransform**:1:1 转换 - **SeaTunnelFlatMapTransform**:1:N 转换 #### 表 API - **CatalogTable**:完整的表元数据(模式、分区键、选项) - **TableSchema**:模式定义(列、主键、约束) - **SchemaChangeEvent**:表示模式演化的 DDL 变更 ### 3.2 SeaTunnel Engine (Zeta) 原生执行引擎提供: #### 主节点组件 - **CoordinatorService**:管理所有运行中的 JobMaster - **JobMaster**:管理单个作业生命周期、生成物理计划、协调检查点 - **CheckpointCoordinator**:每个管道协调分布式快照 - **ResourceManager**:管理工作节点资源和槽位分配 #### 工作节点组件 - **TaskExecutionService**:部署和执行任务 - **SeaTunnelTask**:执行数据源 Source/转换/数据 Sink 逻辑 - **FlowLifeCycle**:管理数据源 Source/转换/数据 Sink 组件的生命周期 #### 执行模型 ``` LogicalDag → PhysicalPlan → SubPlan (管道) → PhysicalVertex → TaskGroup → SeaTunnelTask ``` ### 3.3 转换层 通过适配器模式实现引擎可移植性: - **FlinkSource/FlinkSink**:将 SeaTunnel API 适配到 Flink 的数据源/Sink 接口 - **SparkSource/SparkSink**:将 SeaTunnel API 适配到 Spark 的 RDD/Dataset 接口 - **上下文适配器**:包装引擎特定的上下文(SourceReaderContext、SinkWriterContext) - **序列化适配器**:桥接 SeaTunnel 和引擎序列化机制 ### 3.4 连接器生态系统 所有连接器遵循标准化结构: ``` connector-[name]/ ├── src/main/java/.../ │ ├── [Name]Source.java # 实现 SeaTunnelSource │ ├── [Name]SourceReader.java # 实现 SourceReader │ ├── [Name]SourceSplitEnumerator.java │ ├── [Name]SourceSplit.java │ ├── [Name]Sink.java # 实现 SeaTunnelSink │ ├── [Name]SinkWriter.java # 实现 SinkWriter │ └── config/[Name]Config.java └── src/main/resources/META-INF/services/ ├── org.apache.seatunnel.api.table.factory.TableSourceFactory └── org.apache.seatunnel.api.table.factory.TableSinkFactory ``` **发现机制**:Java 
SPI(服务提供者接口)用于动态连接器加载。 ## 4. 数据流模型 ### 4.1 数据读取 Source 端数据流 ``` 数据源 Source │ ▼ ┌─────────────────────┐ │ SourceSplitEnumerator│ (主节点侧) │ • 生成分片 │ │ • 分配给读取器 │ └─────────────────────┘ │ (分片分配) ▼ ┌─────────────────────┐ │ SourceReader │ (工作节点侧) │ • 从分片读取 │ │ • 发送记录 │ └─────────────────────┘ │ ▼ SeaTunnelRow │ ▼ 转换链(可选) │ ▼ SeaTunnelRow │ ▼ ┌─────────────────────┐ │ SinkWriter │ (工作节点侧) │ • 缓冲记录 │ │ • 准备提交 │ └─────────────────────┘ │ (CommitInfo) ▼ ┌─────────────────────┐ │ SinkCommitter │ (协调器) │ • 提交变更 │ └─────────────────────┘ │ ▼ 数据 Sink ``` ### 4.2 基于分片的并行度 - 数据源被划分为**分片**(如文件块、数据库分区、Kafka 分区) - 每个 **SourceReader** 独立处理一个或多个分片 - 动态分片分配实现负载均衡和故障恢复 - 分片状态被检查点化以实现精确一次处理 ### 4.3 管道执行 作业被划分为**管道**(SubPlan): ``` 管道 1: [数据 Source A] → [转换 1] → [数据 Sink A] ↓ 管道 2: [数据 Source B] ───────→ [转换 2] → [数据 Sink B] ``` 每个管道: - 具有独立的并行度配置 - 维护自己的检查点协调器 - 可以并发或顺序执行 ## 5. 作业执行流程 ### 5.1 提交阶段 ```mermaid sequenceDiagram participant Client as 客户端 participant CoordinatorService as 协调服务 participant JobMaster as 作业主控 participant ResourceManager as 资源管理器 Client->>CoordinatorService: 提交作业配置 CoordinatorService->>CoordinatorService: 解析配置 → LogicalDag CoordinatorService->>JobMaster: 创建 JobMaster JobMaster->>JobMaster: 生成物理计划 JobMaster->>ResourceManager: 请求资源 ResourceManager->>JobMaster: 分配槽位 JobMaster->>TaskExecutionService: 部署任务 ``` ### 5.2 执行阶段 1. **任务初始化** - 将任务部署到分配的槽位 - 初始化数据 Source/转换/数据 Sink 组件 - 从检查点恢复状态(如果在恢复中) 2. **数据处理** - SourceReader 从分片拉取数据 - 数据流经转换链 - SinkWriter 缓冲和写入数据 3. **检查点协调** - CheckpointCoordinator 触发检查点 - 检查点屏障流经数据管道 - 任务快照其状态 - 协调器收集确认 4. **提交阶段** - SinkWriter 准备提交信息 - SinkCommitter 协调提交 - 状态持久化到检查点存储 ### 5.3 状态机 **任务状态转换**: ``` CREATED → INIT → WAITING_RESTORE → READY_START → STARTING → RUNNING ↓ FAILED ← ─────────────────────── → PREPARE_CLOSE → CLOSED ↓ CANCELED ``` **作业状态转换**: ``` CREATED → SCHEDULED → RUNNING → FINISHED ↓ ↓ FAILED CANCELING → CANCELED ``` ## 6. 
关键特性 ### 6.1 容错 **检查点机制**: - 受 Chandy-Lamport 算法启发的分布式快照 - 检查点屏障在数据流中传播 - 状态存储在可插拔的检查点存储中(HDFS、S3、本地) - 从最新成功的检查点自动恢复 **故障转移策略**: - 任务级故障转移:重启失败的任务和相关管道 - 基于区域的故障转移:最小化对未受影响任务的影响 - 分片重新分配:失败的分片重新分配给健康的工作节点 ### 6.2 精确一次语义 **两阶段提交协议**: 1. **准备阶段**:SinkWriter 在检查点期间准备提交信息 2. **提交阶段**:SinkCommitter 在检查点完成后提交 3. **中止处理**:在提交前失败时回滚 **幂等性**:SinkCommitter 操作必须是幂等的以处理重试 ### 6.3 动态资源管理 - **基于槽位的分配**:细粒度的资源管理 - **基于标签的过滤**:将任务分配到特定的工作节点组 - **负载均衡**:多种策略(随机、槽位比率、系统负载) - **动态扩缩容**:无需重启作业即可添加/移除工作节点(未来特性) ### 6.4 模式演化 - **DDL 传播**:从数据源捕获模式变更(ADD/DROP/MODIFY 列) - **模式映射**:通过管道转换模式变更 - **动态应用**:将模式变更应用到数据 Sink 表 - **兼容性检查**:在应用前验证模式变更 ### 6.5 多表支持 - **单作业多表**:在一个作业中同步数百个表 - **表路由**:根据 TablePath 将记录路由到正确的数据 Sink - **独立模式**:每个表维护自己的模式 - **副本支持**:每个表多个写入器副本以获得更高吞吐量 ## 7. 模块结构 ``` seatunnel/ ├── seatunnel-api/ # 核心 API 定义 │ ├── source/ # 数据源 API │ ├── sink/ # 数据 Sink API │ ├── transform/ # 转换 API │ └── table/ # 表和模式 API │ ├── seatunnel-connectors-v2/ # 连接器实现 │ ├── connector-jdbc/ # JDBC 连接器 │ ├── connector-kafka/ # Kafka 连接器 │ ├── connector-cdc/ # CDC 连接器集合 │ │ ├── connector-cdc-mysql/ # MySQL CDC 连接器 │ └── ... # 更多连接器 │ ├── seatunnel-transforms-v2/ # 转换实现 │ ├── src/ # Transform 实现源码(如:SQL、Filter 等) │ └── ... │ ├── seatunnel-engine/ # SeaTunnel Engine (Zeta) │ ├── seatunnel-engine-core/ # 核心执行逻辑 │ ├── seatunnel-engine-server/ # 服务器组件(主节点/工作节点) │ └── seatunnel-engine-storage/ # 检查点存储 │ ├── seatunnel-translation/ # 引擎转换层 │ ├── seatunnel-translation-flink/ │ └── seatunnel-translation-spark/ │ ├── seatunnel-formats/ # 数据格式处理器 │ ├── seatunnel-format-json/ │ ├── seatunnel-format-avro/ │ └── ... │ ├── seatunnel-core/ # 作业提交和 CLI └── seatunnel-e2e/ # 端到端测试 ``` ## 8. 
设计原则 ### 8.1 关注点分离 - **API vs 实现**:清晰的 API 边界支持多种实现 - **协调 vs 执行**:枚举器/提交器(主节点)与读取器/写入器(工作节点)分离 - **逻辑 vs 物理**:LogicalDag(用户意图)与 PhysicalPlan(执行细节)分离 ### 8.2 插件架构 - **基于 SPI 的发现**:连接器通过 Java SPI 动态加载 - **类加载器隔离**:每个连接器使用隔离的类加载器 - **热插拔**:无需重新构建核心即可添加连接器 ### 8.3 引擎独立性 - **统一 API**:相同的连接器代码在任何引擎上运行 - **转换层**:将 API 适配到引擎特定细节 - **无引擎泄漏**:连接器开发人员无需了解引擎知识 ### 8.4 可扩展性 - **水平扩展**:添加工作节点以提高吞吐量 - **基于分片的并行度**:细粒度并行处理 - **无状态工作节点**:工作节点可以动态添加/移除 ### 8.5 可靠性 - **分布式检查点**:跨分布式任务的一致性快照 - **增量状态**:优化大状态的检查点大小 - **精确一次保证**:端到端一致性 ## 9. 下一步 深入了解特定架构组件: - [设计理念](design-philosophy.md) - 核心设计原则和权衡 - [数据 Source 架构](api-design/source-architecture.md) - 数据源 API 设计深入探讨 - [数据 Sink 架构](api-design/sink-architecture.md) - 数据 Sink API 设计深入探讨 - [引擎架构](engine/engine-architecture.md) - SeaTunnel Engine 内部机制 - [检查点机制](fault-tolerance/checkpoint-mechanism.md) - 容错实现 实践指南: - [如何创建您的连接器](../developer/how-to-create-your-connector.md) - [快速入门](../getting-started/locally/quick-start-seatunnel-engine.md) ## 10. 参考资料 ### 10.1 相关概念 - [Apache Flink](https://flink.apache.org/) - 检查点和状态管理的灵感来源 - [Apache Kafka](https://kafka.apache.org/) - 消费者组模型影响了分片分配 - [Chandy-Lamport 算法](https://en.wikipedia.org/wiki/Chandy-Lamport_algorithm) - 分布式快照算法 ================================================ FILE: docs/zh/connectors/changelog/connector-activemq.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-aerospike.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-amazondynamodb.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-amazonsqs.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-assert.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][API] Add metadata schema into catalog table (#9586)|https://github.com/apache/seatunnel/commit/385814e7f1|2.3.12| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[improve] add assert options (#8620)|https://github.com/apache/seatunnel/commit/b159cc0c75|2.3.10| |[Feature][API] Support timestamp with timezone offset (#8367)|https://github.com/apache/seatunnel/commit/e18bfeabd2|2.3.9| |[fix][connector-v2][connector-assert] Optimize Assert Sink verification method (#8356)|https://github.com/apache/seatunnel/commit/5c9159d7cd|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Feature][Transform-V2] Support transform with multi-table (#7628)|https://github.com/apache/seatunnel/commit/72c9c4576d|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Fix][API] Fix column length can not be long (#8039)|https://github.com/apache/seatunnel/commit/16cf632d3e|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Connector-V2] Assert support multi-table check (#7687)|https://github.com/apache/seatunnel/commit/c4778a2497|2.3.8| |[Feature][Transform] Add embedding transform (#7534)|https://github.com/apache/seatunnel/commit/3310cfcd34|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Hotfix] fix http source can 
not read yyyy-MM-dd HH:mm:ss format bug & Improve DateTime Utils (#6601)|https://github.com/apache/seatunnel/commit/19888e7969|2.3.5| |[Feature][Connector-V2][Assert] Support field type assert and field value equality assert for full data types (#6275)|https://github.com/apache/seatunnel/commit/576919bfab|2.3.4| |[Feature][Connector-V2][Assert] Support check the precision and scale of Decimal type. (#6110)|https://github.com/apache/seatunnel/commit/dd64ed52d4|2.3.4| |[Hotfix][SQL Transform] Fix cast to timestamp, date, time bug (#5812)|https://github.com/apache/seatunnel/commit/de181de02a|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve] Add default implement for `SeaTunnelSink::setTypeInfo` (#5682)|https://github.com/apache/seatunnel/commit/86cba87450|2.3.4| |[Fix] Fix log error when multi-table sink close (#5683)|https://github.com/apache/seatunnel/commit/fea4b6f268|2.3.4| |Support config tableIdentifier for schema (#5628)|https://github.com/apache/seatunnel/commit/652921fb75|2.3.4| |[Feature] Add `table-names` from FakeSource/Assert to produce/assert multi-table (#5604)|https://github.com/apache/seatunnel/commit/2c67cd8f3e|2.3.4| |[Improve] Remove useless ReadonlyConfig flatten feature (#5612)|https://github.com/apache/seatunnel/commit/243edfef3d|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve][connector-assert]support 'DECIMAL' type and fix 'Number' type precision issue (#5479)|https://github.com/apache/seatunnel/commit/d308e27733|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Feature][Transform] Add SimpleSQL transform plugin (#4148)|https://github.com/apache/seatunnel/commit/b914d49abf|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Assert] Unified exception for assert connector (#3331)|https://github.com/apache/seatunnel/commit/e74c9bc6fd|2.3.0| |[improve][connector] The Factory#factoryIdentifier must be consistent with PluginIdentifierInterface#getPluginName (#3328)|https://github.com/apache/seatunnel/commit/d9519d696a|2.3.0| |[Improve][Connector-V2] Add Clickhouse and Assert Source/Sink Factory (#3306)|https://github.com/apache/seatunnel/commit/9e4a128381|2.3.0| |[Feature][Connector-v2] improve assert sink connector (#2844)|https://github.com/apache/seatunnel/commit/967fec0e93|2.3.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[improve][UT] Upgrade junit to 5.+ (#2305)|https://github.com/apache/seatunnel/commit/362319ff3e|2.2.0-beta| |[checkstyle] Improved validation scope of MagicNumber (#2194)|https://github.com/apache/seatunnel/commit/6d08b5f369|2.2.0-beta| |[API-DRAFT] [MERGE] update license and pom.xml|https://github.com/apache/seatunnel/commit/5ae8865b7c|2.2.0-beta| |add assert sink to Api draft (#2071)|https://github.com/apache/seatunnel/commit/fc640b52bd|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-cassandra.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-cdc-base.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][MySQL CDC] MySQL cdc support start by time (#9735)|https://github.com/apache/seatunnel/commit/b6c5d941b0|2.3.12| |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[Improve][API] Add metadata schema into catalog table (#9586)|https://github.com/apache/seatunnel/commit/385814e7f1|2.3.12| |[Feature][Connectors-v2] Optimize the size of CDC JAR Files (#9546)|https://github.com/apache/seatunnel/commit/1dd19c6823|2.3.12| |[Fix][Connector-V2] Update catalog table schema of debezium json (#9525)|https://github.com/apache/seatunnel/commit/10cb84435b|2.3.12| |[Improve][Oracle-CDC] Fix oracle rename ddl event missing column type (#9314)|https://github.com/apache/seatunnel/commit/11a23af64c|2.3.11| |[Fix][JDBC] fix jdbc default connection parameter invalid (#8185)|https://github.com/apache/seatunnel/commit/f85eb78b37|2.3.11| |[Improve][CDC] Extract duplicate code (#8906)|https://github.com/apache/seatunnel/commit/b922bb90e6|2.3.10| |[Improve][CDC] Filter heartbeat event (#8569)|https://github.com/apache/seatunnel/commit/1870653393|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Fix][MySQL-CDC]fix recovery task failure caused by binlog deletion (#8587)|https://github.com/apache/seatunnel/commit/087087e592|2.3.10| |[Feature] [Postgre CDC]support array type (#8560)|https://github.com/apache/seatunnel/commit/021af147cc|2.3.10| |[Feature][MySQL-CDC] Support database/table wildcards scan read (#8323)|https://github.com/apache/seatunnel/commit/2116843ce8|2.3.9| |[Feature][CDC] Add 'schema-changes.enabled' options (#8285)|https://github.com/apache/seatunnel/commit/8e29ecf54f|2.3.9| |Revert "[Feature][Redis] Flush data when the time reaches checkpoint interval" and "[Feature][CDC] Add 'schema-changes.enabled' options" 
(#8278)|https://github.com/apache/seatunnel/commit/fcb2938286|2.3.9| |[Feature][CDC] Add 'schema-changes.enabled' options (#8252)|https://github.com/apache/seatunnel/commit/d783f9447c|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][Connector-V2] Add pre-check for table enable cdc (#8152)|https://github.com/apache/seatunnel/commit/9a5da78176|2.3.9| |[Feature][Connector-V2]Jdbc chunk split add snapshotSplitColumn config #7794 (#7840)|https://github.com/apache/seatunnel/commit/b6c6dc0438|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Feature][Transform-v2] Add metadata transform (#7899)|https://github.com/apache/seatunnel/commit/699d16552a|2.3.9| |[Feature][Connector-v2] Support schema evolution for Oracle connector (#7908)|https://github.com/apache/seatunnel/commit/79406bcc2f|2.3.9| |[Fix][Connector-V2] Fix cdc use default value when value is null (#7950)|https://github.com/apache/seatunnel/commit/3b432125ae|2.3.9| |[Hotfix][CDC] Fix occasional database connection leak when read snapshot split (#7918)|https://github.com/apache/seatunnel/commit/a8d0d4ce77|2.3.9| |[Fix][Connector-V2] Fix some throwable error not be caught (#7657)|https://github.com/apache/seatunnel/commit/e19d73282e|2.3.8| |[Improve][Connector-V2] Close all ResultSet after used (#7389)|https://github.com/apache/seatunnel/commit/853e973212|2.3.8| |[Feature][Connector-V2] Support jdbc hana catalog and type convertor (#6950)|https://github.com/apache/seatunnel/commit/d663398739|2.3.6| |[Fix][Connector-V2][CDC] SeaTunnelRowDebeziumDeserializationConverters NPE (#7119)|https://github.com/apache/seatunnel/commit/ae81879213|2.3.6| |[Improve][Connector-V2] Support schema evolution for mysql-cdc and mysql-jdbc (#6929)|https://github.com/apache/seatunnel/commit/cf91e51fc7|2.3.6| |[Hotfix][CDC] Fix split schema change stream 
(#7003)|https://github.com/apache/seatunnel/commit/0c3044e3f6|2.3.6| |[Improve][CDC] Bump the version of debezium to 1.9.8.Final (#6740)|https://github.com/apache/seatunnel/commit/c3ac953524|2.3.6| |[Improve][CDC] Close idle subtasks gorup(reader/writer) in increment phase (#6526)|https://github.com/apache/seatunnel/commit/454c339b9c|2.3.6| |[Improve][JDBC Source] Fix Split can not be cancel (#6825)|https://github.com/apache/seatunnel/commit/ee3b7c3723|2.3.6| |[Hotfix][Postgres-CDC/OpenGauss-CDC] Fix read data missing when restore (#6785)|https://github.com/apache/seatunnel/commit/67c32607e7|2.3.6| |[Hotfix][Jdbc/CDC] Fix postgresql uuid type in jdbc read (#6684)|https://github.com/apache/seatunnel/commit/868ba4d7c7|2.3.6| |[Chore] remove useless interface (#6746)|https://github.com/apache/seatunnel/commit/3c1aeb3785|2.3.6| |[Feature] Support listening for message delayed events in cdc source (#6634)|https://github.com/apache/seatunnel/commit/01159ec923|2.3.5| |[Improve][CDC] Optimize split state memory allocation in increment phase (#6554)|https://github.com/apache/seatunnel/commit/fe33422161|2.3.5| |[Improve][CDC] Improve read performance when record not contains schema field (#6571)|https://github.com/apache/seatunnel/commit/e60beb28ec|2.3.5| |[Feature][Core] Support event listener for job (#6419)|https://github.com/apache/seatunnel/commit/831d0022eb|2.3.5| |[Improve][CDC] Optimize memory allocation for snapshot split reading (#6281)|https://github.com/apache/seatunnel/commit/4856645837|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Feature] Supports iceberg sink #6198 (#6265)|https://github.com/apache/seatunnel/commit/18d3e86194|2.3.5| |[Bugfix][cdc base] Fix negative values in CDCRecordEmitDelay metric (#6259)|https://github.com/apache/seatunnel/commit/68978dbb4e|2.3.4| |[BugFix][CDC Base] Fix added columns cannot be parsed after job restore 
(#6118)|https://github.com/apache/seatunnel/commit/0c593a39e3|2.3.4| |[Feature][JDBC、CDC] Support Short and Byte Type in spliter (#6027)|https://github.com/apache/seatunnel/commit/6f8d0a5040|2.3.4| |[Improve][CDC] Disable exactly_once by default to improve stability (#6244)|https://github.com/apache/seatunnel/commit/f47495554b|2.3.4| |[Bugfix][JDBC、CDC] Fix Spliter Error in Case of Extensive Duplicate Data (#6026)|https://github.com/apache/seatunnel/commit/635c24e8b2|2.3.4| | [Feature][Connector-V2][Postgres-cdc]Support for Postgres cdc (#5986)|https://github.com/apache/seatunnel/commit/97438b9402|2.3.4| |[Bugfix][CDC Base] Fix NPE caused by adding a table for restore job (#6145)|https://github.com/apache/seatunnel/commit/8d3f8e4627|2.3.4| |[Feature][CDC] Support custom table primary key (#6106)|https://github.com/apache/seatunnel/commit/1312a1dd27|2.3.4| |[Bugfix][CDC base] Fix CDC job cannot consume incremental data After restore run (#625) (#6094)|https://github.com/apache/seatunnel/commit/37567ebb7e|2.3.4| |[Feature][CDC] Support read no primary key table (#6098)|https://github.com/apache/seatunnel/commit/b42d78de3f|2.3.4| |[Improve][CDC] Disable memory buffering when `exactly_once` is turned off (#6017)|https://github.com/apache/seatunnel/commit/300a624c5b|2.3.4| |[Improve][Zeta] Remove assert key words (#5947)|https://github.com/apache/seatunnel/commit/dcb4549109|2.3.4| |[Bug][CDC] Fix state recovery error when switching a single table to multiple tables (#5784)|https://github.com/apache/seatunnel/commit/37fcff347e|2.3.4| |[Fix] Fix MultiTableSink restore failed when add new table (#5746)|https://github.com/apache/seatunnel/commit/21503bd771|2.3.4| |[improve][mysql-cdc] Optimize the default value range of mysql server-id to reduce conflicts. 
(#5550)|https://github.com/apache/seatunnel/commit/5174639463|2.3.4| |[Improve] Add default implement for `SeaTunnelSource::getProducedType` (#5670)|https://github.com/apache/seatunnel/commit/a04add6991|2.3.4| |[Improve][Pom] Add junit4 to the root pom (#5611)|https://github.com/apache/seatunnel/commit/7b4f7db2a2|2.3.4| |[Hotfix][CDC] Fix thread-unsafe collection container in cdc enumerator (#5614)|https://github.com/apache/seatunnel/commit/b2f70fd40b|2.3.4| |[Improve][CDC] Use Source to output the CatalogTable (#5626)|https://github.com/apache/seatunnel/commit/3e6a20acfa|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Fix]: fix the cdc bug about NPE when the original table deletes a field (#5579)|https://github.com/apache/seatunnel/commit/f5ed47795d|2.3.4| |[Improve] Refactor CatalogTable and add `SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[Feature][CDC] Support for preferring numeric fields as split keys (#5384)|https://github.com/apache/seatunnel/commit/c687050d88|2.3.4| |[Feature][Connector-V2][CDC] Support flink running cdc job (#4918)|https://github.com/apache/seatunnel/commit/5e378831ee|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Imporve] [CDC Base] Add a fast sampling method that supports character types (#5179)|https://github.com/apache/seatunnel/commit/c0422dbfeb|2.3.3| |[Bugfix][cdc] Fix mysql bit column to java byte (#4817)|https://github.com/apache/seatunnel/commit/aae3e913d0|2.3.3| |[Feature][CDC][Zeta] Support schema evolution framework(DDL) (#5125)|https://github.com/apache/seatunnel/commit/4f89c1d272|2.3.3| |[Improve][CDC] support exactly-once of cdc and fix the BinlogOffset comparing bug (#5057)|https://github.com/apache/seatunnel/commit/0e4190ab2e|2.3.3| |[Hotfix][MongodbCDC]Refine data format to adapt to universal logic (#5162)|https://github.com/apache/seatunnel/commit/4b4b5f9640|2.3.3| |[Feature][Connector-V2][CDC] Support string type shard fields. (#5147)|https://github.com/apache/seatunnel/commit/e1be9d7f8a|2.3.3| |[Feature][CDC] Support tables without primary keys (with unique keys) (#163) (#5150)|https://github.com/apache/seatunnel/commit/32b7f2b690|2.3.3| |[Feature][connector-v2][mongodbcdc]Support source mongodb cdc (#4923)|https://github.com/apache/seatunnel/commit/d729fcba4c|2.3.3| |[Chore] Modify repeat des (#5088)|https://github.com/apache/seatunnel/commit/936afc2a9e|2.3.3| |[Feature][Connector-V2][cdc] Change the time zone to the default time zone (#5030)|https://github.com/apache/seatunnel/commit/3cff923a79|2.3.3| |[Bugfix][zeta] Fix cdc connection does not close (#4922)|https://github.com/apache/seatunnel/commit/a2d2f2dda8|2.3.3| |[Feature][CDC] Support disable/enable exactly once for INITIAL (#4921)|https://github.com/apache/seatunnel/commit/6d9a3e5957|2.3.3| |[Improve] Documentation and partial word optimization. 
(#4936)|https://github.com/apache/seatunnel/commit/6e8de0e2a6|2.3.3| |[Bugfix][zeta] Fix the deadlock issue with JDBC driver loading (#4878)|https://github.com/apache/seatunnel/commit/c30a2a1b1c|2.3.2| |[improve][CDC base] Implement Sample-based Sharding Strategy with Configurable Sampling Rate (#4856)|https://github.com/apache/seatunnel/commit/d827c700f0|2.3.2| |[Bugfix][CDC Base] Solving the ConcurrentModificationException caused by snapshotState being modified concurrently. (#4877)|https://github.com/apache/seatunnel/commit/9a2efa51c7|2.3.2| |[Hotfix][CDC] Fix chunk start/end parameter type error (#4777)|https://github.com/apache/seatunnel/commit/c13c031995|2.3.2| |[Bug][CDC] Fix TemporalConversions (#4542)|https://github.com/apache/seatunnel/commit/d2094bf2e1|2.3.2| |[Feature][CDC][SqlServer] Support multi-table read (#4377)|https://github.com/apache/seatunnel/commit/c4e3f2dc03|2.3.2| |[Improve][CDC] Improve startup.mode/stop.mode options (#4360)|https://github.com/apache/seatunnel/commit/b71d8739d5|2.3.1| |[Improve][CDC] Optimize options & add docs for compatible_debezium_json (#4351)|https://github.com/apache/seatunnel/commit/336f590498|2.3.1| |Update CDC StartupMode and StopMode option to SingleChoiceOption (#4357)|https://github.com/apache/seatunnel/commit/f60ac1a5e9|2.3.1| |[bugfix][cdc-base] Fix cdc base shutdown thread not cleared (#4327)|https://github.com/apache/seatunnel/commit/ac61409bd8|2.3.1| |[Feature][CDC] Support export debezium-json format to kafka (#4339)|https://github.com/apache/seatunnel/commit/5817ec07bf|2.3.1| |[Feature][CDC] Support add & dorp tables when restore cdc jobs (#4254)|https://github.com/apache/seatunnel/commit/add75d7d5d|2.3.1| |[Feature][CDC][Mysql] Support read database list (#4255)|https://github.com/apache/seatunnel/commit/3ca60c6fed|2.3.1| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[Improve] Support MySqlCatalog Use JDBC URL With Custom 
Suffix|https://github.com/apache/seatunnel/commit/210d0ff1f8|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Hotfix][Zeta] Fix shuffle checkpoint (#4224)|https://github.com/apache/seatunnel/commit/507ca85611|2.3.1| |[improve][cdc] support sharding-tables (#4207)|https://github.com/apache/seatunnel/commit/5c3f0c9b00|2.3.1| |[Hotfix][CDC] Fix multiple-table data read (#4200)|https://github.com/apache/seatunnel/commit/7f5671d2ce|2.3.1| |[hotfix][zeta] fix zeta multi-table parser error (#4193)|https://github.com/apache/seatunnel/commit/98f2ad0c19|2.3.1| |[Feature][Zeta] Support shuffle multiple rows by tableId (#4147)|https://github.com/apache/seatunnel/commit/8348f1a108|2.3.1| |[Feature][API] Add Metrics for Connector-V2 (#4017)|https://github.com/apache/seatunnel/commit/32e1f91c7a|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][CDC] MySQL CDC supports deserialization of multi-tables (#4067)|https://github.com/apache/seatunnel/commit/21ef45fcca|2.3.1| |fix cdc option rule error (#4018)|https://github.com/apache/seatunnel/commit/ea160429df|2.3.1| |[Bug][CDC] Fix concurrent modify of splits (#3937)|https://github.com/apache/seatunnel/commit/29b04e2405|2.3.1| |[Improve][CDC][base] Guaranteed to be exactly-once in the process of switching from SnapshotTask to IncrementalTask (#3837)|https://github.com/apache/seatunnel/commit/8379aaf876|2.3.1| |[Hotfix][SqlServer CDC] fix SqlServerCDC IT failure (#3807)|https://github.com/apache/seatunnel/commit/fd66de5f98|2.3.1| |[Improve][CDC] Add mysql-cdc source factory (#3791)|https://github.com/apache/seatunnel/commit/356538de8a|2.3.1| |[feature][connector-v2] add sqlServer CDC (#3686)|https://github.com/apache/seatunnel/commit/0f0afb58af|2.3.0| |[doc][connector][cdc] add MySQL CDC Source doc (#3707)|https://github.com/apache/seatunnel/commit/555905b0b8|2.3.0| |[feature][cdc] Fixed error in mysql cdc under real-time job (#3666)|https://github.com/apache/seatunnel/commit/2238fda300|2.3.0| |[feature][connector][cdc] add SeaTunnelRowDebeziumDeserializeSchema (#3499)|https://github.com/apache/seatunnel/commit/ff44db116e|2.3.0| |[feature][connector][mysql-cdc] add MySQL CDC enumerator (#3481)|https://github.com/apache/seatunnel/commit/ff4b32dc28|2.3.0| |[feature][connector] add mysql cdc reader (#3455)|https://github.com/apache/seatunnel/commit/ae981df675|2.3.0| |[feature][connector][cdc] add cdc reader jdbc related (#3433)|https://github.com/apache/seatunnel/commit/7bf00fb19f|2.3.0| |[feature][connector][cdc] add CDC enumerator base classes (#3419)|https://github.com/apache/seatunnel/commit/9b1821f476|2.3.0| |[feature][Connector-v2][cdc] Add cdc base reader (#3407)|https://github.com/apache/seatunnel/commit/e454b80dcd|2.3.0| |[bigfix][Connector-v2][cdc] move version to 1.6.4 
(#3389)|https://github.com/apache/seatunnel/commit/b50b543c3e|2.3.0| |[feature][connector][cdc] CDC base classes (#3363)|https://github.com/apache/seatunnel/commit/2586f305b4|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-cdc-mongodb.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-cdc-mysql.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-cdc-opengauss.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-cdc-oracle.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-cdc-postgres.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-cdc-sqlserver.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-cdc-tidb.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-cdc.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][MySQL CDC] MySQL cdc support start by time (#9735)|https://github.com/apache/seatunnel/commit/b6c5d941b0|2.3.12| |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[Feature][Connectors-v2] Support Mysql8.4+ for mysql-cdc (#9720)|https://github.com/apache/seatunnel/commit/e338743927|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Improve][API] Add metadata schema into catalog table (#9586)|https://github.com/apache/seatunnel/commit/385814e7f1|2.3.12| |[improve] jdbc options (#9541)|https://github.com/apache/seatunnel/commit/d041e5fb32|2.3.12| |[Feature][Connectors-v2] Optimize the size of CDC JAR Files (#9546)|https://github.com/apache/seatunnel/commit/1dd19c6823|2.3.12| |[Fix][Connector-V2] Update catalog table schema of debezium json (#9525)|https://github.com/apache/seatunnel/commit/10cb84435b|2.3.12| |[Fix][Mongo-CDC] Fix the issue where mongo isExactlyOnce defaults to true, causing room to malfunction (#9454)|https://github.com/apache/seatunnel/commit/814b19537c|2.3.12| |[Fix][Connector-V2] Correct typo in batch-size-per-scan option key (#9434)|https://github.com/apache/seatunnel/commit/6cf258127f|2.3.12| |[Fix][Connector-V2] Oracle cdc not update transaction commit when LOB enabled (#9412)|https://github.com/apache/seatunnel/commit/2a25bae6f6|2.3.12| |[Feature][Connector-V2] Jdbc mysql support read tinyint(1) to byte(tinyint) (#9373)|https://github.com/apache/seatunnel/commit/7b87aa6f12|2.3.12| |[Improve][Oracle-CDC] Remove duplicate load table names (#9357)|https://github.com/apache/seatunnel/commit/90e88cafc5|2.3.12| |[Improve][Oracle-CDC] Fix oracle rename ddl event missing column type (#9314)|https://github.com/apache/seatunnel/commit/11a23af64c|2.3.11| 
|[Feature][Connector-JDBC] Supprot read Oracle BLOB data as string instead of bytes (#9305)|https://github.com/apache/seatunnel/commit/454a88f81a|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][Connector-V2] Fix postgres cdc with debezium_json format can not parse number without scale (#9052)|https://github.com/apache/seatunnel/commit/29cf3a76c7|2.3.11| |[Fix][JDBC] fix jdbc default connection parameter invalid (#8185)|https://github.com/apache/seatunnel/commit/f85eb78b37|2.3.11| |[Fix] [Mongo-cdc] Fallback to timestamp startup mode when resume token has expired (#8754)|https://github.com/apache/seatunnel/commit/afc990d84e|2.3.10| |[Improve][CDC] Filter ddl for snapshot phase (#8911)|https://github.com/apache/seatunnel/commit/641cc72f2f|2.3.10| |[Improve][Oracle-CDC] Support ReadOnlyLogWriterFlushStrategy (#8912)|https://github.com/apache/seatunnel/commit/6aebdc0384|2.3.10| |[Improve][CDC] Extract duplicate code (#8906)|https://github.com/apache/seatunnel/commit/b922bb90e6|2.3.10| |[Improve][CDC] Filter heartbeat event (#8569)|https://github.com/apache/seatunnel/commit/1870653393|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Fix][MySQL-CDC]fix recovery task failure caused by binlog deletion (#8587)|https://github.com/apache/seatunnel/commit/087087e592|2.3.10| |[Fix][mysql-cdc] Fix GTIDs on startup to correctly recover from checkpoint (#8528)|https://github.com/apache/seatunnel/commit/82e4096c08|2.3.10| |[Feature] [Postgre CDC]support array type (#8560)|https://github.com/apache/seatunnel/commit/021af147cc|2.3.10| |[Feature][MySQL-CDC] Support database/table wildcards scan read (#8323)|https://github.com/apache/seatunnel/commit/2116843ce8|2.3.9| |[hotfix] [connector-cdc-oracle ] support read partition table 
(#8265)|https://github.com/apache/seatunnel/commit/91b86b2faf|2.3.9| |[Feature][Jdbc] Support sink ddl for postgresql (#8276)|https://github.com/apache/seatunnel/commit/353bbd21a1|2.3.9| |[Improve][E2E] improve oracle e2e (#8292)|https://github.com/apache/seatunnel/commit/9f761b9d32|2.3.9| |[Feature][CDC] Add 'schema-changes.enabled' options (#8285)|https://github.com/apache/seatunnel/commit/8e29ecf54f|2.3.9| |Revert "[Feature][Redis] Flush data when the time reaches checkpoint interval" and "[Feature][CDC] Add 'schema-changes.enabled' options" (#8278)|https://github.com/apache/seatunnel/commit/fcb2938286|2.3.9| |[Feature][CDC] Add 'schema-changes.enabled' options (#8252)|https://github.com/apache/seatunnel/commit/d783f9447c|2.3.9| |[Feature][Mongodb-CDC] Support multi-table read (#8029)|https://github.com/apache/seatunnel/commit/49cbaeb9b3|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][Connector-V2] Add pre-check for table enable cdc (#8152)|https://github.com/apache/seatunnel/commit/9a5da78176|2.3.9| |[Improve][Connector-V2] Fix SqlServer cdc memory leak (#8083)|https://github.com/apache/seatunnel/commit/69cd4ae1a2|2.3.9| |[Feature][Connector-V2]Jdbc chunk split add snapshotSplitColumn config #7794 (#7840)|https://github.com/apache/seatunnel/commit/b6c6dc0438|2.3.9| |[Bug][connectors-v2] fix mongodb bson convert exception (#8044)|https://github.com/apache/seatunnel/commit/b222c13f2f|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Feature][Transform-v2] Add metadata transform (#7899)|https://github.com/apache/seatunnel/commit/699d16552a|2.3.9| |[Feature][Connector-v2] Support schema evolution for Oracle connector (#7908)|https://github.com/apache/seatunnel/commit/79406bcc2f|2.3.9| |[Bug][Connector-v2] MongoDB CDC Set SeatunnelRow's tableId 
(#7935)|https://github.com/apache/seatunnel/commit/f3970d6188|2.3.9| |[Fix][Connector-V2] Fix cdc use default value when value is null (#7950)|https://github.com/apache/seatunnel/commit/3b432125ae|2.3.9| |[Hotfix][CDC] Fix occasional database connection leak when read snapshot split (#7918)|https://github.com/apache/seatunnel/commit/a8d0d4ce77|2.3.9| |[Improve][PostgreSQL CDC]-PostgresSourceOptions description error (#7813)|https://github.com/apache/seatunnel/commit/57f47c2064|2.3.9| |[Feature][Connector-V2] SqlServer support user-defined type (#7706)|https://github.com/apache/seatunnel/commit/fb89033273|2.3.8| |[Improve][Connector-V2] Optimize sqlserver package structure (#7715)|https://github.com/apache/seatunnel/commit/9720f118e5|2.3.8| |[Hotfix][CDC] Fix ddl duplicate execution error when config multi_table_sink_replica (#7634)|https://github.com/apache/seatunnel/commit/23ab3edbbb|2.3.8| |[Fix][Connector-V2] Fix some throwable error not be caught (#7657)|https://github.com/apache/seatunnel/commit/e19d73282e|2.3.8| |[Feature] Support tidb cdc connector source #7199 (#7477)|https://github.com/apache/seatunnel/commit/87ec786bd6|2.3.8| |[Feature][Connector-V2] Support opengauss-cdc (#7433)|https://github.com/apache/seatunnel/commit/81b73515a7|2.3.8| |[Improve][Connector-V2] Close all ResultSet after used (#7389)|https://github.com/apache/seatunnel/commit/853e973212|2.3.8| |[Hotfix][CDC] Fix package name spelling mistake (#7415)|https://github.com/apache/seatunnel/commit/469112fa64|2.3.8| |[Hotfix][MySQL-CDC] Fix ArrayIndexOutOfBoundsException in mysql binlog read (#7381)|https://github.com/apache/seatunnel/commit/40c5f313eb|2.3.7| |[Improve][Connector-v2] Optimize the count table rows for jdbc-oracle and oracle-cdc (#7248)|https://github.com/apache/seatunnel/commit/0d08b20061|2.3.6| |[Feature][Connector-V2] Support jdbc hana catalog and type convertor (#6950)|https://github.com/apache/seatunnel/commit/d663398739|2.3.6| |[Fix][Connector-V2][CDC] 
SeaTunnelRowDebeziumDeserializationConverters NPE (#7119)|https://github.com/apache/seatunnel/commit/ae81879213|2.3.6| |[Improve][Connector-V2] Support schema evolution for mysql-cdc and mysql-jdbc (#6929)|https://github.com/apache/seatunnel/commit/cf91e51fc7|2.3.6| |[Hotfix][MySQL-CDC] Fix read gbk varchar chinese garbled characters (#7046)|https://github.com/apache/seatunnel/commit/4e4d2b8ee5|2.3.6| |[Hotfix][CDC] Fix split schema change stream (#7003)|https://github.com/apache/seatunnel/commit/0c3044e3f6|2.3.6| |[Improve][CDC] Bump the version of debezium to 1.9.8.Final (#6740)|https://github.com/apache/seatunnel/commit/c3ac953524|2.3.6| |[Improve][CDC] Close idle subtasks gorup(reader/writer) in increment phase (#6526)|https://github.com/apache/seatunnel/commit/454c339b9c|2.3.6| |[Improve][JDBC Source] Fix Split can not be cancel (#6825)|https://github.com/apache/seatunnel/commit/ee3b7c3723|2.3.6| |[Hotfix][Postgres-CDC/OpenGauss-CDC] Fix read data missing when restore (#6785)|https://github.com/apache/seatunnel/commit/67c32607e7|2.3.6| |[Improve] Add conditional of start.mode with timestamp in mongo cdc option rule (#6770)|https://github.com/apache/seatunnel/commit/65ae7782c9|2.3.6| |[Fix] Fix ConnectorSpecificationCheckTest failed (#6828)|https://github.com/apache/seatunnel/commit/52d1020eb7|2.3.6| |[Hotfix][Jdbc/CDC] Fix postgresql uuid type in jdbc read (#6684)|https://github.com/apache/seatunnel/commit/868ba4d7c7|2.3.6| |[Chore] remove useless interface (#6746)|https://github.com/apache/seatunnel/commit/3c1aeb3785|2.3.6| |[Improve][mysql-cdc] Support mysql 5.5 versions (#6710)|https://github.com/apache/seatunnel/commit/058f5594a3|2.3.6| |[Improve] Improve read table schema in cdc connector (#6702)|https://github.com/apache/seatunnel/commit/a8c6cc6e0c|2.3.6| |[Improve][mysql-cdc] Fallback to desc table when show create table failed (#6701)|https://github.com/apache/seatunnel/commit/6f74663c08|2.3.6| |[Improve][Jdbc] Add quote identifier for sql 
(#6669)|https://github.com/apache/seatunnel/commit/849d748d3d|2.3.5| |[Feature] Support listening for message delayed events in cdc source (#6634)|https://github.com/apache/seatunnel/commit/01159ec923|2.3.5| |[Improve][CDC] Optimize split state memory allocation in increment phase (#6554)|https://github.com/apache/seatunnel/commit/fe33422161|2.3.5| |[Improve][CDC] Improve read performance when record not contains schema field (#6571)|https://github.com/apache/seatunnel/commit/e60beb28ec|2.3.5| |[Feature][Core] Support event listener for job (#6419)|https://github.com/apache/seatunnel/commit/831d0022eb|2.3.5| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Improve][CDC-Connector]Fix CDC option rule. (#6454)|https://github.com/apache/seatunnel/commit/1ea27afa87|2.3.5| |[Improve][CDC] Optimize memory allocation for snapshot split reading (#6281)|https://github.com/apache/seatunnel/commit/4856645837|2.3.5| |[Fix][Connector-V2] Fix mongodb cdc start up mode option values not right (#6338)|https://github.com/apache/seatunnel/commit/c07f56fbc4|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Feature] Supports iceberg sink #6198 (#6265)|https://github.com/apache/seatunnel/commit/18d3e86194|2.3.5| |[Feature][Connector]update pgsql-cdc publication for add table (#6309)|https://github.com/apache/seatunnel/commit/2ad7d65236|2.3.5| |[Fix][Oracle-CDC] Fix invalid split key when no primary key (#6251)|https://github.com/apache/seatunnel/commit/b83c40a6f6|2.3.4| |[Bugfix][cdc base] Fix negative values in CDCRecordEmitDelay metric (#6259)|https://github.com/apache/seatunnel/commit/68978dbb4e|2.3.4| |[Improve][Postgres-CDC] Fix name typos (#6248)|https://github.com/apache/seatunnel/commit/2462f1c5f7|2.3.4| |[BugFix][CDC Base] Fix added columns cannot be parsed after job restore 
(#6118)|https://github.com/apache/seatunnel/commit/0c593a39e3|2.3.4| |[Feature][JDBC、CDC] Support Short and Byte Type in spliter (#6027)|https://github.com/apache/seatunnel/commit/6f8d0a5040|2.3.4| |[Improve][CDC] Disable exactly_once by default to improve stability (#6244)|https://github.com/apache/seatunnel/commit/f47495554b|2.3.4| |[Improve][Postgres-CDC] Update jdbc fetchsize (#6245)|https://github.com/apache/seatunnel/commit/c25beb9f8a|2.3.4| |[Improve] Support `int identity` type in sql server (#6186)|https://github.com/apache/seatunnel/commit/1a8da1c843|2.3.4| |[Bugfix][JDBC、CDC] Fix Spliter Error in Case of Extensive Duplicate Data (#6026)|https://github.com/apache/seatunnel/commit/635c24e8b2|2.3.4| | [Feature][Connector-V2][Postgres-cdc]Support for Postgres cdc (#5986)|https://github.com/apache/seatunnel/commit/97438b9402|2.3.4| |[Feature][Oracle-CDC] Support custom table primary key (#6216)|https://github.com/apache/seatunnel/commit/ae4240ca6b|2.3.4| |[Improve][Oracle-CDC] Clean unused code (#6212)|https://github.com/apache/seatunnel/commit/919a91032a|2.3.4| |[Hotfix][Oracle-CDC] Fix state recovery error when switching a single table to multiple tables (#6211)|https://github.com/apache/seatunnel/commit/74cfe1995f|2.3.4| |[Hotfix][Oracle-CDC] Fix jdbc setFetchSize error (#6210)|https://github.com/apache/seatunnel/commit/b7f06ec6d9|2.3.4| |[Feature][Oracle-CDC] Support read no primary key table (#6209)|https://github.com/apache/seatunnel/commit/3cb34c2b71|2.3.4| |[Feature][Connector-V2][Oracle-cdc]Support for oracle cdc (#5196)|https://github.com/apache/seatunnel/commit/aaef22b31b|2.3.4| |[Bugfix][CDC Base] Fix NPE caused by adding a table for restore job (#6145)|https://github.com/apache/seatunnel/commit/8d3f8e4627|2.3.4| |[Feature][CDC] Support custom table primary key (#6106)|https://github.com/apache/seatunnel/commit/1312a1dd27|2.3.4| |[Bugfix][CDC base] Fix CDC job cannot consume incremental data After restore run (#625) 
(#6094)|https://github.com/apache/seatunnel/commit/37567ebb7e|2.3.4| |[Feature][CDC] Support read no primary key table (#6098)|https://github.com/apache/seatunnel/commit/b42d78de3f|2.3.4| |[Hotfix][Jdbc] Fix jdbc setFetchSize error (#6005)|https://github.com/apache/seatunnel/commit/d41af8a6ed|2.3.4| |[Improve][CDC] Disable memory buffering when `exactly_once` is turned off (#6017)|https://github.com/apache/seatunnel/commit/300a624c5b|2.3.4| |[Improve][Zeta] Remove assert key words (#5947)|https://github.com/apache/seatunnel/commit/dcb4549109|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Bug][CDC] Fix state recovery error when switching a single table to multiple tables (#5784)|https://github.com/apache/seatunnel/commit/37fcff347e|2.3.4| |[Feature][formats][ogg] Support read ogg format message #4201 (#4225)|https://github.com/apache/seatunnel/commit/7728e241e8|2.3.4| |[Improve][CDC] Clean unused code (#5785)|https://github.com/apache/seatunnel/commit/b5a66d3dbe|2.3.4| |[Fix] Fix MultiTableSink restore failed when add new table (#5746)|https://github.com/apache/seatunnel/commit/21503bd771|2.3.4| |[Improve][Jdbc] Fix database identifier (#5756)|https://github.com/apache/seatunnel/commit/dbfc8a670a|2.3.4| |[improve][mysql-cdc] Optimize the default value range of mysql server-id to reduce conflicts. 
(#5550)|https://github.com/apache/seatunnel/commit/5174639463|2.3.4| |[improve][connector-v2][sqlserver-cdc]Unified sqlserver TypeUtils type conversion mode (#5668)|https://github.com/apache/seatunnel/commit/75b814bc3d|2.3.4| |[Dependency]Bump org.apache.avro:avro (#5583)|https://github.com/apache/seatunnel/commit/bb791a6d9e|2.3.4| |[Improve] Add default implement for `SeaTunnelSource::getProducedType` (#5670)|https://github.com/apache/seatunnel/commit/a04add6991|2.3.4| |[feature][connector-cdc-sqlserver] add dataType datetimeoffset (#5548)|https://github.com/apache/seatunnel/commit/0cf63eed6d|2.3.4| |[Improve] Remove catalog tag for config file (#5645)|https://github.com/apache/seatunnel/commit/dc509aa080|2.3.4| |[Improve][Pom] Add junit4 to the root pom (#5611)|https://github.com/apache/seatunnel/commit/7b4f7db2a2|2.3.4| |[Hotfix][CDC] Fix thread-unsafe collection container in cdc enumerator (#5614)|https://github.com/apache/seatunnel/commit/b2f70fd40b|2.3.4| |[Feature][CDC] Support MongoDB CDC running on flink (#5644)|https://github.com/apache/seatunnel/commit/8c569b1541|2.3.4| |[Improve][CDC] Use Source to output the CatalogTable (#5626)|https://github.com/apache/seatunnel/commit/3e6a20acfa|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Fix]: fix the cdc bug about NPE when the original table deletes a field (#5579)|https://github.com/apache/seatunnel/commit/f5ed47795d|2.3.4| |[Improve] Refactor CatalogTable and add `SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[Feature][CDC] Support for preferring numeric fields as split keys (#5384)|https://github.com/apache/seatunnel/commit/c687050d88|2.3.4| |[Feature][Connector-V2][CDC] Support flink running cdc job (#4918)|https://github.com/apache/seatunnel/commit/5e378831ee|2.3.4| |[Improve][connector-cdc-mysql] avoid listing tables under unnecessary databases 
(#5365)|https://github.com/apache/seatunnel/commit/3e5d018b35|2.3.4| |[Improve][Docs] Refactor MySQL-CDC docs (#5302)|https://github.com/apache/seatunnel/commit/74530a0461|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[BUG][Connector-V2][Mongo-cdc] Incremental data kind error in snapshot phase (#5184)|https://github.com/apache/seatunnel/commit/ead1c5fd8c|2.3.3| |[Imporve] [CDC Base] Add a fast sampling method that supports character types (#5179)|https://github.com/apache/seatunnel/commit/c0422dbfeb|2.3.3| |[Bugfix][cdc] Fix mysql bit column to java byte (#4817)|https://github.com/apache/seatunnel/commit/aae3e913d0|2.3.3| |[Hotfix]Fix array index anomalies caused by #5057 (#5195)|https://github.com/apache/seatunnel/commit/1c33429506|2.3.3| |[Feature][CDC][Zeta] Support schema evolution framework(DDL) (#5125)|https://github.com/apache/seatunnel/commit/4f89c1d272|2.3.3| |[improve] [CDC Base] Add some split parameters to the optionRule (#5161)|https://github.com/apache/seatunnel/commit/94fd6755e6|2.3.3| |[Improve][CDC] support exactly-once of cdc and fix the BinlogOffset comparing bug (#5057)|https://github.com/apache/seatunnel/commit/0e4190ab2e|2.3.3| |[Hotfix][MongodbCDC]Refine data format to adapt to universal logic (#5162)|https://github.com/apache/seatunnel/commit/4b4b5f9640|2.3.3| |[Feature][Connector-V2][CDC] Support string type shard fields. 
(#5147)|https://github.com/apache/seatunnel/commit/e1be9d7f8a|2.3.3| |[Feature][CDC] Support tables without primary keys (with unique keys) (#163) (#5150)|https://github.com/apache/seatunnel/commit/32b7f2b690|2.3.3| |[Hotfix][Mongodb cdc] Solve startup resume token is negative (#5143)|https://github.com/apache/seatunnel/commit/e964c03dca|2.3.3| |[Hotfix]Fix mongodb cdc e2e instability (#5128)|https://github.com/apache/seatunnel/commit/6f30b29662|2.3.3| |[Feature][Connector-V2][mysql cdc] Conversion of tinyint(1) to bool is supported (#5105)|https://github.com/apache/seatunnel/commit/86b1b7e31a|2.3.3| |[Feature][connector-v2][mongodbcdc]Support source mongodb cdc (#4923)|https://github.com/apache/seatunnel/commit/d729fcba4c|2.3.3| |[Chore] Modify repeat des (#5088)|https://github.com/apache/seatunnel/commit/936afc2a9e|2.3.3| |[Bugfix][connector-cdc-mysql] Fix listener not released when BinlogClient reuse (#5011)|https://github.com/apache/seatunnel/commit/3287b1d852|2.3.3| |[Feature][Connector-V2][cdc] Change the time zone to the default time zone (#5030)|https://github.com/apache/seatunnel/commit/3cff923a79|2.3.3| |[BugFix] [Connector-V2] [MySQL-CDC] serverId from int to long (#5033) (#5035)|https://github.com/apache/seatunnel/commit/4abc80e111|2.3.3| |[Bugfix][zeta] Fix cdc connection does not close (#4922)|https://github.com/apache/seatunnel/commit/a2d2f2dda8|2.3.3| |[Hotfix][CDC] Fix jdbc connection leak for mysql (#5037)|https://github.com/apache/seatunnel/commit/738925ba10|2.3.3| |[Feature][CDC] Support disable/enable exactly once for INITIAL (#4921)|https://github.com/apache/seatunnel/commit/6d9a3e5957|2.3.3| |[Improve][CDC]change driver scope to provider (#5002)|https://github.com/apache/seatunnel/commit/745c0b9e92|2.3.3| |[Improve][CDC]Remove driver for cdc connector (#4952)|https://github.com/apache/seatunnel/commit/b65f40c3c9|2.3.3| |[Improve] Documentation and partial word optimization. 
(#4936)|https://github.com/apache/seatunnel/commit/6e8de0e2a6|2.3.3| |[Bugfix][zeta] Fix the deadlock issue with JDBC driver loading (#4878)|https://github.com/apache/seatunnel/commit/c30a2a1b1c|2.3.2| |[improve][CDC base] Implement Sample-based Sharding Strategy with Configurable Sampling Rate (#4856)|https://github.com/apache/seatunnel/commit/d827c700f0|2.3.2| |[Bugfix][CDC Base] Solving the ConcurrentModificationException caused by snapshotState being modified concurrently. (#4877)|https://github.com/apache/seatunnel/commit/9a2efa51c7|2.3.2| |[Hotfix][CDC] Fix chunk start/end parameter type error (#4777)|https://github.com/apache/seatunnel/commit/c13c031995|2.3.2| |[feature][catalog] Support for multiplexing connections (#4550)|https://github.com/apache/seatunnel/commit/41277d7f78|2.3.2| |[BugFix][Mysql-CDC] Fix Time data type is empty when reading from MySQL CDC (#4670)|https://github.com/apache/seatunnel/commit/e4f973daf7|2.3.2| |[Bug][CDC] Fix TemporalConversions (#4542)|https://github.com/apache/seatunnel/commit/d2094bf2e1|2.3.2| |[Feature][CDC][SqlServer] Support multi-table read (#4377)|https://github.com/apache/seatunnel/commit/c4e3f2dc03|2.3.2| |[Improve][CDC] Optimize jdbc fetch-size options (#4352)|https://github.com/apache/seatunnel/commit/fbb60ce1be|2.3.1| |[Improve][CDC] Improve startup.mode/stop.mode options (#4360)|https://github.com/apache/seatunnel/commit/b71d8739d5|2.3.1| |[Improve][CDC] Optimize options & add docs for compatible_debezium_json (#4351)|https://github.com/apache/seatunnel/commit/336f590498|2.3.1| |Update CDC StartupMode and StopMode option to SingleChoiceOption (#4357)|https://github.com/apache/seatunnel/commit/f60ac1a5e9|2.3.1| |[bugfix][cdc-base] Fix cdc base shutdown thread not cleared (#4327)|https://github.com/apache/seatunnel/commit/ac61409bd8|2.3.1| |[Feature][CDC] Support export debezium-json format to kafka (#4339)|https://github.com/apache/seatunnel/commit/5817ec07bf|2.3.1| |[Feature][CDC] Support add & dorp tables when 
restore cdc jobs (#4254)|https://github.com/apache/seatunnel/commit/add75d7d5d|2.3.1| |[Improve][CDC][MySQL] Ennable binlog watermark compare (#4293)|https://github.com/apache/seatunnel/commit/b22fb259c8|2.3.1| |[Feature][CDC][Mysql] Support read database list (#4255)|https://github.com/apache/seatunnel/commit/3ca60c6fed|2.3.1| |Add redshift datatype convertor (#4245)|https://github.com/apache/seatunnel/commit/b19011517f|2.3.1| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[Improve] Support MySqlCatalog Use JDBC URL With Custom Suffix|https://github.com/apache/seatunnel/commit/210d0ff1f8|2.3.1| |[chore] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/291214ad6f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Hotfix][Zeta] Fix shuffle checkpoint (#4224)|https://github.com/apache/seatunnel/commit/507ca85611|2.3.1| |[improve][jdbc] Reduce jdbc options configuration (#4218)|https://github.com/apache/seatunnel/commit/ddd8f808b5|2.3.1| |[improve][cdc] support sharding-tables (#4207)|https://github.com/apache/seatunnel/commit/5c3f0c9b00|2.3.1| |[Hotfix][CDC] Fix multiple-table data read (#4200)|https://github.com/apache/seatunnel/commit/7f5671d2ce|2.3.1| |[hotfix][zeta] fix zeta multi-table parser error (#4193)|https://github.com/apache/seatunnel/commit/98f2ad0c19|2.3.1| |[Feature][Zeta] Support shuffle multiple rows by tableId (#4147)|https://github.com/apache/seatunnel/commit/8348f1a108|2.3.1| |[Feature][API] Add Metrics for Connector-V2 (#4017)|https://github.com/apache/seatunnel/commit/32e1f91c7a|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Feature][CDC] Support batch processing on multiple-table shuffle flow 
(#4116)|https://github.com/apache/seatunnel/commit/919653d83e|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][CDC] MySQL CDC supports deserialization of multi-tables (#4067)|https://github.com/apache/seatunnel/commit/21ef45fcca|2.3.1| |[Improve][Connector-V2][SQLServer-CDC] Add sqlserver cdc optionRule (#4019)|https://github.com/apache/seatunnel/commit/78df503392|2.3.1| |fix cdc option rule error (#4018)|https://github.com/apache/seatunnel/commit/ea160429df|2.3.1| |[Bug][CDC] Fix concurrent modify of splits (#3937)|https://github.com/apache/seatunnel/commit/29b04e2405|2.3.1| |[Improve][CDC][base] Guaranteed to be exactly-once in the process of switching from SnapshotTask to IncrementalTask (#3837)|https://github.com/apache/seatunnel/commit/8379aaf876|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][SqlServer CDC] fix SqlServerCDC IT failure (#3807)|https://github.com/apache/seatunnel/commit/fd66de5f98|2.3.1| |[Improve][CDC] Add mysql-cdc source factory (#3791)|https://github.com/apache/seatunnel/commit/356538de8a|2.3.1| |[feature][connector-v2] add sqlServer CDC (#3686)|https://github.com/apache/seatunnel/commit/0f0afb58af|2.3.0| |[doc][connector][cdc] add MySQL CDC Source doc (#3707)|https://github.com/apache/seatunnel/commit/555905b0b8|2.3.0| |[feature][e2e][cdc] add mysql cdc container (#3667)|https://github.com/apache/seatunnel/commit/7696ba1551|2.3.0| |[feature][cdc] Fixed error in mysql cdc under real-time job (#3666)|https://github.com/apache/seatunnel/commit/2238fda300|2.3.0| |[feature][connector][cdc] add SeaTunnelRowDebeziumDeserializeSchema (#3499)|https://github.com/apache/seatunnel/commit/ff44db116e|2.3.0| 
|[feature][connector][mysql-cdc] add MySQL CDC enumerator (#3481)|https://github.com/apache/seatunnel/commit/ff4b32dc28|2.3.0| |[bugfix][connector-v2] fix cdc mysql reader err (#3465)|https://github.com/apache/seatunnel/commit/1b406b5a31|2.3.0| |[feature][connector] add mysql cdc reader (#3455)|https://github.com/apache/seatunnel/commit/ae981df675|2.3.0| |[feature][connector][cdc] add cdc reader jdbc related (#3433)|https://github.com/apache/seatunnel/commit/7bf00fb19f|2.3.0| |[feature][connector][cdc] add CDC enumerator base classes (#3419)|https://github.com/apache/seatunnel/commit/9b1821f476|2.3.0| |[feature][Connector-v2][cdc] Add cdc base reader (#3407)|https://github.com/apache/seatunnel/commit/e454b80dcd|2.3.0| |[bigfix][Connector-v2][cdc] move version to 1.6.4 (#3389)|https://github.com/apache/seatunnel/commit/b50b543c3e|2.3.0| |[feature][connector][cdc] CDC base classes (#3363)|https://github.com/apache/seatunnel/commit/2586f305b4|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-clickhouse.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][Connector-Clickhouse] improve ck batch parallel read by using last batch row sorting value approach, instead of limit offset. (#9801)|https://github.com/apache/seatunnel/commit/5e9990afd5| dev | |[Feature][Connector-Clickhouse] Support Clickhouse multi table source read (#9704)|https://github.com/apache/seatunnel/commit/6e323743ea|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Fix][Connector-clickhouse] Fix SeaTunnelRow tableId set error (#9585)|https://github.com/apache/seatunnel/commit/01f1caa6fb|2.3.12| |[Improve][connector-clickhouse] Clickhouse support parallelism reading schema (#9446)|https://github.com/apache/seatunnel/commit/3ee0fab3a8|2.3.12| |[Feature][Connector-V2] Support multi-table sink feature for ClickHouse (#9301)|https://github.com/apache/seatunnel/commit/3524895136|2.3.11| |[Fix][Connector-V2] Fix the problem that missing options configuration when building ClickHouse Nodes (#9277)|https://github.com/apache/seatunnel/commit/051d19c3a9|2.3.11| |[Feature][Transform] Support define sink column type (#9114)|https://github.com/apache/seatunnel/commit/ab7119e507|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][API] Fixed not invoke the `SinkAggregatedCommitter`'s init method (#9070)|https://github.com/apache/seatunnel/commit/df0d11d632|2.3.11| |[Fix] [Clickhouse] Parallelism makes data duplicate (#8916)|https://github.com/apache/seatunnel/commit/45345f2738|2.3.10| |[Fix][Connector-V2]Fix Descriptions for CUSTOM_SQL in Connector (#8778)|https://github.com/apache/seatunnel/commit/96b610eb7e|2.3.10| |[improve] update clickhouse connector config option (#8755)|https://github.com/apache/seatunnel/commit/b964189b75|2.3.10| 
|[Fix][Connector-V2] fix starRocks automatically creates tables with comment (#8568)|https://github.com/apache/seatunnel/commit/c4cb1fc4a3|2.3.10| |[Fix][Connector-V2] Fixed adding table comments (#8514)|https://github.com/apache/seatunnel/commit/edca75b0d6|2.3.10| |[hotfix] fix exceptions caused by operator priority in connector-clickhouse when using sharding_key (#8162)|https://github.com/apache/seatunnel/commit/5560e3dab2|2.3.9| |[Imporve][ClickhouseFile] Directly connect to each shard node to obtain the corresponding path (#8449)|https://github.com/apache/seatunnel/commit/757641bada|2.3.9| |[Feature][ClickhouseFile] Support add publicKey to identity (#8351)|https://github.com/apache/seatunnel/commit/287b8c8219|2.3.9| |[Improve][ClickhouseFile] Improve rsync log output (#8332)|https://github.com/apache/seatunnel/commit/179223e3c2|2.3.9| |[Improve][ClickhouseFile] Added attach sql log for better debugging (#8315)|https://github.com/apache/seatunnel/commit/ade428c5fa|2.3.9| |[Chore] delete chinese desc in code (#8306)|https://github.com/apache/seatunnel/commit/a50a8b925f|2.3.9| |[Improve][ClickhouseFile Connector] Unified specifying clickhouse file generation path (#8302)|https://github.com/apache/seatunnel/commit/455f1ed760|2.3.9| |[Improve][ClickhouseFile] Clickhouse supports option configuration when connecting to shard nodes (#8297)|https://github.com/apache/seatunnel/commit/1ded1b6206|2.3.9| |[Imporve][ClickhouseFile] Improve clickhousefile generation parameter configuration (#8293)|https://github.com/apache/seatunnel/commit/753e058fee|2.3.9| |[Improve][ClickhouseFile] ClickhouseFile Connector's rsync transmission supports specifying users (#8236)|https://github.com/apache/seatunnel/commit/e012bd0a4f|2.3.9| |[Feature][Clickhouse] Support sink savemode (#8086)|https://github.com/apache/seatunnel/commit/e6f92fd79b|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Fix][Connecotr-V2] Fix clickhouse 
sink does not support composite primary key (#8021)|https://github.com/apache/seatunnel/commit/24d0542595|2.3.9| |[Improve] update clickhouse connector, use factory to create source/sink (#7946)|https://github.com/apache/seatunnel/commit/b69fceceee|2.3.9| |[Fix][Connector-V2] Fixed clickhouse connectors cannot stop under multiple parallelism (#7921)|https://github.com/apache/seatunnel/commit/8d9c6a3714|2.3.9| |Bump commons-io:commons-io from 2.11.0 to 2.14.0 in /seatunnel-connectors-v2/connector-clickhouse (#7784)|https://github.com/apache/seatunnel/commit/f4393a02bf|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Improve some connectors prepare check error message (#7465)|https://github.com/apache/seatunnel/commit/6930a25edd|2.3.8| |[Improve][Connector-V2] Close all ResultSet after used (#7389)|https://github.com/apache/seatunnel/commit/853e973212|2.3.8| |[Feature][Connector-V2][Clickhouse] Add clickhouse.config to the source connector (#7143)|https://github.com/apache/seatunnel/commit/f7994d9ae9|2.3.6| |[Improve] Make ClickhouseFileSinker support tables containing materialized columns (#6956)|https://github.com/apache/seatunnel/commit/87c6adcc2e|2.3.6| |[Improve] [Clickhouse] Remove check when set allow_experimental_lightweight_delete false(#6727) (#6728)|https://github.com/apache/seatunnel/commit/b25e1b1ae5|2.3.6| |[Improve][Common] Adapt `FILE_OPERATION_FAILED` to `CommonError` (#5928)|https://github.com/apache/seatunnel/commit/b3dc0bbc21|2.3.4| |[Improve][Connector-V2] Replace CommonErrorCodeDeprecated.JSON_OPERATION_FAILED (#5978)|https://github.com/apache/seatunnel/commit/456cd17714|2.3.4| |[Feature][Core] Upgrade flink source translation (#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[Improve] Speed up ClickhouseFile Local generate a mmap object (#5822)|https://github.com/apache/seatunnel/commit/cf39e29dad|2.3.4| 
|[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Hotfix][connector-v2][clickhouse] Fixed an out-of-order BUG with output data fields of clickhouse-sink (#5346)|https://github.com/apache/seatunnel/commit/fce9ddaa2b|2.3.4| |[Bugfix][Clickhouse] Fix clickhouse sink flush bug (#5448)|https://github.com/apache/seatunnel/commit/cef03f6673|2.3.4| |[Hotfix][Clickhouse] Fix clickhouse old version compatibility (#5326)|https://github.com/apache/seatunnel/commit/1da49f5a2b|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Feature][Connector-V2][Clickhouse] Add clickhouse connector time zone key,default system time zone (#5078)|https://github.com/apache/seatunnel/commit/309b58d12d|2.3.3| |[Bugfix]fix clickhouse source connector read Nullable() type is not null,example:Nullable(Float64) while value is null the result is 0.0 (#5080)|https://github.com/apache/seatunnel/commit/cf3d0bba2e|2.3.3| |[Feature][Connector-V2][Clickhouse] clickhouse writes with checkpoints (#4999)|https://github.com/apache/seatunnel/commit/f8fefa1e57|2.3.3| |[Hotfix][Connector-V2][ClickhouseFile] Fix ClickhouseFile write file failed when field value is null (#4937)|https://github.com/apache/seatunnel/commit/06671474ca|2.3.3| |[Hotfix][connector-clickhouse] fix get clickhouse local table name with closing bracket from distributed table engineFull (#4710)|https://github.com/apache/seatunnel/commit/e5e0cba26d|2.3.2| |[Bug] [Connector-V2] Clickhouse File Connector failed to sink to table with settings like storage_policy 
(#4172)|https://github.com/apache/seatunnel/commit/e120dc44bc|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Bug] [Connector-V2] Clickhouse File Connector not support split mode for write data to all shards of distributed table (#4035)|https://github.com/apache/seatunnel/commit/3f1dcfc915|2.3.1| |[Hotfix][Connector-V2] Fix connector source snapshot state NPE (#4027)|https://github.com/apache/seatunnel/commit/e39c4988cc|2.3.1| |[Hotfix][Connector-v2][Clickhouse] Fix clickhouse write cdc changelog update event (#3951)|https://github.com/apache/seatunnel/commit/67e6027970|2.3.1| |[Feature][shade][Jackson] Add seatunnel-jackson module (#3947)|https://github.com/apache/seatunnel/commit/5d8862ec9c|2.3.1| |[Improve][Connector-V2][Clickhouse] Improve performance (#3910)|https://github.com/apache/seatunnel/commit/aeceb855f6|2.3.1| |[Improve] [Connector-V2] Remove Clickhouse Fields Config (#3826)|https://github.com/apache/seatunnel/commit/74704c362a|2.3.1| |[Improve][Connector-V2][clickhouse] Special characters in column names are supported (#3881)|https://github.com/apache/seatunnel/commit/9069609c17|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve] [Connector-V2] Change Connector Custom Config Prefix To Map (#3719)|https://github.com/apache/seatunnel/commit/ef1b8b1bb5|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Bug] [Connector-V2] Fix ClickhouseFile Committer Serializable Problems (#3803)|https://github.com/apache/seatunnel/commit/1b26192cb3|2.3.1| |[feature][connector-v2][clickhouse] Support write cdc changelog event in clickhouse sink 
(#3653)|https://github.com/apache/seatunnel/commit/6093c213bf|2.3.0| |[Connector-V2] [Clickhouse] Improve Clickhouse File Connector (#3416)|https://github.com/apache/seatunnel/commit/e07e9a7cc2|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Clickhouse] Unified exception for Clickhouse source & sink connector (#3563)|https://github.com/apache/seatunnel/commit/04e1743d9e|2.3.0| |options in conditional need add to required or optional options (#3501)|https://github.com/apache/seatunnel/commit/51d5bcba10|2.3.0| |[Feature][Connector-V2][Clickhouse]Optimize clickhouse connector data type inject (#3471)|https://github.com/apache/seatunnel/commit/9bd0fc8ee2|2.3.0| |[improve][connector-v2][clickhouse] Fix DoubleInjectFunction (#3441)|https://github.com/apache/seatunnel/commit/9781a6a385|2.3.0| |[feature][api] add option validation for the ReadonlyConfig (#3417)|https://github.com/apache/seatunnel/commit/4f824fea36|2.3.0| |[improve][connector] The Factory#factoryIdentifier must be consistent with PluginIdentifierInterface#getPluginName (#3328)|https://github.com/apache/seatunnel/commit/d9519d696a|2.3.0| |[Improve][Connector-V2] Add Clickhouse and Assert Source/Sink Factory (#3306)|https://github.com/apache/seatunnel/commit/9e4a128381|2.3.0| |[Improve][Clickhouse-V2] Clickhouse Support Geo type (#3141)|https://github.com/apache/seatunnel/commit/01cdc4e336|2.3.0| |[Improve][Connector-V2][Clickhouse] Support nest type and array (#3047)|https://github.com/apache/seatunnel/commit/97b5727ec6|2.3.0| |[Feature][Connector-V2-Clickhouse] Clickhouse Source random use host when config multi-host (#3108)|https://github.com/apache/seatunnel/commit/c9583b7f63|2.3.0-beta| |[Improve] [Clickhouse-V2] Clickhouse Support Int128,Int256 Type (#3067)|https://github.com/apache/seatunnel/commit/e118ccea0a|2.3.0-beta| |[Improve][all] change Log to @Slf4j 
(#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Connector-V2] [Clickhouse] Fix Clickhouse Type Mapping and Spark Map reconvert Bug (#2767)|https://github.com/apache/seatunnel/commit/f0a1f5013a|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Feature][Connector-V1 & V2] Support unauthorized ClickHouse (#2393)|https://github.com/apache/seatunnel/commit/0e4e2b1230|2.2.0-beta| |[Feature][connector] clickhousefile sink connector support non-root username for fileTransfer (#2263)|https://github.com/apache/seatunnel/commit/704661f1fd|2.2.0-beta| |StateT of SeaTunnelSource should extend `Serializable` (#2214)|https://github.com/apache/seatunnel/commit/8c426ef850|2.2.0-beta| |[Bug] [connector-v2] When outputting data to clickhouse, a ClassCastException was encountered (#2160)|https://github.com/apache/seatunnel/commit/a3a2b5d189|2.2.0-beta| |[API-DRAFT] [MERGE] fix merge error|https://github.com/apache/seatunnel/commit/736ac01c89|2.2.0-beta| |merge dev to api-draft|https://github.com/apache/seatunnel/commit/d265597c64|2.2.0-beta| |[api-draft][connector] support Rsync to transfer clickhouse data file (#2080)|https://github.com/apache/seatunnel/commit/02a41902a8|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-cloudberry.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-common.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-console.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] console sink options (#8743)|https://github.com/apache/seatunnel/commit/c439b99f19|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add UT class name check (#8182)|https://github.com/apache/seatunnel/commit/9cf4192fe4|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Core] Add event notify for all connector (#7501)|https://github.com/apache/seatunnel/commit/d71337b0e9|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |Update ConsoleSinkFactory.java (#7350)|https://github.com/apache/seatunnel/commit/921662722f|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Feature][Core] Support event listener for job (#6419)|https://github.com/apache/seatunnel/commit/831d0022eb|2.3.5| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve] Add default implement for `SeaTunnelSink::setTypeInfo` (#5682)|https://github.com/apache/seatunnel/commit/86cba87450|2.3.4| |[Feature] Support multi-table sink (#5620)|https://github.com/apache/seatunnel/commit/81ac173189|2.3.4| |[Improve] Refactor CatalogTable and add `SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[Feature] [api env] Add job-level configuration for checkpoint timeout. 
(#5222)|https://github.com/apache/seatunnel/commit/3c13275ed9|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Feature][CDC][Zeta] Support schema evolution framework(DDL) (#5125)|https://github.com/apache/seatunnel/commit/4f89c1d272|2.3.3| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[hotfix][zeta] fix zeta multi-table parser error (#4193)|https://github.com/apache/seatunnel/commit/98f2ad0c19|2.3.1| |[Feature][Zeta] Support shuffle multiple rows by tableId (#4147)|https://github.com/apache/seatunnel/commit/8348f1a108|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2]console sink output content to slf4j log (#3745)|https://github.com/apache/seatunnel/commit/82a5c852d8|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[improve][connector] The Factory#factoryIdentifier must be consistent with PluginIdentifierInterface#getPluginName (#3328)|https://github.com/apache/seatunnel/commit/d9519d696a|2.3.0| |[Improve][Connector-V2][Console] Add Console option rule (#3322)|https://github.com/apache/seatunnel/commit/efb4711600|2.3.0| |[Improve][connector][console] print subtask index (#3000)|https://github.com/apache/seatunnel/commit/de345783d9|2.3.0-beta| |[Bug][Connector-V2] Fix the bug that can not print SeaTunnelRow correctly (#2749)|https://github.com/apache/seatunnel/commit/9365d35200|2.2.0-beta| |[Feature][Connector-V2] Add iceberg source connector 
(#2615)|https://github.com/apache/seatunnel/commit/ffc6088a79|2.2.0-beta| |[Bug][ConsoleSinkV2]fix fieldToString StackOverflow and add Unit-Test (#2545)|https://github.com/apache/seatunnel/commit/6f87094569|2.2.0-beta| |[Improve][Console] improve console to printf schema and deepToString fields (#2517)|https://github.com/apache/seatunnel/commit/963387d375|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-databend.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-datahub.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Connector-V2] Make some sink parameters optional for DataHub (#9229)|https://github.com/apache/seatunnel/commit/7418fae10c|2.3.11| |[Feature][Connector-V2] Datahub support multi-table sink (#9212)|https://github.com/apache/seatunnel/commit/7027162dec|2.3.11| |[improve] datahub sink options (#8744)|https://github.com/apache/seatunnel/commit/88f35bd705|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][DataHub] Unified exception for DataHub sink connector & change package name of DataHub (#3446)|https://github.com/apache/seatunnel/commit/395635fa18|2.3.0| |[improve][connector] The Factory#factoryIdentifier must be consistent with PluginIdentifierInterface#getPluginName (#3328)|https://github.com/apache/seatunnel/commit/d9519d696a|2.3.0| |[Improve][Connector-V2][DataHub] Add DataHub Sink Factory (#3323)|https://github.com/apache/seatunnel/commit/685978d061|2.3.0| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Feature][Connector-V2]Support datahub sink (#2558)|https://github.com/apache/seatunnel/commit/43600a7049|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-dingtalk.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] dingtalk sink options (#8742)|https://github.com/apache/seatunnel/commit/f2145dcc4f|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][DingTalk] Unified exception for dingtalk sink connector (#3678)|https://github.com/apache/seatunnel/commit/0a09562515|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[improve][connector] The Factory#factoryIdentifier must be consistent with PluginIdentifierInterface#getPluginName (#3328)|https://github.com/apache/seatunnel/commit/d9519d696a|2.3.0| |[Improve][Connector-V2][DingTalk] Add DingTalk Sink Factory (#3324)|https://github.com/apache/seatunnel/commit/56be228ad2|2.3.0| |[Improve][build] Improved scope of maven-shade-plugin (#2665)|https://github.com/apache/seatunnel/commit/93bc8bd116|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Feature][Connector-V2] Add Dingtalk Sink #2257 (#2285)|https://github.com/apache/seatunnel/commit/88a26d5a29|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-doris.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-druid.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-easysearch.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-elasticsearch.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Transform-V2] Support vector series sql function (#9765)|https://github.com/apache/seatunnel/commit/a40114cf7a|2.3.12| |[Feature][elasticsearch-connector] Add API key authentication support (#9610)|https://github.com/apache/seatunnel/commit/a2bfe1a530|2.3.12| |[Feature][Connectors-V2][Elasticsearch] Support vector transformation sink (#9330)|https://github.com/apache/seatunnel/commit/a1ce97155f|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Feature][connector-elasticsearch] elasticsearch source support PIT (#9150)|https://github.com/apache/seatunnel/commit/948d588d06|2.3.11| |[Bugfix][Elasticsearch] Fix add column event (#9069)|https://github.com/apache/seatunnel/commit/3455316981|2.3.11| |[Feature][elasticsearch-connector] support elasticsearch sql source (#8895)|https://github.com/apache/seatunnel/commit/8140862795|2.3.10| |[Fix] Fix error log name for SourceSplitEnumerator implements class (#8817)|https://github.com/apache/seatunnel/commit/55ed90ecaf|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[improve] add Elasticsearch options (#8623)|https://github.com/apache/seatunnel/commit/d307ab44f2|2.3.10| |[Fix][connector-elasticsearch] support elasticsearch nest type && spark with Array<map> (#8492)|https://github.com/apache/seatunnel/commit/92d2a4a106|2.3.10| |Revert "[Feature][connector-elasticsearch] elasticsearch support nested type (#8462)" (#8485)|https://github.com/apache/seatunnel/commit/c68944893a|2.3.9| |[Feature][connector-elasticsearch] elasticsearch support nested type (#8462)|https://github.com/apache/seatunnel/commit/eaa15e4c8d|2.3.9| |[Feature][Elasticsearch] Support sink ddl (#8412)|https://github.com/apache/seatunnel/commit/a4a38ccff2|2.3.9| 
|[hotfix][connector-elasticsearch-sink] Convert index to lowercase (#8429)|https://github.com/apache/seatunnel/commit/46fcb237c8|2.3.9| |[Improve][Elasticsearch] Truncate the exception message body for request errors (#8263)|https://github.com/apache/seatunnel/commit/b9d850e61c|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connector-V2] Fix known directory create and delete ignore issues (#7700)|https://github.com/apache/seatunnel/commit/e2fb679577|2.3.8| |[Feature][Elastic search] Support multi-table source feature (#7502)|https://github.com/apache/seatunnel/commit/29fbeb2547|2.3.8| |[Hotfix][Connector-V2] Fix null not inserted in es (#7493)|https://github.com/apache/seatunnel/commit/a4ba6a171c|2.3.8| |[Improve][API] Move catalog open to SaveModeHandler (#7439)|https://github.com/apache/seatunnel/commit/8c2c5c79a1|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Fix][Connector-V2][Elasticsearch]Fix sink configuration for DROP_DATA (#7124)|https://github.com/apache/seatunnel/commit/bb9fd516ec|2.3.6| |[Feature][Elasticsearch] Support multi-table sink write #7041 (#7052)|https://github.com/apache/seatunnel/commit/45653e1d22|2.3.6| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Fix][Connector-V2] Remove Some Incorrect Comments and Properties in ElasticsearchCommitInfo|https://github.com/apache/seatunnel/commit/720298775a|2.3.6| |[Bug][Improve][Connector-v2][ElasticsearchSource] Fix behavior when source empty,Support 
SourceConfig.SOURCE field empty. (#6425)|https://github.com/apache/seatunnel/commit/4e98eb8639|2.3.6| |[Improve][Connector-V2] Add ElasticSearch type converter (#6546)|https://github.com/apache/seatunnel/commit/505c1252bd|2.3.5| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Improve] Add SaveMode log of process detail (#6375)|https://github.com/apache/seatunnel/commit/b0d70ce224|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Improve] Implement ElasticSearch connector factory (#6181)|https://github.com/apache/seatunnel/commit/1fd854de67|2.3.4| |[Feature][Connector] add elasticsearch save_mode (#6046)|https://github.com/apache/seatunnel/commit/716a36ac3e|2.3.4| |[Improve][Connector-V2] Replace CommonErrorCodeDeprecated.JSON_OPERATION_FAILED (#5978)|https://github.com/apache/seatunnel/commit/456cd17714|2.3.4| |[Feature] Add unsupported datatype check for all catalog (#5890)|https://github.com/apache/seatunnel/commit/b9791285a0|2.3.4| |[BUG][Connector-V2] Fixed conversion exception of elasticsearch array format (#5825)|https://github.com/apache/seatunnel/commit/64f19f25d9|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][Connector] Add field name to `DataTypeConvertor` to improve error message (#5782)|https://github.com/apache/seatunnel/commit/ab60790f0d|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Chore] Update the es version in the docs. (#4499)|https://github.com/apache/seatunnel/commit/415150635c|2.3.2| |[Improve][ElasticsearchSink]remove useless code. (#4500)|https://github.com/apache/seatunnel/commit/ef44c0d44a|2.3.2| |[Hotfix][Connector-V2][ES] Source deserializer error and inappropriate (#4233)|https://github.com/apache/seatunnel/commit/15530d2785|2.3.2| |[Feature][Connector-V2][ES] Support dsl filter (#4130)|https://github.com/apache/seatunnel/commit/79ca878338|2.3.1| |[Bug][Connector-V2][ES]Fix es field type not support binary(#4240) (#4274)|https://github.com/apache/seatunnel/commit/84f10f2016|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |Shade google common in hadoop (#4222)|https://github.com/apache/seatunnel/commit/5376905075|2.3.1| |Set es text type to string (#4192)|https://github.com/apache/seatunnel/commit/473971b94b|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |Support ES catalog get field mapping (#4167)|https://github.com/apache/seatunnel/commit/72f2418713|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Bug][Connector-V2][ES]Fix es source no data (#4076)|https://github.com/apache/seatunnel/commit/a573b8dbed|2.3.1| |Add convertor factory (#4119)|https://github.com/apache/seatunnel/commit/cbdea45d95|2.3.1| |Add ElasticSearch catalog (#4108)|https://github.com/apache/seatunnel/commit/9ee4d8394c|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][Elasticsearch] Support https protocol (#3997)|https://github.com/apache/seatunnel/commit/79b5cdd9c2|2.3.1| |[Feature][shade][Jackson] Add seatunnel-jackson module (#3947)|https://github.com/apache/seatunnel/commit/5d8862ec9c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[hotfix][connector-v2][elasticsearch] Fix bulk refresh operation not locked (#3738)|https://github.com/apache/seatunnel/commit/b6cab90d2f|2.3.0| |[feature][connector-v2][elasticsearch] Support write cdc changelog event in elasticsearch sink (#3673)|https://github.com/apache/seatunnel/commit/3ec47c6848|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][ElasticSearch] Unified exception for ElasticSearch source & sink connector (#3569)|https://github.com/apache/seatunnel/commit/b73944d1dc|2.3.0| |[Improve] [Connector-V2] Bad smell ToArrayCallWithZeroLengthArrayArgument: (#3577)|https://github.com/apache/seatunnel/commit/cc448d98c4|2.3.0| |[Improve][Connector-V2][ElasticSearch] Improve es bulk sink retriable mechanism (#3148)|https://github.com/apache/seatunnel/commit/02ef38eb7a|2.3.0| |[Connector-V2] [E2E] Add missed ElasticSearch E2E module. 
(#3338)|https://github.com/apache/seatunnel/commit/b2dad4d472|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Feature][Connector-V2][Elasticsearch] Support Elasticsearch source (#2821)|https://github.com/apache/seatunnel/commit/ded5481d98|2.3.0| |update (#3149)|https://github.com/apache/seatunnel/commit/59abe4ad62|2.3.0| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Connector-V2] [ElasticSearch] Fix ElasticSearch Connector V2 Bug (#2817)|https://github.com/apache/seatunnel/commit/2fcbbf464a|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Feature][Connector-V2] new connecotor of Elasticsearch sink(#2326) (#2330)|https://github.com/apache/seatunnel/commit/2a1fd5027f|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-email.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] email connector options (#8983)|https://github.com/apache/seatunnel/commit/7821e824dd|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][Transform] Rename sql transform table name from 'fake' to 'dual' (#8298)|https://github.com/apache/seatunnel/commit/e6169684fb|2.3.9| |[Feature][Core] Rename `result_table_name`/`source_table_name` to `plugin_input/plugin_output` (#8072)|https://github.com/apache/seatunnel/commit/c7bbd322db|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector-V2]Support multi-table sink feature for email (#7368)|https://github.com/apache/seatunnel/commit/c880b7aa4d|2.3.8| |[Improve][Common] Adapt `FILE_OPERATION_FAILED` to `CommonError` (#5928)|https://github.com/apache/seatunnel/commit/b3dc0bbc21|2.3.4| |[Feature][Engine] Unify job env parameters (#6003)|https://github.com/apache/seatunnel/commit/2410ab38f0|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Email] Unified exception for email connector (#3898)|https://github.com/apache/seatunnel/commit/829261e1a6|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Connector][Email] Add Email Sink Factory (#3326)|https://github.com/apache/seatunnel/commit/0645d11180|2.3.0| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Connector-V2] Add Email sink connector (#2304)|https://github.com/apache/seatunnel/commit/96f2a15e4d|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-fake.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Transform-V2] Support vector series sql function (#9765)|https://github.com/apache/seatunnel/commit/a40114cf7a|2.3.12| |[Feature][Connectors-v2] Support auto-increment id for FakeSource (#9505)|https://github.com/apache/seatunnel/commit/3a16b4a4b5|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[improve] fake source options (#8950)|https://github.com/apache/seatunnel/commit/f8c47fb5f4|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][API] Support timestamp with timezone offset (#8367)|https://github.com/apache/seatunnel/commit/e18bfeabd2|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Feature][Core] Rename `result_table_name`/`source_table_name` to `plugin_input/plugin_output` (#8072)|https://github.com/apache/seatunnel/commit/c7bbd322db|2.3.9| |[Improve][Fake] Improve memory usage when split size is large (#7821)|https://github.com/apache/seatunnel/commit/2d41b024c7|2.3.9| |[Improve][Connector-V2] Time supports default value (#7639)|https://github.com/apache/seatunnel/commit/33978689f5|2.3.8| |[Improve][Connector-V2] Fake supports column configuration (#7503)|https://github.com/apache/seatunnel/commit/39162a4e0b|2.3.8| |[Feature][Core] Add event notify for all connector (#7501)|https://github.com/apache/seatunnel/commit/d71337b0e9|2.3.8| |[Improve][Connector-V2] update vectorType (#7446)|https://github.com/apache/seatunnel/commit/1bba72385b|2.3.8| |[Feature][Connector-V2] Fake Source support produce vector data (#7401)|https://github.com/apache/seatunnel/commit/6937d10ac3|2.3.8| |[Feature][Kafka] 
Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Feature][Core] Support event listener for job (#6419)|https://github.com/apache/seatunnel/commit/831d0022eb|2.3.5| |[Fix][FakeSource] fix random from template not include the latest value issue (#6438)|https://github.com/apache/seatunnel/commit/6ec16ac46f|2.3.5| |[Improve][Catalog] Use default tablepath when can not get the tablepath from source config (#6276)|https://github.com/apache/seatunnel/commit/f8158bb805|2.3.4| |[Improve][Connector-V2] Replace CommonErrorCodeDeprecated.JSON_OPERATION_FAILED (#5978)|https://github.com/apache/seatunnel/commit/456cd17714|2.3.4| |FakeSource support generate different CatalogTable for MultipleTable (#5766)|https://github.com/apache/seatunnel/commit/a8b93805ea|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Add default implement for `SeaTunnelSource::getProducedType` (#5670)|https://github.com/apache/seatunnel/commit/a04add6991|2.3.4| |Support config tableIdentifier for schema (#5628)|https://github.com/apache/seatunnel/commit/652921fb75|2.3.4| |[Feature] Add `table-names` from FakeSource/Assert to produce/assert multi-table (#5604)|https://github.com/apache/seatunnel/commit/2c67cd8f3e|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[chore] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/291214ad6f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-fake] Optimizing Data Generation Strategies refer to #4004 (#4061)|https://github.com/apache/seatunnel/commit/c7c596a6dc|2.3.1| |[Improve][Connector-V2][Fake] Improve fake connector (#3932)|https://github.com/apache/seatunnel/commit/31f12431d9|2.3.1| |[Feature][Connector-v2][StarRocks] Support write cdc changelog event(INSERT/UPDATE/DELETE) (#3865)|https://github.com/apache/seatunnel/commit/8e3d158c03|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Fake] Unified exception for fake source connector (#3520)|https://github.com/apache/seatunnel/commit/f371ad5825|2.3.0| |[Connector-V2] [Fake] Add Fake TableSourceFactory (#3345)|https://github.com/apache/seatunnel/commit/74b61c33a0|2.3.0| |[Connector-V2] [ElasticSearch] Add 
ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Improve] [Engine] Improve Engine performance. (#3216)|https://github.com/apache/seatunnel/commit/7393c47327|2.3.0| |[hotfix][connector][fake] fix FakeSourceSplitEnumerator assigning duplicate splits when restoring (#3112)|https://github.com/apache/seatunnel/commit/98b1feda85|2.3.0-beta| |[improve][connector][fake] supports setting the number of split rows and reading interval (#3098)|https://github.com/apache/seatunnel/commit/efabe6af7f|2.3.0-beta| |[feature][connector][fake] Support mutil splits for fake source connector (#2974)|https://github.com/apache/seatunnel/commit/c28c44b7c9|2.3.0-beta| |[E2E][ST-Engine] Add test data consistency in 3 node cluster and fix bug (#3038)|https://github.com/apache/seatunnel/commit/97400a6f13|2.3.0-beta| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Improve][Connector-V2] Improve fake source connector (#2944)|https://github.com/apache/seatunnel/commit/044f62ef32|2.3.0-beta| |[Improve][Connector-v2-Fake]Supports direct definition of data values(row) (#2839)|https://github.com/apache/seatunnel/commit/b7d9dde6c8|2.3.0-beta| |[Connector-V2] [ElasticSearch] Fix ElasticSearch Connector V2 Bug (#2817)|https://github.com/apache/seatunnel/commit/2fcbbf464a|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Bug] [connector-fake] Fake date calculation error(#2573)|https://github.com/apache/seatunnel/commit/9ea01298f1|2.2.0-beta| |[Bug][ConsoleSinkV2]fix fieldToString StackOverflow and add Unit-Test (#2545)|https://github.com/apache/seatunnel/commit/6f87094569|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Imporve][Fake-Connector-V2]support 
user-defined-schmea and random data for fake-table (#2406)|https://github.com/apache/seatunnel/commit/a5447528c3|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-file-base-hadoop.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-file-base.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-file-cos.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-file-ftp.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[Improve][Connector-V2] Add remote host verification option for FTP data channels (#9324)|https://github.com/apache/seatunnel/commit/019d69d10a|2.3.11| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Improve][Connector-V2] Ensure that the FTP connector behaves reliably during directory operation (#8959)|https://github.com/apache/seatunnel/commit/b5f0b43fcb|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Improve][Connector-V2] Add some debug log when create dir in (S)FTP (#8286)|https://github.com/apache/seatunnel/commit/8687bb8e91|2.3.9| |[Feature][File] Support config null 
format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Fix][Connector-V2][FTP] Fix FTP connector connection_mode is not effective (#7865)|https://github.com/apache/seatunnel/commit/26c528a5ed|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Connector-V2]Ftp file source support multiple table (#7795)|https://github.com/apache/seatunnel/commit/22fe27a3d6|2.3.9| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Feature][Connector-V2] Ftp file sink suport multiple table and save mode (#7665)|https://github.com/apache/seatunnel/commit/4f812e12ae|2.3.8| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |[Feature][Connector-V2] Supports the transfer of any file (#6826)|https://github.com/apache/seatunnel/commit/c1401787b3|2.3.6| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. 
(#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[Feature][Connectors-v2-file-ftp] FTP source/sink add ftp connection mode (#6077) (#6099)|https://github.com/apache/seatunnel/commit/f6bcc4d59d|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |[Feature][Connector-V2][File] Support read empty directory (#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 (#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. 
(#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][File] Unified excetion for file source & sink connectors (#3525)|https://github.com/apache/seatunnel/commit/031e8e263c|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector-V2][File] Improve code structure (#3238)|https://github.com/apache/seatunnel/commit/dd5c353881|2.3.0| |[Connector-V2] [ElasticSearch] Add 
ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Core] [Improve] Fix some sonar check error (#3240)|https://github.com/apache/seatunnel/commit/8664bb53a5|2.3.0| |[Improve][Connector-V2][File] Support parse field from file path (#2985)|https://github.com/apache/seatunnel/commit/0bc12085c2|2.3.0-beta| |[Improve][connector][file] Support user-defined schema for reading text file (#2976)|https://github.com/apache/seatunnel/commit/1c05ee0d7e|2.3.0-beta| |[Improve][Connector] Improve write parquet (#2943)|https://github.com/apache/seatunnel/commit/8fd966394b|2.3.0-beta| |[Fix][Connector-V2] Fix HiveSource Connector read orc table error (#2845)|https://github.com/apache/seatunnel/commit/61720306e7|2.2.0-beta| |[Improve][Connector-V2] Improve read parquet (#2841)|https://github.com/apache/seatunnel/commit/e19bc82f9b|2.2.0-beta| |[Imporve][Connector-V2] Refactor ftp sink & Add ftp file source (#2774)|https://github.com/apache/seatunnel/commit/4aacbcdd1f|2.2.0-beta| |[Feature][File connector] Support ftp file sink (#2483)|https://github.com/apache/seatunnel/commit/a87e5de80a|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-file-hadoop.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Feature][Connector-V2] Support hdfs file multi table source read (#9816)|https://github.com/apache/seatunnel/commit/672af255ef| dev | |[Feature][Connector-File-Hadoop]Support multi table sink feature for HdfsFile (#9651)|https://github.com/apache/seatunnel/commit/bb4f743c05|2.3.12| |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes 
(#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. (#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 (#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. 
(#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector] Improve write parquet (#2943)|https://github.com/apache/seatunnel/commit/8fd966394b|2.3.0-beta| |[Fix][Connector-V2] Fix HiveSource Connector read orc table error (#2845)|https://github.com/apache/seatunnel/commit/61720306e7|2.2.0-beta| |[Improve][Connector-V2] Improve read parquet 
(#2841)|https://github.com/apache/seatunnel/commit/e19bc82f9b|2.2.0-beta| |[Improve][Connector-V2] Refactor hdfs file sink connector code structure (#2701)|https://github.com/apache/seatunnel/commit/6129c02567|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Feature][Connector-V2] Add hdfs file json support (#2451)|https://github.com/apache/seatunnel/commit/84f6b17c15|2.2.0-beta| |[Improve][Connector-V2] Refactor the package of hdfs file connector (#2402)|https://github.com/apache/seatunnel/commit/87d0624c5b|2.2.0-beta| |[Feature][Connector-V2] Add hdfs file source connector (#2420)|https://github.com/apache/seatunnel/commit/4fb6f2a216|2.2.0-beta| |[Feature][Connector-V2] Add json file sink & json format (#2385)|https://github.com/apache/seatunnel/commit/dd68c06b0a|2.2.0-beta| |[Imporve][Connector-V2] Remove redundant type judge logic because of pr #2315 (#2370)|https://github.com/apache/seatunnel/commit/42e8c25e50|2.2.0-beta| |[Feature][Connector-V2] Support orc file format in file connector (#2369)|https://github.com/apache/seatunnel/commit/f44fe1e033|2.2.0-beta| |[improve][UT] Upgrade junit to 5.+ (#2305)|https://github.com/apache/seatunnel/commit/362319ff3e|2.2.0-beta| |[Connector-V2] Add parquet writer in file connector (#2273)|https://github.com/apache/seatunnel/commit/c95cc72cfa|2.2.0-beta| |[checkstyle] Improved validation scope of MagicNumber (#2194)|https://github.com/apache/seatunnel/commit/6d08b5f369|2.2.0-beta| |[Connector-V2] Add Hive sink connector v2 (#2158)|https://github.com/apache/seatunnel/commit/23ad4ee735|2.2.0-beta| |[Connector-V2] Add File Sink Connector (#2117)|https://github.com/apache/seatunnel/commit/e2283da64f|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-file-jindo-oss.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-file-local.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][Connector-V2] File Source Support filtering files by last modified time. (#9526)|https://github.com/apache/seatunnel/commit/cde4c3d410|2.3.12| |[Feature][Format] Improve maxwell_json,canal_json,debezium_json format add ts_ms and table (#9701)|https://github.com/apache/seatunnel/commit/fb8444b946|2.3.12| |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Feature][Sink] File support new format: maxwell_json,canal_json,debezium_json (#9278) (#9336)|https://github.com/apache/seatunnel/commit/a1bfbb20dd|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve] Refactor file enumerator to prevent duplicate put split (#8989)|https://github.com/apache/seatunnel/commit/fdf1beae9c|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] 
Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[feature][connector-file-local] add save mode function for localfile (#7080)|https://github.com/apache/seatunnel/commit/7b2f538310|2.3.6| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |[Chore] Fix `file` spell errors (#6606)|https://github.com/apache/seatunnel/commit/2599d3b736|2.3.5| |[Feature][Connectors-V2][File]support assign encoding for file source/sink (#6489)|https://github.com/apache/seatunnel/commit/d159fbe086|2.3.5| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. 
(#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[Feature][OssFile Connector] Make Oss implement source factory and sink factory (#6062)|https://github.com/apache/seatunnel/commit/1a8e9b4554|2.3.4| |Add multiple table file sink to base (#6049)|https://github.com/apache/seatunnel/commit/085e0e5fc3|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Feature] LocalFile sink support multiple table (#5931)|https://github.com/apache/seatunnel/commit/0fdf45f94d|2.3.4| |[Feature] LocalFileSource support multiple table|https://github.com/apache/seatunnel/commit/72be6663ad|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Feature][Connector-V2][File] Support read empty directory (#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 (#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. 
(#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][File] Unified excetion for file source & sink connectors (#3525)|https://github.com/apache/seatunnel/commit/031e8e263c|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector-V2][File] Improve code structure (#3238)|https://github.com/apache/seatunnel/commit/dd5c353881|2.3.0| |[Connector-V2] [ElasticSearch] Add 
ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Improve][Connector-V2][File] Support parse field from file path (#2985)|https://github.com/apache/seatunnel/commit/0bc12085c2|2.3.0-beta| |[Improve][connector][file] Support user-defined schema for reading text file (#2976)|https://github.com/apache/seatunnel/commit/1c05ee0d7e|2.3.0-beta| |[Improve][Connector] Improve write parquet (#2943)|https://github.com/apache/seatunnel/commit/8fd966394b|2.3.0-beta| |[Fix][Connector-V2] Fix HiveSource Connector read orc table error (#2845)|https://github.com/apache/seatunnel/commit/61720306e7|2.2.0-beta| |[Improve][Connector-V2] Improve read parquet (#2841)|https://github.com/apache/seatunnel/commit/e19bc82f9b|2.2.0-beta| |[Bug][Connector-V2] Fix error option (#2775)|https://github.com/apache/seatunnel/commit/488e561eef|2.2.0-beta| |[Improve][Connector-V2] Refactor local file sink connector code structure (#2655)|https://github.com/apache/seatunnel/commit/6befd599a1|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Feature][Connector-V2] Local file json support (#2465)|https://github.com/apache/seatunnel/commit/65a92f2496|2.2.0-beta| |[Feature][Connector-V2] Add local file connector source (#2419)|https://github.com/apache/seatunnel/commit/eff595c452|2.2.0-beta| |[Improve][Connector-V2] Refactor the package of local file connector (#2403)|https://github.com/apache/seatunnel/commit/a538daed5c|2.2.0-beta| |[Feature][Connector-V2] Add json file sink & json format (#2385)|https://github.com/apache/seatunnel/commit/dd68c06b0a|2.2.0-beta| |[Imporve][Connector-V2] Remove redundant type judge logic because of pr #2315 (#2370)|https://github.com/apache/seatunnel/commit/42e8c25e50|2.2.0-beta| |[Feature][Connector-V2] 
Support orc file format in file connector (#2369)|https://github.com/apache/seatunnel/commit/f44fe1e033|2.2.0-beta| |[improve][UT] Upgrade junit to 5.+ (#2305)|https://github.com/apache/seatunnel/commit/362319ff3e|2.2.0-beta| |[Connector-V2] Add parquet writer in file connector (#2273)|https://github.com/apache/seatunnel/commit/c95cc72cfa|2.2.0-beta| |[checkstyle] Improved validation scope of MagicNumber (#2194)|https://github.com/apache/seatunnel/commit/6d08b5f369|2.2.0-beta| |[Connector-V2] Add Hive sink connector v2 (#2158)|https://github.com/apache/seatunnel/commit/23ad4ee735|2.2.0-beta| |[Connector-V2] Add File Sink Connector (#2117)|https://github.com/apache/seatunnel/commit/e2283da64f|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-file-obs.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-file-oss-jindo.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-file-oss.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[Doc][Connector-V2] Update save mode config for OssFileSink (#9303)|https://github.com/apache/seatunnel/commit/40097d7f3e|2.3.11| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes 
(#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Improve] Added OSSFileCatalog and it's factory (#7458)|https://github.com/apache/seatunnel/commit/9006a205db|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |[Chore] Fix `file` spell errors (#6606)|https://github.com/apache/seatunnel/commit/2599d3b736|2.3.5| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. 
(#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[Feature][OssFile Connector] Make Oss implement source factory and sink factory (#6062)|https://github.com/apache/seatunnel/commit/1a8e9b4554|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |[Hotfix][Oss File Connector] fix oss connector can not run bug (#6010)|https://github.com/apache/seatunnel/commit/755bc2a730|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |[Feature][Connector-V2][File] Support read empty directory (#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 (#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. 
(#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| |[Fix][Connector-V2] Fix file-oss config check bug and amend file-oss-jindo factoryIdentifier (#4581)|https://github.com/apache/seatunnel/commit/5c4f17df20|2.3.2| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][File] Unified excetion for file source & sink connectors (#3525)|https://github.com/apache/seatunnel/commit/031e8e263c|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector-V2][File] Improve code structure 
(#3238)|https://github.com/apache/seatunnel/commit/dd5c353881|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Improve][Connector-V2][File] Support parse field from file path (#2985)|https://github.com/apache/seatunnel/commit/0bc12085c2|2.3.0-beta| |[Improve][connector][file] Support user-defined schema for reading text file (#2976)|https://github.com/apache/seatunnel/commit/1c05ee0d7e|2.3.0-beta| |[Improve][Connector] Improve write parquet (#2943)|https://github.com/apache/seatunnel/commit/8fd966394b|2.3.0-beta| |[Fix][Connector-V2] Fix HiveSource Connector read orc table error (#2845)|https://github.com/apache/seatunnel/commit/61720306e7|2.2.0-beta| |[Improve][Connector-V2] Improve read parquet (#2841)|https://github.com/apache/seatunnel/commit/e19bc82f9b|2.2.0-beta| |[Feature][Connector-V2] Add oss sink (#2629)|https://github.com/apache/seatunnel/commit/bb2ad40487|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Feature][Connector-V2] Add oss source connector (#2467)|https://github.com/apache/seatunnel/commit/712b77744e|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-file-s3.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Fix][Connector-V2] Fixed incorrectly setting s3 key in some cases (#8885)|https://github.com/apache/seatunnel/commit/cf4bab5be2|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| | [improve] update S3File connector config option (#8615)|https://github.com/apache/seatunnel/commit/80cc9fa6ff|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Hotfix][Zeta] Fix the dependency conflict between the guava in hadoop-aws and hive-exec 
(#7986)|https://github.com/apache/seatunnel/commit/a7837f1f19|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Improve] Refactor S3FileCatalog and it's factory (#7457)|https://github.com/apache/seatunnel/commit/d928e8b113|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |[Feature][S3 File] Make S3 File Connector support multiple table write (#6698)|https://github.com/apache/seatunnel/commit/8f2049b2f1|2.3.6| |[Improve][Connector-v2] The hive connector support multiple filesystem (#6648)|https://github.com/apache/seatunnel/commit/8a4c01fe35|2.3.6| |[bigfix][S3 File]:Change the [SCHEMA] attribute of the [S3CONF class] to be non-static to avoid being reassigned after deserialization (#6717)|https://github.com/apache/seatunnel/commit/79bb70101a|2.3.6| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. 
(#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[Test][E2E] Add thread leak check for connector (#5773)|https://github.com/apache/seatunnel/commit/1f2f3fc5f0|2.3.4| |[Feature][Connector]add s3file save mode function (#6131)|https://github.com/apache/seatunnel/commit/81c51073bf|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |[Feature][Connector-V2][File] Support read empty directory (#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 (#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. 
(#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| |[chore] delete unavailable S3 & Kafka Catalogs (#4477)|https://github.com/apache/seatunnel/commit/e0aec5ecec|2.3.2| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |[Chore] Upgrade guava to 27.0-jre (#4238)|https://github.com/apache/seatunnel/commit/4851bee575|2.3.1| |Add redshift datatype convertor (#4245)|https://github.com/apache/seatunnel/commit/b19011517f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |Add S3Catalog (#4121)|https://github.com/apache/seatunnel/commit/7d7f506547|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Engine][Checkpoint]Unified naming style (#3714)|https://github.com/apache/seatunnel/commit/bc0bd3bec3|2.3.0| |[Connector][File-S3]Set AK is not required (#3713)|https://github.com/apache/seatunnel/commit/da3c526172|2.3.0| |[Connector&Engine]Set S3 AK to optional (#3688)|https://github.com/apache/seatunnel/commit/4710918b02|2.3.0| |[Connector][S3]Support s3a protocol (#3632)|https://github.com/apache/seatunnel/commit/ae4cc9c1ec|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][File] Unified excetion for file source & sink connectors (#3525)|https://github.com/apache/seatunnel/commit/031e8e263c|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector-V2][File] Improve code structure (#3238)|https://github.com/apache/seatunnel/commit/dd5c353881|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Feature][Connector-V2][S3] Add S3 file source & sink connector (#3119)|https://github.com/apache/seatunnel/commit/f27d68ca9c|2.3.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-file-sftp.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Hotfix][Connector-V2][SFTP] Add quote to sftp file names with wildcard characters (#8501)|https://github.com/apache/seatunnel/commit/c5751b001b|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink (#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Improve][Connector-V2] Add some debug log when create dir in (S)FTP (#8286)|https://github.com/apache/seatunnel/commit/8687bb8e91|2.3.9| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Feature][Connector-V2]Sftp file source support multiple table 
(#7824)|https://github.com/apache/seatunnel/commit/cfb8760f58|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Connector-V2] sftp file sink suport multiple table and save mode (#7668)|https://github.com/apache/seatunnel/commit/dc4b9898f7|2.3.8| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |[Feature][Connector-V2] Supports the transfer of any file (#6826)|https://github.com/apache/seatunnel/commit/c1401787b3|2.3.6| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. (#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[BugFix][Connector-file-sftp] Fix SFTPInputStream.close does not correctly trigger the closing of the file stream (#6323) (#6329)|https://github.com/apache/seatunnel/commit/eee881af91|2.3.5| |[Test][E2E] Add thread leak check for connector (#5773)|https://github.com/apache/seatunnel/commit/1f2f3fc5f0|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |[Feature][Connector-V2][File] Support read empty directory 
(#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 (#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. (#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| |[Bug Fix] [seatunnel-connectors-v2][SFTP] Fix incorrect exception handling logic (#4720)|https://github.com/apache/seatunnel/commit/dc350e67c3|2.3.2| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][File] Unified excetion for file source & sink connectors (#3525)|https://github.com/apache/seatunnel/commit/031e8e263c|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector-V2][File] Improve code structure (#3238)|https://github.com/apache/seatunnel/commit/dd5c353881|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Feature][Connector-V2][SFTP] Add SFTP file source & sink connector (#3006)|https://github.com/apache/seatunnel/commit/9e496383b8|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-file.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Connector-V2] Support hdfs file multi table source read (#9816)|https://github.com/apache/seatunnel/commit/672af255ef| dev | |[Feature][Transform-V2] Support multimodal embeddings (#9673)|https://github.com/apache/seatunnel/commit/12414c4eab| dev | |[Improve][Connector-V2] File Source Support filtering files by last modified time. (#9526)|https://github.com/apache/seatunnel/commit/cde4c3d410|2.3.12| |[Feature][Format] Improve maxwell_json,canal_json,debezium_json format add ts_ms and table (#9701)|https://github.com/apache/seatunnel/commit/fb8444b946|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Feature] [connector-file] Add configurable sheet_max_rows support for Excel sink connector (#9668)|https://github.com/apache/seatunnel/commit/ea5bc51067|2.3.12| |[Feature][Connector-File-Hadoop]Support multi table sink feature for HdfsFile (#9651)|https://github.com/apache/seatunnel/commit/bb4f743c05|2.3.12| |[Improve][Csv] support configurable CSV delimiter in file connector (#9660)|https://github.com/apache/seatunnel/commit/48fb7ef697|2.3.12| |[Fix][Connector-V2] Update file filter pattern compilation to remove unnecessary quoting (#9658)|https://github.com/apache/seatunnel/commit/b5c7b4ad0e|2.3.12| |[Improve][Connector-V2] Add customizable row delimiter support for text file processing (#9608)|https://github.com/apache/seatunnel/commit/7898e62e01|2.3.12| |[Fix][Connector-File] Fix parquet support user config schema (#9596)|https://github.com/apache/seatunnel/commit/2bdaeb6a07|2.3.12| |[Improve][Connector-file] Add configurable binary chunk size support to BinaryReadStrategy (#9391)|https://github.com/apache/seatunnel/commit/38e87e75a3|2.3.12| |[Feature][Sink] File support new format: maxwell_json,canal_json,debezium_json (#9278) 
(#9336)|https://github.com/apache/seatunnel/commit/a1bfbb20dd|2.3.12| |[Improve][Connector-V2] Support maxcompute sink writer with timestamp field type (#9234)|https://github.com/apache/seatunnel/commit/a513c495e3|2.3.12| |[Feature][connector-hive] hive sink connector support overwrite mode #7843 (#7891)|https://github.com/apache/seatunnel/commit/6fafe6f4d3|2.3.12| |[Improve][Connector-V2] Add remote host verification option for FTP data channels (#9324)|https://github.com/apache/seatunnel/commit/019d69d10a|2.3.11| |[Doc][Connector-V2] Update save mode config for OssFileSink (#9303)|https://github.com/apache/seatunnel/commit/40097d7f3e|2.3.11| |[Fix][connector-file-base] fix parquet int32 convert error (#9142)|https://github.com/apache/seatunnel/commit/e6413c388e|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][API] Fixed not invoke the `SinkAggregatedCommitter`'s init method (#9070)|https://github.com/apache/seatunnel/commit/df0d11d632|2.3.11| |[Bugfix][Csv] Fix csv format delimiter (#9066)|https://github.com/apache/seatunnel/commit/ff5fc129b8|2.3.11| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Feature][File] Support extract CSV files with different columns in different order (#9064)|https://github.com/apache/seatunnel/commit/74db1cbaac|2.3.11| |[Improve] Refactor file enumerator to prevent duplicate put split (#8989)|https://github.com/apache/seatunnel/commit/fdf1beae9c|2.3.11| |[Improve][File] Add row_delimiter options into text file sink (#9017)|https://github.com/apache/seatunnel/commit/92aa855a34|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Fix][File]use 
common-csv to read csv file (#8919)|https://github.com/apache/seatunnel/commit/3e64a42838|2.3.10| |[Improve][Connector-V2] Ensure that the FTP connector behaves reliably during directory operation (#8959)|https://github.com/apache/seatunnel/commit/b5f0b43fcb|2.3.10| |[Improve][connector-file-base] Improved multiple table file source allocation algorithm for subtasks (#8878)|https://github.com/apache/seatunnel/commit/44a12cc55c|2.3.10| |[Fix][Connector-V2] Fixed incorrectly setting s3 key in some cases (#8885)|https://github.com/apache/seatunnel/commit/cf4bab5be2|2.3.10| |[Fix][Connector-File] Fix conflicting `file_format_type` requirement (#8823)|https://github.com/apache/seatunnel/commit/6e0d630f7c|2.3.10| |[Feature][Connector-V2] Add `filename_extension` parameter for read/write file (#8769)|https://github.com/apache/seatunnel/commit/78b23c0ef5|2.3.10| |[Improve][Connector-V2] Improve orc read error message (#8751)|https://github.com/apache/seatunnel/commit/d66d9dc9ce|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| | [improve] update S3File connector config option (#8615)|https://github.com/apache/seatunnel/commit/80cc9fa6ff|2.3.10| |[Fix][Connector-V2] User selects csv string pattern (#8572)|https://github.com/apache/seatunnel/commit/227a11f5aa|2.3.10| |[Fix][Connector-V2] Fix CSV String type write type (#8499)|https://github.com/apache/seatunnel/commit/9268f5a255|2.3.10| |[Hotfix][Connector-V2][SFTP] Add quote to sftp file names with wildcard characters (#8501)|https://github.com/apache/seatunnel/commit/c5751b001b|2.3.10| |[Fix][File] Fix Multi-file with binary format synchronization failed (#8546)|https://github.com/apache/seatunnel/commit/6e4ee468a5|2.3.10| |[Feature][Connector-V2] Support create emtpy file when no data (#8543)|https://github.com/apache/seatunnel/commit/275db78918|2.3.10| |[Feature][Connector-V2] Support single file mode in file sink 
(#8518)|https://github.com/apache/seatunnel/commit/e893deed50|2.3.10| |[Improve][Connector-file-base] Improved file allocation algorithm for subtasks. (#8453)|https://github.com/apache/seatunnel/commit/d61cba233e|2.3.9| |[Bug] [connector-file] When the data source field is less than the target (Hive) field,it will throw null pointer exception#8150 (#8200)|https://github.com/apache/seatunnel/commit/25b8a02b76|2.3.9| |[Fix] Set all snappy dependency use one version (#8423)|https://github.com/apache/seatunnel/commit/3ac977c8d3|2.3.9| |[Improve][Connector][Hive] skip temporary hidden directories (#8402)|https://github.com/apache/seatunnel/commit/9fdedc487e|2.3.9| |[Feature][Connector-V2] Support use EasyExcel as read excel engine (#8064)|https://github.com/apache/seatunnel/commit/b8e1177fcb|2.3.9| |[BugFix][Excel] Fix read formulas/number cell value of excel (#8316)|https://github.com/apache/seatunnel/commit/00c5aed1af|2.3.9| |[Improve][Connector-V2] Add some debug log when create dir in (S)FTP (#8286)|https://github.com/apache/seatunnel/commit/8687bb8e91|2.3.9| |[Improve][Transform] gz support excel (#8181)|https://github.com/apache/seatunnel/commit/c3ae726ee0|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Improve][Excel] Support read blank string & auto type-cast (#8111)|https://github.com/apache/seatunnel/commit/3a54f1253f|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Feature][Connectors] LocalFile Support reading gz (#8025)|https://github.com/apache/seatunnel/commit/337aa50f08|2.3.9| |[Hotfix][Zeta] Fix the dependency conflict between the guava in hadoop-aws and hive-exec (#7986)|https://github.com/apache/seatunnel/commit/a7837f1f19|2.3.9| |[Fix][Connector-V2] Fix file binary format 
sync convert directory to file (#7942)|https://github.com/apache/seatunnel/commit/86ae9272c4|2.3.9| |[Fix][Connector-V2][FTP] Fix FTP connector connection_mode is not effective (#7865)|https://github.com/apache/seatunnel/commit/26c528a5ed|2.3.9| |[Fix][Connector-V2][connector-file-base-hadoop] Fixed HdfsFile source load the krb5_path configuration (#7870)|https://github.com/apache/seatunnel/commit/cd9836bced|2.3.9| |[Improve][Connector-V2] Change File Read/WriteStrategy `setSeaTunnelRowTypeInfo` to `setCatalogTable` (#7829)|https://github.com/apache/seatunnel/commit/6b5f74e524|2.3.9| |[Feature][Connector-V2]Sftp file source support multiple table (#7824)|https://github.com/apache/seatunnel/commit/cfb8760f58|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Bug] [connectors-v2] The Hadoop Source/Sink fails with Unable to find valid Kerberos Ticket. (#7809)|https://github.com/apache/seatunnel/commit/a8bdea24cc|2.3.9| |[Fix][Connector-V2] Fix When reading Excel data, string and date type conversion errors (#7796)|https://github.com/apache/seatunnel/commit/749b2fe364|2.3.9| |[Feature][Connector-V2]Ftp file source support multiple table (#7795)|https://github.com/apache/seatunnel/commit/22fe27a3d6|2.3.9| |[Feature][Connector-V2] sftp file sink suport multiple table and save mode (#7668)|https://github.com/apache/seatunnel/commit/dc4b9898f7|2.3.8| |[Improve][Connector-V2] Support read archive compress file (#7633)|https://github.com/apache/seatunnel/commit/3f98cd8a16|2.3.8| |[Feature][Connector-V2] Ftp file sink suport multiple table and save mode (#7665)|https://github.com/apache/seatunnel/commit/4f812e12ae|2.3.8| |[Improve] Refactor S3FileCatalog and it's factory (#7457)|https://github.com/apache/seatunnel/commit/d928e8b113|2.3.8| |[Improve] Added OSSFileCatalog and it's factory (#7458)|https://github.com/apache/seatunnel/commit/9006a205db|2.3.8| 
|[Feature][Connector-V2][Iceberg] Support Iceberg Kerberos (#7246)|https://github.com/apache/seatunnel/commit/e3001207c8|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[feature][connector-file-local] add save mode function for localfile (#7080)|https://github.com/apache/seatunnel/commit/7b2f538310|2.3.6| |[Hotfix][Hive Connector] Fix Hive hdfs-site.xml and hive-site.xml not be load error (#7069)|https://github.com/apache/seatunnel/commit/c23a577f34|2.3.6| |[Feature][Connector-V2] Add Huawei Cloud OBS connector (#4578)|https://github.com/apache/seatunnel/commit/d266f4db64|2.3.6| |[Improve][File Connector]Improve xml read code & fix can not use true for a boolean option (#6930)|https://github.com/apache/seatunnel/commit/c13a563994|2.3.6| |[Improve][Files] Support write fixed/timestamp as int96 of parquet (#6971)|https://github.com/apache/seatunnel/commit/1a48a9c493|2.3.6| |[Feature][Connector-V2] Supports the transfer of any file (#6826)|https://github.com/apache/seatunnel/commit/c1401787b3|2.3.6| |[Feature][S3 File] Make S3 File Connector support multiple table write (#6698)|https://github.com/apache/seatunnel/commit/8f2049b2f1|2.3.6| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Improve][Connector-v2] The hive connector support multiple filesystem (#6648)|https://github.com/apache/seatunnel/commit/8a4c01fe35|2.3.6| |[bigfix][S3 File]:Change the [SCHEMA] attribute of the [S3CONF class] to be non-static to avoid being reassigned after deserialization (#6717)|https://github.com/apache/seatunnel/commit/79bb70101a|2.3.6| |[Improve] Improve read with parquet type convert error (#6683)|https://github.com/apache/seatunnel/commit/6c65805699|2.3.5| |[Hotfix] fix 
http source can not read yyyy-MM-dd HH:mm:ss format bug & Improve DateTime Utils (#6601)|https://github.com/apache/seatunnel/commit/19888e7969|2.3.5| |[Feature][Tool] Add connector check script for issue 6199 (#6635)|https://github.com/apache/seatunnel/commit/65aedf6a79|2.3.5| |[Bug] Fix OrcWriteStrategy/ParquetWriteStrategy doesn't login with kerberos (#6472)|https://github.com/apache/seatunnel/commit/24441c876d|2.3.5| |[Bug] [formats] Fix fail to parse line when content contains the file delimiter (#6589)|https://github.com/apache/seatunnel/commit/17e29185fa|2.3.5| |[Improve][Connector-V2] Support read orc with schema config to cast type (#6531)|https://github.com/apache/seatunnel/commit/d1599f8ad9|2.3.5| |[Chore] Fix `file` spell errors (#6606)|https://github.com/apache/seatunnel/commit/2599d3b736|2.3.5| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Feature][Connectors-V2][File]support assign encoding for file source/sink (#6489)|https://github.com/apache/seatunnel/commit/d159fbe086|2.3.5| |Add support for XML file type to various file connectors such as SFTP, FTP, LocalFile, HdfsFile, and more. 
(#6327)|https://github.com/apache/seatunnel/commit/ec533ecd9a|2.3.5| |[BugFix][Connector-file-sftp] Fix SFTPInputStream.close does not correctly trigger the closing of the file stream (#6323) (#6329)|https://github.com/apache/seatunnel/commit/eee881af91|2.3.5| |[Test][E2E] Add thread leak check for connector (#5773)|https://github.com/apache/seatunnel/commit/1f2f3fc5f0|2.3.4| |Fix HiveMetaStoreProxy#enableKerberos will return true if doesn't enable kerberos (#6307)|https://github.com/apache/seatunnel/commit/1dad6f7061|2.3.4| |[Feature][Connector]add s3file save mode function (#6131)|https://github.com/apache/seatunnel/commit/81c51073bf|2.3.4| |[bugfix][file-execl] Fix the Issue of Abnormal Data Reading from Excel Files (#5932)|https://github.com/apache/seatunnel/commit/6a2b05a845|2.3.4| |[Feature][Connectors-v2-file-ftp] FTP source/sink add ftp connection mode (#6077) (#6099)|https://github.com/apache/seatunnel/commit/f6bcc4d59d|2.3.4| |Disable HDFSFileSystem cache (#6039)|https://github.com/apache/seatunnel/commit/135c91818e|2.3.4| |[Feature][OssFile Connector] Make Oss implement source factory and sink factory (#6062)|https://github.com/apache/seatunnel/commit/1a8e9b4554|2.3.4| |[Improve][Common] Adapt `FILE_OPERATION_FAILED` to `CommonError` (#5928)|https://github.com/apache/seatunnel/commit/b3dc0bbc21|2.3.4| |[Feature][Connector-V2] Support read .xls excel file (#6066)|https://github.com/apache/seatunnel/commit/43787a3dde|2.3.4| |Add multiple table file sink to base (#6049)|https://github.com/apache/seatunnel/commit/085e0e5fc3|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |[Hotfix][Oss File Connector] fix oss connector can not run bug (#6010)|https://github.com/apache/seatunnel/commit/755bc2a730|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Feature][Core] Upgrade flink source translation 
(#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[Feature] LocalFile sink support multiple table (#5931)|https://github.com/apache/seatunnel/commit/0fdf45f94d|2.3.4| |[Improve][File] Clean memory buffer of `JsonWriteStrategy` & `ExcelWriteStrategy` (#5925)|https://github.com/apache/seatunnel/commit/7297a4c95c|2.3.4| |[Bug][Connector][FileBase]Parquet reader parsing array type exception. (#4457)|https://github.com/apache/seatunnel/commit/5c6b11329c|2.3.4| |[Improve]Change System.out.println to log output. (#5912)|https://github.com/apache/seatunnel/commit/bbedb07a9c|2.3.4| |[Feature] LocalFileSource support multiple table|https://github.com/apache/seatunnel/commit/72be6663ad|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][connector-file] unifiy option between file source/sink and update document (#5680)|https://github.com/apache/seatunnel/commit/8d87cf8fc4|2.3.4| |[Improve][LocalFile] parquet use system timezone (#5605)|https://github.com/apache/seatunnel/commit/b3e13513ac|2.3.4| |[Bugfix][Connector-v2] fix file sink `isPartitionFieldWriteInFile` occurred exception when no columns are given (#5508)|https://github.com/apache/seatunnel/commit/9fb5499295|2.3.4| |[Feature] Support `LZO` compress on File Read (#5083)|https://github.com/apache/seatunnel/commit/a4a1901096|2.3.4| |[Feature][Connector-V2][File] Support read empty directory (#5591)|https://github.com/apache/seatunnel/commit/1f58f224a0|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature] [File Connector]optionrule FILE_FORMAT_TYPE is text/csv ,add parameter BaseSinkConfig.ENABLE_HEADER_WRITE: #5566 
(#5567)|https://github.com/apache/seatunnel/commit/0e02db768d|2.3.4| |[Hotfix][File-Connector] Fix WriteStrategy parallel writing thread unsafe issue (#5546)|https://github.com/apache/seatunnel/commit/1177d02d55|2.3.4| |[Bugfix][jindo] Remove useless code (#5540)|https://github.com/apache/seatunnel/commit/b889618379|2.3.4| |[Feature] [File Connector] Supports writing column names when the output type is file (CSV) (#5459)|https://github.com/apache/seatunnel/commit/f73b37291e|2.3.4| |[bugfix][CI]remove jindo dependencies|https://github.com/apache/seatunnel/commit/38e1e30e20|2.3.4| |[Feature][Connector-V2][Oss jindo] Fix the problem of jindo driver download failure. (#5511)|https://github.com/apache/seatunnel/commit/a14d9c0d08|2.3.4| |Revert "[fix][hive-source][bug] fix An error occurred reading an empty directory (#5427)" (#5487)|https://github.com/apache/seatunnel/commit/093901068e|2.3.4| |[fix][hive-source][bug] fix An error occurred reading an empty directory (#5427)|https://github.com/apache/seatunnel/commit/de7b86a5dd|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. (#5153)|https://github.com/apache/seatunnel/commit/a3c13e59eb|2.3.3| |[bugfix] [File Base] Fix Hadoop Kerberos authentication related issues. 
(#5171)|https://github.com/apache/seatunnel/commit/2a85525f4c|2.3.3| |[Feature][Connector-V2][File] Add cos source&sink (#4979)|https://github.com/apache/seatunnel/commit/1f94676436|2.3.3| |[Improve][Connector[File] Optimize files commit order (#5045)|https://github.com/apache/seatunnel/commit/1e18a8c530|2.3.3| |[Improve][Connector-V2][OSS-Jindo] Optimize jindo oss connector (#4964)|https://github.com/apache/seatunnel/commit/5fbfd05061|2.3.3| |[Feature][E2E][FtpFile] add ftp file e2e test case (#4647)|https://github.com/apache/seatunnel/commit/b1b1f5e7e0|2.3.3| |[Bugfix] [Connector-V2] [File] Fix read temp file (#4876)|https://github.com/apache/seatunnel/commit/5e03d22d6c|2.3.2| |[Bug Fix] [seatunnel-connectors-v2][SFTP] Fix incorrect exception handling logic (#4720)|https://github.com/apache/seatunnel/commit/dc350e67c3|2.3.2| |[Fix][Connector-V2] Fix file-oss config check bug and amend file-oss-jindo factoryIdentifier (#4581)|https://github.com/apache/seatunnel/commit/5c4f17df20|2.3.2| |[chore] delete unavailable S3 & Kafka Catalogs (#4477)|https://github.com/apache/seatunnel/commit/e0aec5ecec|2.3.2| | [Feature][ConnectorV2]add file excel sink and source (#4164)|https://github.com/apache/seatunnel/commit/e3b97ae5d2|2.3.2| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |[Chore] Upgrade guava to 27.0-jre (#4238)|https://github.com/apache/seatunnel/commit/4851bee575|2.3.1| |Add redshift datatype convertor (#4245)|https://github.com/apache/seatunnel/commit/b19011517f|2.3.1| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[chore] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/291214ad6f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] 
Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Imprve][Connector-V2][Hive] Support read text table & Column projection (#4105)|https://github.com/apache/seatunnel/commit/717620f542|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |Add S3Catalog (#4121)|https://github.com/apache/seatunnel/commit/7d7f506547|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Hive] Support assign partitions (#3842)|https://github.com/apache/seatunnel/commit/6a4a850b4c|2.3.1| |[Bug][Connectors] Text And Json WriteStrategy lost the sinkColumnsIndexInRow (#3863)|https://github.com/apache/seatunnel/commit/7b5f6f1bc2|2.3.1| |[Feature][Connector-V2][File] Support compress (#3899)|https://github.com/apache/seatunnel/commit/55602f6b1c|2.3.1| |[Feature][Connector-V2][File] Allow the user to set the row delimiter as an empty string (#3854)|https://github.com/apache/seatunnel/commit/84508fcb65|2.3.1| |[Feature][Connector-V2] Support kerberos in hive and hdfs file connector (#3840)|https://github.com/apache/seatunnel/commit/055ad9d836|2.3.1| |[Feature][Connector-V2][File] Support skip number when reading text csv files (#3900)|https://github.com/apache/seatunnel/commit/243b6a6b23|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Improve][Connector-V2][File] Improve file connector option rule and document (#3812)|https://github.com/apache/seatunnel/commit/bd76077669|2.3.1| |[Improve][Connector-V2][File] File Connector add lzo compression way. 
(#3782)|https://github.com/apache/seatunnel/commit/8875d02589|2.3.1| |[Improve][Connector-V2] The log outputs detailed exception stack information (#3805)|https://github.com/apache/seatunnel/commit/d0c6217f27|2.3.1| |fix file source connector option rule bug (#3804)|https://github.com/apache/seatunnel/commit/cab42f6eb1|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Improve][Connector-V2][HDFS] Support setting hdfs-site.xml (#3778)|https://github.com/apache/seatunnel/commit/c8d59ecac1|2.3.0| |[Feature][Connector-V2][File] Optimize filesystem utils (#3749)|https://github.com/apache/seatunnel/commit/ac4e880fb5|2.3.0| |[Improve] [Connector-V2] Fix Kafka sink can't run EXACTLY_ONCE semantics (#3724)|https://github.com/apache/seatunnel/commit/5e3f196e29|2.3.0| |[Connector-V2] [File] Fix bug data file name will duplicate when use SeaTunnel Engine (#3717)|https://github.com/apache/seatunnel/commit/c96c53004f|2.3.0| |[Engine][Checkpoint]Unified naming style (#3714)|https://github.com/apache/seatunnel/commit/bc0bd3bec3|2.3.0| |[Connector][File-S3]Set AK is not required (#3713)|https://github.com/apache/seatunnel/commit/da3c526172|2.3.0| |[Hotfix][Connector-V2][File] Fix file sink connector npe (#3706)|https://github.com/apache/seatunnel/commit/a662a88fdc|2.3.0| |[Connector&Engine]Set S3 AK to optional (#3688)|https://github.com/apache/seatunnel/commit/4710918b02|2.3.0| |[Hotfix][OssFile Connector]fix ossfile bug (#3684)|https://github.com/apache/seatunnel/commit/ba6259274d|2.3.0| |[Feature][Connector-V2][Oss jindo] Add oss jindo source & sink connector (#3456)|https://github.com/apache/seatunnel/commit/2507372311|2.3.0| |[Improve][Connector-V2][File] Support split file based on batch size (#3625)|https://github.com/apache/seatunnel/commit/f39e3a531d|2.3.0| |[Connector][S3]Support s3a protocol (#3632)|https://github.com/apache/seatunnel/commit/ae4cc9c1ec|2.3.0| |[Hotfix][OptionRule] Fix option rule 
about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][File] Unified excetion for file source & sink connectors (#3525)|https://github.com/apache/seatunnel/commit/031e8e263c|2.3.0| |[Hotfix][Connector-V2][Hive] Fix npe of getting file system (#3506)|https://github.com/apache/seatunnel/commit/e1fc3d1b01|2.3.0| |[Improve][core-v1][seatunnel-core-base] remove seatunnel-core-base (#3480)|https://github.com/apache/seatunnel/commit/d6e6a02a36|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Improve][Connector-V2][File] Improve code structure (#3238)|https://github.com/apache/seatunnel/commit/dd5c353881|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Hotfix][Connector-V2][Hive] Fix the bug that when write data to hive throws NullPointerException (#3258)|https://github.com/apache/seatunnel/commit/777bf6b42e|2.3.0| |[Core] [Improve] Fix some sonar check error (#3240)|https://github.com/apache/seatunnel/commit/8664bb53a5|2.3.0| |[Bug]add 3node worker done test and fix some bug (#3115)|https://github.com/apache/seatunnel/commit/bc852a4dff|2.3.0| |[Feature][Connector-V2][SFTP] Add SFTP file source & sink connector (#3006)|https://github.com/apache/seatunnel/commit/9e496383b8|2.3.0| |[Feature][Connector-V2][S3] Add S3 file source & sink connector (#3119)|https://github.com/apache/seatunnel/commit/f27d68ca9c|2.3.0-beta| |[Feature][Connector-V2][File] Fix filesystem get error (#3117)|https://github.com/apache/seatunnel/commit/7404c180de|2.3.0-beta| |[Improve][Connector-v2][file] Reuse array type container when read row data (#3123)|https://github.com/apache/seatunnel/commit/da0646ac6d|2.3.0-beta| |[Hotfix][Connector-V2][File] Fix ParquetReadStrategy get NPE 
(#3122)|https://github.com/apache/seatunnel/commit/ba99de08c8|2.3.0-beta| |[hotfix][engine] Add master node switch test and fix bug (#3082)|https://github.com/apache/seatunnel/commit/608be51bc4|2.3.0-beta| |[Improve][Connector-V2][File] Support parse field from file path (#2985)|https://github.com/apache/seatunnel/commit/0bc12085c2|2.3.0-beta| |[hotfix][connector][file] Solved the bug of can not parse '\t' as delimiter from config file (#3083)|https://github.com/apache/seatunnel/commit/bfde596754|2.3.0-beta| |unify `flatten-maven-plugin` version (#3078)|https://github.com/apache/seatunnel/commit/ed743fddcc|2.3.0-beta| |[Improve][Connector-V2] Improve text write (#2971)|https://github.com/apache/seatunnel/commit/0ecd7906c2|2.3.0-beta| |[Improve][connector][file] Support user-defined schema for reading text file (#2976)|https://github.com/apache/seatunnel/commit/1c05ee0d7e|2.3.0-beta| |[Bug][Connector-V2][File] Fix the bug of incorrect path in windows environment (#2980)|https://github.com/apache/seatunnel/commit/2e16161865|2.3.0-beta| |[Improve][Connector] Improve write parquet (#2943)|https://github.com/apache/seatunnel/commit/8fd966394b|2.3.0-beta| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Bug][connector-file-base] Fix source split assigning reader to negative number (#2921)|https://github.com/apache/seatunnel/commit/0b5a2852fb|2.3.0-beta| |[Improve][Connector-V2] Improve orc write strategy to support all data types (#2860)|https://github.com/apache/seatunnel/commit/4d048cc23e|2.3.0-beta| |[Fix] [Connector-V2-File] Fix file connector bug (#2858)|https://github.com/apache/seatunnel/commit/e0459bbab6|2.2.0-beta| |[Fix][Connector-V2] Fix HiveSource Connector read orc table error (#2845)|https://github.com/apache/seatunnel/commit/61720306e7|2.2.0-beta| |[Improve][Connector-V2] Improve read parquet (#2841)|https://github.com/apache/seatunnel/commit/e19bc82f9b|2.2.0-beta| |[Imporve][Connector-V2] 
Refactor ftp sink & Add ftp file source (#2774)|https://github.com/apache/seatunnel/commit/4aacbcdd1f|2.2.0-beta| |[Bug] [Connector-V2] Fix hive source connector parallelism not work (#2823)|https://github.com/apache/seatunnel/commit/9f21d4c769|2.2.0-beta| |[Improve][Connector-V2] Imporve orc read strategy (#2747)|https://github.com/apache/seatunnel/commit/af34beda37|2.2.0-beta| |[Bug][Connector-V2] Fix error option (#2775)|https://github.com/apache/seatunnel/commit/488e561eef|2.2.0-beta| |[Improve][Connector-V2] Refactor hdfs file sink connector code structure (#2701)|https://github.com/apache/seatunnel/commit/6129c02567|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Improve][build] Improved scope of maven-shade-plugin (#2665)|https://github.com/apache/seatunnel/commit/93bc8bd116|2.2.0-beta| |[Improve][Connector-V2] Refactor local file sink connector code structure (#2655)|https://github.com/apache/seatunnel/commit/6befd599a1|2.2.0-beta| |[Feature][Connector-V2] Add oss sink (#2629)|https://github.com/apache/seatunnel/commit/bb2ad40487|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Improve][Connector-V2] Refactor the structure of file sink to reduce redundant codes (#2555)|https://github.com/apache/seatunnel/commit/6315092930|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Feature][Connector-V2] Add oss source connector (#2467)|https://github.com/apache/seatunnel/commit/712b77744e|2.2.0-beta| |[Feature][File connector] Support ftp file sink (#2483)|https://github.com/apache/seatunnel/commit/a87e5de80a|2.2.0-beta| |[Feature][Connector-V2] Local file json support (#2465)|https://github.com/apache/seatunnel/commit/65a92f2496|2.2.0-beta| |[Feature][Connector-V2] 
Add hdfs file json support (#2451)|https://github.com/apache/seatunnel/commit/84f6b17c15|2.2.0-beta| |[Improve][Connector-V2] Refactor the package of hdfs file connector (#2402)|https://github.com/apache/seatunnel/commit/87d0624c5b|2.2.0-beta| |[Feature][Connector-V2] Add hdfs file source connector (#2420)|https://github.com/apache/seatunnel/commit/4fb6f2a216|2.2.0-beta| |[Feature][Connector-V2] Add local file connector source (#2419)|https://github.com/apache/seatunnel/commit/eff595c452|2.2.0-beta| |[Feature][Connector-V2] Add base source connector code for connector-file-base (#2399)|https://github.com/apache/seatunnel/commit/1829ddc662|2.2.0-beta| |[Improve][Connector-V2] Refactor the package of local file connector (#2403)|https://github.com/apache/seatunnel/commit/a538daed5c|2.2.0-beta| |[Feature][Connector-V2] Add json file sink & json format (#2385)|https://github.com/apache/seatunnel/commit/dd68c06b0a|2.2.0-beta| |[Bug][Connector-V2] Fix the bug that file connector release resources multi times (#2379)|https://github.com/apache/seatunnel/commit/58c64aab2a|2.2.0-beta| |[Improve][Connector-V2] Optimize the code structure (#2380)|https://github.com/apache/seatunnel/commit/7376ec7ab1|2.2.0-beta| |[Imporve][Connector-V2] Remove redundant type judge logic because of pr #2315 (#2370)|https://github.com/apache/seatunnel/commit/42e8c25e50|2.2.0-beta| |[Feature][Connector-V2] Support orc file format in file connector (#2369)|https://github.com/apache/seatunnel/commit/f44fe1e033|2.2.0-beta| |[improve][UT] Upgrade junit to 5.+ (#2305)|https://github.com/apache/seatunnel/commit/362319ff3e|2.2.0-beta| |Replace plain string with constants (#2308)|https://github.com/apache/seatunnel/commit/3c0415e56e|2.2.0-beta| |[Connector-V2] Add parquet writer in file connector (#2273)|https://github.com/apache/seatunnel/commit/c95cc72cfa|2.2.0-beta| |[checkstyle] Improved validation scope of MagicNumber (#2194)|https://github.com/apache/seatunnel/commit/6d08b5f369|2.2.0-beta| 
|[Connector-V2] Add Hive sink connector v2 (#2158)|https://github.com/apache/seatunnel/commit/23ad4ee735|2.2.0-beta| |[Connector-V2] Add File Sink Connector (#2117)|https://github.com/apache/seatunnel/commit/e2283da64f|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-fluss.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-google-firestore.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-google-sheets.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] google sheets options (#8922)|https://github.com/apache/seatunnel/commit/48ede612dc|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][API] Make sure the table name in TablePath not be null (#7252)|https://github.com/apache/seatunnel/commit/764d8b0bc8|2.3.7| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Improve][Connector-V2] Replace CommonErrorCodeDeprecated.JSON_OPERATION_FAILED (#5978)|https://github.com/apache/seatunnel/commit/456cd17714|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][shade][Jackson] Add seatunnel-jackson module (#3947)|https://github.com/apache/seatunnel/commit/5d8862ec9c|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][GoogleSheets] Unified exception for GoogleSheets source connector (#3524)|https://github.com/apache/seatunnel/commit/eb42d629ad|2.3.0| |[Feature][Connector-V2][Google Sheets] Add Google Sheets option rules (#3364)|https://github.com/apache/seatunnel/commit/da33f730ca|2.3.0| |fix: schema get error (#3361)|https://github.com/apache/seatunnel/commit/fdaa85ed24|2.3.0| |[Feature][Connector-V2][GoogleSheets] Support GoogleSheets Source (#3185)|https://github.com/apache/seatunnel/commit/60ecc6428b|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-graphql.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-hbase.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-hive.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][File] Add markdown parser #9714|https://github.com/apache/seatunnel/commit/8b3c07844| dev | |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Feature][connector-hive] hive sink connector support overwrite mode #7843 (#7891)|https://github.com/apache/seatunnel/commit/6fafe6f4d3|2.3.12| |[Fix][Connector-V2] Fix hive client thread unsafe (#9282)|https://github.com/apache/seatunnel/commit/5dc25897a9|2.3.11| |[improve] update file connectors config (#9034)|https://github.com/apache/seatunnel/commit/8041d59dc2|2.3.11| |[Improve] Refactor file enumerator to prevent duplicate put split (#8989)|https://github.com/apache/seatunnel/commit/fdf1beae9c|2.3.11| |Revert " [improve] update localfile connector config" (#9018)|https://github.com/apache/seatunnel/commit/cdc79e13ad|2.3.10| | [improve] update localfile connector config (#8765)|https://github.com/apache/seatunnel/commit/def369a85f|2.3.10| |[Improve][connector-hive] Improved hive file allocation algorithm for subtasks (#8876)|https://github.com/apache/seatunnel/commit/89d1878ade|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Fix][Hive] Writing parquet files supports the optional timestamp int96 (#8509)|https://github.com/apache/seatunnel/commit/856aea1952|2.3.10| |[Fix] Set all snappy dependency use one version (#8423)|https://github.com/apache/seatunnel/commit/3ac977c8d3|2.3.9| |[Fix][Connector-V2] Fix hive krb5 path not work (#8228)|https://github.com/apache/seatunnel/commit/e18a4d07b4|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][File] Support config null format for text file read (#8109)|https://github.com/apache/seatunnel/commit/2dbf02df47|2.3.9| |[Improve][API] Unified 
tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Feature][Core] Rename `result_table_name`/`source_table_name` to `plugin_input/plugin_output` (#8072)|https://github.com/apache/seatunnel/commit/c7bbd322db|2.3.9| |[Feature][E2E] Add hive3 e2e test case (#8003)|https://github.com/apache/seatunnel/commit/9a24fac2c4|2.3.9| |[Improve][Connector-V2] Change File Read/WriteStrategy `setSeaTunnelRowTypeInfo` to `setCatalogTable` (#7829)|https://github.com/apache/seatunnel/commit/6b5f74e524|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Zeta] Split the classloader of task group (#7580)|https://github.com/apache/seatunnel/commit/3be0d1cc61|2.3.8| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Improve][Hive] Close resources when exception occurs (#7205)|https://github.com/apache/seatunnel/commit/561171528b|2.3.6| |[Hotfix][Hive Connector] Fix Hive hdfs-site.xml and hive-site.xml not be load error (#7069)|https://github.com/apache/seatunnel/commit/c23a577f34|2.3.6| |Fix hive load hive_site_path and hdfs_site_path too late (#7017)|https://github.com/apache/seatunnel/commit/e2578a5b4d|2.3.6| |[Bug] [connector-hive] Eanble login with kerberos for hive (#6893)|https://github.com/apache/seatunnel/commit/26e433e472|2.3.6| |[Feature][S3 File] Make S3 File Connector support multiple table write (#6698)|https://github.com/apache/seatunnel/commit/8f2049b2f1|2.3.6| |[Feature] Hive Source/Sink support multiple table (#5929)|https://github.com/apache/seatunnel/commit/4d9287fce4|2.3.6| |[Improve][Hive] udpate hive3 version (#6699)|https://github.com/apache/seatunnel/commit/1184c05c29|2.3.6| |[HiveSink]Fix the risk of resource leakage. 
(#6721)|https://github.com/apache/seatunnel/commit/c23804f13b|2.3.6| |[Improve][Connector-v2] The hive connector support multiple filesystem (#6648)|https://github.com/apache/seatunnel/commit/8a4c01fe35|2.3.6| |[Fix][Connector-V2] Fix add hive partition error when partition already existed (#6577)|https://github.com/apache/seatunnel/commit/2a0a0b9d19|2.3.5| |Fix HiveMetaStoreProxy#enableKerberos will return true if doesn't enable kerberos (#6307)|https://github.com/apache/seatunnel/commit/1dad6f7061|2.3.4| |[Feature][Engine] Unify job env parameters (#6003)|https://github.com/apache/seatunnel/commit/2410ab38f0|2.3.4| |[Refactor][File Connector] Put Multiple Table File API to File Base Module (#6033)|https://github.com/apache/seatunnel/commit/c324d663b4|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Hotfix][Connector-V2][Hive] fix the bug that hive-site.xml can not be injected in HiveConf (#5261)|https://github.com/apache/seatunnel/commit/04ce22ac1e|2.3.4| |[Improve][Connector-v2][HiveSink]remove drop partition when abort. 
(#4940)|https://github.com/apache/seatunnel/commit/edef87b523|2.3.3| |[feature][web] hive add option because web need (#5154)|https://github.com/apache/seatunnel/commit/5e1511ff0d|2.3.3| |[Hotfix][Connector-V2][Hive] Support user-defined hive-site.xml (#4965)|https://github.com/apache/seatunnel/commit/2a064bcdb0|2.3.3| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |[hotfix] fixed schema options import error|https://github.com/apache/seatunnel/commit/656805f2df|2.3.1| |[chore] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/291214ad6f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Imprve][Connector-V2][Hive] Support read text table & Column projection (#4105)|https://github.com/apache/seatunnel/commit/717620f542|2.3.1| |[Hotfix][Connector-V2][Hive] Fix hive unknownhost (#4141)|https://github.com/apache/seatunnel/commit/f1a1dfe4af|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Hive] Support assign partitions (#3842)|https://github.com/apache/seatunnel/commit/6a4a850b4c|2.3.1| |[Improve][Connector-V2][Hive] Improve config check logic (#3886)|https://github.com/apache/seatunnel/commit/b4348f6f44|2.3.1| |[Feature][Connector-V2] Support kerberos in hive and hdfs file connector (#3840)|https://github.com/apache/seatunnel/commit/055ad9d836|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve][Connector-V2] The log outputs detailed exception stack information (#3805)|https://github.com/apache/seatunnel/commit/d0c6217f27|2.3.1| |[Feature][Shade] Add seatunnel hadoop3 uber (#3755)|https://github.com/apache/seatunnel/commit/5a024bdf8f|2.3.0| |[Feature][Connector-V2][File] Optimize filesystem utils (#3749)|https://github.com/apache/seatunnel/commit/ac4e880fb5|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Hotfix][Connector-V2][Hive] Fix npe of getting file system (#3506)|https://github.com/apache/seatunnel/commit/e1fc3d1b01|2.3.0| |[Improve][Connector-V2][Hive] Unified exceptions for hive source & sink connector (#3541)|https://github.com/apache/seatunnel/commit/12c0fb91d2|2.3.0| |[Feature][Connector-V2][File] Add option and factory for file connectors (#3375)|https://github.com/apache/seatunnel/commit/db286e8631|2.3.0| |[Hotfix][Connector-V2][Hive] Fix the bug that when write data to hive throws NullPointerException (#3258)|https://github.com/apache/seatunnel/commit/777bf6b42e|2.3.0| |[Improve][Connector-V2][Hive] Hive Sink Support msck partitions (#3133)|https://github.com/apache/seatunnel/commit/a8738ef3c4|2.3.0-beta| |unify `flatten-maven-plugin` version (#3078)|https://github.com/apache/seatunnel/commit/ed743fddcc|2.3.0-beta| |[Engine][Merge] fix merge 
problem|https://github.com/apache/seatunnel/commit/0e9ceeefc9|2.3.0-beta| |Merge remote-tracking branch 'upstream/dev' into st-engine|https://github.com/apache/seatunnel/commit/ca80df779a|2.3.0-beta| |update hive.metastore.version to hive.exec.version (#2879)|https://github.com/apache/seatunnel/commit/018ee0a3db|2.2.0-beta| |[Bug][Connector-V2] Fix hive sink bug (#2870)|https://github.com/apache/seatunnel/commit/d661fa011e|2.2.0-beta| |[Fix][Connector-V2] Fix HiveSource Connector read orc table error (#2845)|https://github.com/apache/seatunnel/commit/61720306e7|2.2.0-beta| |[Bug][Connector-V2] Fix hive source text table name (#2797)|https://github.com/apache/seatunnel/commit/563637ebd1|2.2.0-beta| |[Improve][Connector-V2] Refactor hive source & sink connector (#2708)|https://github.com/apache/seatunnel/commit/a357dca365|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706) (#2731)|https://github.com/apache/seatunnel/commit/e8929ab605|2.3.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Improve][Connector-V2] Refactor the package of hdfs file connector (#2402)|https://github.com/apache/seatunnel/commit/87d0624c5b|2.2.0-beta| |[Feature][Connector-V2] Add orc file support in connector hive sink (#2311) (#2374)|https://github.com/apache/seatunnel/commit/81cb80c050|2.2.0-beta| |[improve][UT] Upgrade junit to 5.+ (#2305)|https://github.com/apache/seatunnel/commit/362319ff3e|2.2.0-beta| |Decide table format using outputFormat in HiveSinkConfig #2303|https://github.com/apache/seatunnel/commit/3a2586f6dc|2.2.0-beta| |[Feature][Connector-V2-Hive] Add parquet file format support to Hive Sink (#2310)|https://github.com/apache/seatunnel/commit/4ab3c21b8d|2.2.0-beta| |Add BaseHiveCommitInfo for common hive 
commit info (#2306)|https://github.com/apache/seatunnel/commit/0d2f6f4d7c|2.2.0-beta| |Remove same code to independent method in HiveSinkWriter (#2307)|https://github.com/apache/seatunnel/commit/e99e6ee726|2.2.0-beta| |Avoid potential null pointer risk in HiveSinkWriter#snapshotState (#2302)|https://github.com/apache/seatunnel/commit/e7d817f7d2|2.2.0-beta| |[Connector-V2] Add file type check logic in hive connector (#2275)|https://github.com/apache/seatunnel/commit/5488337c67|2.2.0-beta| |[Connector-V2] Add parquet file reader for Hive Source Connector (#2199) (#2237)|https://github.com/apache/seatunnel/commit/59db97ed34|2.2.0-beta| |Merge from dev to st-engine (#2243)|https://github.com/apache/seatunnel/commit/41e530afd5|2.3.0-beta| |StateT of SeaTunnelSource should extend `Serializable` (#2214)|https://github.com/apache/seatunnel/commit/8c426ef850|2.2.0-beta| |[Bug][connector-hive] filter '_SUCCESS' file in file list (#2235) (#2236)|https://github.com/apache/seatunnel/commit/db04651523|2.2.0-beta| |[Bug][hive-connector-v2] Resolve the schema inconsistency bug (#2229) (#2230)|https://github.com/apache/seatunnel/commit/62ca075915|2.2.0-beta| |[Bug][spark-connector-v2-example] fix the bug of no class found. (#2191) (#2192)|https://github.com/apache/seatunnel/commit/5dbc2df17e|2.2.0-beta| |[Connector-V2] Add Hive sink connector v2 (#2158)|https://github.com/apache/seatunnel/commit/23ad4ee735|2.2.0-beta| |[Connector-V2] Add File Sink Connector (#2117)|https://github.com/apache/seatunnel/commit/e2283da64f|2.2.0-beta| |[Connector-V2]Hive Source (#2123)|https://github.com/apache/seatunnel/commit/ffcf3f59e2|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-http-airtable.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- |
    ================================================ FILE: docs/zh/connectors/changelog/connector-http-base.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Fix][Connectors-v2] Fix UT for connector-http (#9821)|https://github.com/apache/seatunnel/commit/2653f6798e| dev | |[Fix][connector-http] fix parsing httpjson, the number of two fields is inconsistent with the import failure (#9103)|https://github.com/apache/seatunnel/commit/c8ade098ee|2.3.12| |[Fix][Connector-HTTP] Add default content-type when user not set (#9497)|https://github.com/apache/seatunnel/commit/8da0a78c1d|2.3.12| |[Bug][connector-http] Fix paging request running infinitely (#9504)|https://github.com/apache/seatunnel/commit/1844e04c97|2.3.12| |[Bug] [seatunnel-connector-http-base] An NPE (NullPointerException) will occur when the pageField is null (#9498)|https://github.com/apache/seatunnel/commit/b898a3225c|2.3.12| |[Fix][Connector-Http] fix Invalid mime type (#9363)|https://github.com/apache/seatunnel/commit/4d7d765a26|2.3.12| |[Feature][http-Sink] Implementing http batch writes (#9292)|https://github.com/apache/seatunnel/commit/04ee8aca04|2.3.11| |[Feature][connector-http] Parameters support placeholder replacement (#9184)|https://github.com/apache/seatunnel/commit/8617014edc|2.3.11| |[Improve][Connector-V2][Http] Supports Cursor-based Pagination (#9109) (#9138)|https://github.com/apache/seatunnel/commit/879b1e2d5b|2.3.11| |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Fix][connector-http] fix when post have param (#8434)|https://github.com/apache/seatunnel/commit/c1b2675ab0|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Connector-V2] Add prometheus source and sink (#7265)|https://github.com/apache/seatunnel/commit/dde6f9fcbd|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes 
(#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connector-V2] Fix http source can not read streaming (#7703)|https://github.com/apache/seatunnel/commit/a0ffa7ba02|2.3.8| |[Feature][Connector-V2] Suport choose the start page in http paging (#7180)|https://github.com/apache/seatunnel/commit/ed15f0dcf9|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Improve][API] Make sure the table name in TablePath not be null (#7252)|https://github.com/apache/seatunnel/commit/764d8b0bc8|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Improve][CDC] Close idle subtasks gorup(reader/writer) in increment phase (#6526)|https://github.com/apache/seatunnel/commit/454c339b9c|2.3.6| |Fix HttpSource bug (#6824)|https://github.com/apache/seatunnel/commit/c3ab84caa4|2.3.6| |[Hotfix] fix http source can not read yyyy-MM-dd HH:mm:ss format bug & Improve DateTime Utils (#6601)|https://github.com/apache/seatunnel/commit/19888e7969|2.3.5| |[Improve][Connector-V2]Support multi-table sink feature for httpsink (#6316)|https://github.com/apache/seatunnel/commit/e6c51a95c7|2.3.5| |[Improve][HttpConnector]Increase custom configuration timeout. 
(#6223)|https://github.com/apache/seatunnel/commit/fa5b7d3d83|2.3.4| |[Feature][Core] Upgrade flink source translation (#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[BUG][Connector-V2][Http] fix bug http config no schema option and improve e2e test add case (#5939)|https://github.com/apache/seatunnel/commit/8a71b9e072|2.3.4| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on redis (#5901)|https://github.com/apache/seatunnel/commit/e84dcb8c10|2.3.4| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Feature][Transform] add JsonPath transform (#5632)|https://github.com/apache/seatunnel/commit/d908f0af40|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Feature][Connector-V2] HTTP supports page increase #5477 (#5561)|https://github.com/apache/seatunnel/commit/bb180b2988|2.3.4| |[improve][Connector-V2][http] improve http e2e test (#5655)|https://github.com/apache/seatunnel/commit/f5867adcaa|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[BUG][Connector-V2][http] fix httpheader cover (#5446)|https://github.com/apache/seatunnel/commit/cdd8e0a65e|2.3.4| |[Feature][Connector][Http] Support multi-line text splits (#4698)|https://github.com/apache/seatunnel/commit/6a524981cb|2.3.2| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] 
Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Hotfix] [seatunnel-connectors-v2] [connector-http] fix http json request error (#3629)|https://github.com/apache/seatunnel/commit/54f594d6ca|2.3.0| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Improve][Connector-V2][Http]Unified exception for http source & sink… (#3594)|https://github.com/apache/seatunnel/commit/d798cd8670|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2][Lemlist]Add Lemlist source connector (#3346)|https://github.com/apache/seatunnel/commit/12d66b4247|2.3.0| |[Improve][Connector-V2][My Hours]Add http method enum && Improve My Hours connector option rule (#3390)|https://github.com/apache/seatunnel/commit/a86c9d90f7|2.3.0| |[Feature][Connector-V2][Http] Add option rules && Improve Myhours sink connector (#3351)|https://github.com/apache/seatunnel/commit/cc8bb60c83|2.3.0| |[Feature][Connector-V2][My Hours] Add My Hours Source Connector (#3228)|https://github.com/apache/seatunnel/commit/4104a3e30e|2.3.0| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Bug][format][json] Fix jackson package conflict with spark (#2934)|https://github.com/apache/seatunnel/commit/1a92b8369b|2.3.0-beta| |[Bug][Connector-V2] Fix wechat sink data serialization 
(#2856)|https://github.com/apache/seatunnel/commit/3aee11fc16|2.3.0-beta| |[Improve][Connector-V2] Improve http connector (#2833)|https://github.com/apache/seatunnel/commit/5b3957bc52|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Improve][build] Improved scope of maven-shade-plugin (#2665)|https://github.com/apache/seatunnel/commit/93bc8bd116|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Bug][Connector-V2] Fix the bug that set params by mistake (#2511) (#2513)|https://github.com/apache/seatunnel/commit/ead3d68b0e|2.2.0-beta| |[Improve][Connector-V2] Http source support user-defined schema (#2439)|https://github.com/apache/seatunnel/commit/793933b6b8|2.2.0-beta| |[Improve][Connector-V2] Format SeaTunnelRow use seatunnel-format-json (#2435)|https://github.com/apache/seatunnel/commit/e4e8f7fbff|2.2.0-beta| |[Improve][Connector-V2] Make the attribute of http-connector from private to protected (#2418)|https://github.com/apache/seatunnel/commit/f3b00ef696|2.2.0-beta| |[Feature][Connector-V2] Add feishu sink (#2381)|https://github.com/apache/seatunnel/commit/0fec8ca438|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-http-feishu.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-http-github.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-http-gitlab.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Gitlab] Unified excetion for Gitlab connector and improve optione rule (#3533)|https://github.com/apache/seatunnel/commit/77f68f1eef|2.3.0| |[Feature][Connector V2] add gitlab source connector (#3408)|https://github.com/apache/seatunnel/commit/545595c6d2|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-http-jira.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2][Jira]Add Jira source connector (#3473)|https://github.com/apache/seatunnel/commit/fb40162c07|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-http-klaviyo.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Klaviyo]Unified exception for Klaviyo connector (#3555)|https://github.com/apache/seatunnel/commit/08f8615078|2.3.0| |[Feature][Connector-V2][Klaviyo]Add Klaviyo source connector (#3443)|https://github.com/apache/seatunnel/commit/fc00a2866b|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-http-lemlist.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Lemlist] Unified exception for lemlist connector (#3534)|https://github.com/apache/seatunnel/commit/705728ebbb|2.3.0| |[Feature][Connector-V2][Lemlist]Add Lemlist source connector (#3346)|https://github.com/apache/seatunnel/commit/12d66b4247|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-http-myhours.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][connector-http] Parameters support placeholder replacement (#9184)|https://github.com/apache/seatunnel/commit/8617014edc|2.3.11| |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Fix][connector-http] fix when post have param (#8434)|https://github.com/apache/seatunnel/commit/c1b2675ab0|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][MyHours]Unified exception for MyHours connector (#3538)|https://github.com/apache/seatunnel/commit/48ab7c97d5|2.3.0| |[HotFix][Core][API] Fix OptionValidation error code (#3439)|https://github.com/apache/seatunnel/commit/ace219f376|2.3.0| |[Improve][Connector-V2][My Hours]Add http method enum && Improve My Hours connector option rule (#3390)|https://github.com/apache/seatunnel/commit/a86c9d90f7|2.3.0| |[Feature][Connector-V2][Http] Add option rules && Improve Myhours sink connector (#3351)|https://github.com/apache/seatunnel/commit/cc8bb60c83|2.3.0| |[Feature][Connector-V2][My Hours] Add My Hours Source Connector 
(#3228)|https://github.com/apache/seatunnel/commit/4104a3e30e|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-http-notion.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-http-onesignal.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Improve][Connector-V2][OneSignal]Unified exception for OneSignal connector (#3609)|https://github.com/apache/seatunnel/commit/97cce8c255|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2][OneSignal]Add OneSignal source conector (#3454)|https://github.com/apache/seatunnel/commit/b318b3166f|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-http-persistiq.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-http-wechat.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2][Http] Add option rules && Improve Myhours sink connector (#3351)|https://github.com/apache/seatunnel/commit/cc8bb60c83|2.3.0| |[Bug][Connector-V2] Fix wechat sink data serialization (#2856)|https://github.com/apache/seatunnel/commit/3aee11fc16|2.3.0-beta| | [Feature][Connector-V2] Add Enterprise Wechat sink connector (#2412)|https://github.com/apache/seatunnel/commit/3e200e0a38|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-http.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Fix][Connectors-v2] Fix UT for connector-http (#9821)|https://github.com/apache/seatunnel/commit/2653f6798e| dev | |[Fix][connector-http] fix parsing httpjson, the number of two fields is inconsistent with the import failure (#9103)|https://github.com/apache/seatunnel/commit/c8ade098ee|2.3.12| |[Fix][Connector-HTTP] Add default content-type when user not set (#9497)|https://github.com/apache/seatunnel/commit/8da0a78c1d|2.3.12| |[Bug][connector-http] Fix paging request running infinitely (#9504)|https://github.com/apache/seatunnel/commit/1844e04c97|2.3.12| |[Bug] [seatunnel-connector-http-base] An NPE (NullPointerException) will occur when the pageField is null (#9498)|https://github.com/apache/seatunnel/commit/b898a3225c|2.3.12| |[Fix][Connector-Http] fix Invalid mime type (#9363)|https://github.com/apache/seatunnel/commit/4d7d765a26|2.3.12| |[Feature][http-Sink] Implementing http batch writes (#9292)|https://github.com/apache/seatunnel/commit/04ee8aca04|2.3.11| |[Feature][connector-http] Parameters support placeholder replacement (#9184)|https://github.com/apache/seatunnel/commit/8617014edc|2.3.11| |[Improve][Connector-V2][Http] Supports Cursor-based Pagination (#9109) (#9138)|https://github.com/apache/seatunnel/commit/879b1e2d5b|2.3.11| |[improve] http connector options (#8969)|https://github.com/apache/seatunnel/commit/63ff9f910a|2.3.10| |[Fix][connector-http] fix when post have param (#8434)|https://github.com/apache/seatunnel/commit/c1b2675ab0|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Connector-V2] Add prometheus source and sink (#7265)|https://github.com/apache/seatunnel/commit/dde6f9fcbd|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes 
(#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connector-V2] Fix http source can not read streaming (#7703)|https://github.com/apache/seatunnel/commit/a0ffa7ba02|2.3.8| |[Feature][Connector-V2] Suport choose the start page in http paging (#7180)|https://github.com/apache/seatunnel/commit/ed15f0dcf9|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Improve][API] Make sure the table name in TablePath not be null (#7252)|https://github.com/apache/seatunnel/commit/764d8b0bc8|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Improve][CDC] Close idle subtasks gorup(reader/writer) in increment phase (#6526)|https://github.com/apache/seatunnel/commit/454c339b9c|2.3.6| |Fix HttpSource bug (#6824)|https://github.com/apache/seatunnel/commit/c3ab84caa4|2.3.6| |[Hotfix] fix http source can not read yyyy-MM-dd HH:mm:ss format bug & Improve DateTime Utils (#6601)|https://github.com/apache/seatunnel/commit/19888e7969|2.3.5| |[Improve][Connector-V2]Support multi-table sink feature for httpsink (#6316)|https://github.com/apache/seatunnel/commit/e6c51a95c7|2.3.5| |[Improve][HttpConnector]Increase custom configuration timeout. 
(#6223)|https://github.com/apache/seatunnel/commit/fa5b7d3d83|2.3.4| |[Feature][Core] Upgrade flink source translation (#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[BUG][Connector-V2][Http] fix bug http config no schema option and improve e2e test add case (#5939)|https://github.com/apache/seatunnel/commit/8a71b9e072|2.3.4| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on redis (#5901)|https://github.com/apache/seatunnel/commit/e84dcb8c10|2.3.4| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on http (#5816)|https://github.com/apache/seatunnel/commit/6f49ec6ead|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Feature][Transform] add JsonPath transform (#5632)|https://github.com/apache/seatunnel/commit/d908f0af40|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Feature][Connector-V2] HTTP supports page increase #5477 (#5561)|https://github.com/apache/seatunnel/commit/bb180b2988|2.3.4| |[improve][Connector-V2][http] improve http e2e test (#5655)|https://github.com/apache/seatunnel/commit/f5867adcaa|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[BUG][Connector-V2][http] fix httpheader cover (#5446)|https://github.com/apache/seatunnel/commit/cdd8e0a65e|2.3.4| |[Feature][Connector][Http] Support multi-line text splits (#4698)|https://github.com/apache/seatunnel/commit/6a524981cb|2.3.2| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Feature][Connector-V2][Github] Adding Github Source Connector 
(#4155)|https://github.com/apache/seatunnel/commit/49d9172b10|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-V2][Persistiq]Add Persistiq source connector (#3460)|https://github.com/apache/seatunnel/commit/aec3912edf|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][Connector-V2][Notion] Add Notion source connector (#3470)|https://github.com/apache/seatunnel/commit/46abc6d943|2.3.0| |[Hotfix] [seatunnel-connectors-v2] [connector-http] fix http json request error (#3629)|https://github.com/apache/seatunnel/commit/54f594d6ca|2.3.0| |[Improve][Connector-V2][Http]Improve json parse option rule for all http connector (#3627)|https://github.com/apache/seatunnel/commit/589e4161ec|2.3.0| |[Improve][Connector-V2][OneSignal]Unified exception for OneSignal connector (#3609)|https://github.com/apache/seatunnel/commit/97cce8c255|2.3.0| |[Feature][Connector-V2][HTTP] Use json-path parsing (#3510)|https://github.com/apache/seatunnel/commit/1807eb6c95|2.3.0| |[Improve][Connector-V2][Http]Unified exception for http source & sink… (#3594)|https://github.com/apache/seatunnel/commit/d798cd8670|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][MyHours]Unified exception for MyHours connector (#3538)|https://github.com/apache/seatunnel/commit/48ab7c97d5|2.3.0| |[Improve][Connector-V2][Gitlab] Unified excetion for Gitlab connector and improve optione rule (#3533)|https://github.com/apache/seatunnel/commit/77f68f1eef|2.3.0| 
|[Improve][Connector-V2][Klaviyo]Unified exception for Klaviyo connector (#3555)|https://github.com/apache/seatunnel/commit/08f8615078|2.3.0| |[Feature][Connector-V2][Jira]Add Jira source connector (#3473)|https://github.com/apache/seatunnel/commit/fb40162c07|2.3.0| |[Improve][Connector-V2][Lemlist] Unified exception for lemlist connector (#3534)|https://github.com/apache/seatunnel/commit/705728ebbb|2.3.0| |[Feature][Connector V2] add gitlab source connector (#3408)|https://github.com/apache/seatunnel/commit/545595c6d2|2.3.0| |[Feature][Connector-V2][OneSignal]Add OneSignal source conector (#3454)|https://github.com/apache/seatunnel/commit/b318b3166f|2.3.0| |[Feature][Connector-V2][Klaviyo]Add Klaviyo source connector (#3443)|https://github.com/apache/seatunnel/commit/fc00a2866b|2.3.0| |[Feature][Connector-V2][Lemlist]Add Lemlist source connector (#3346)|https://github.com/apache/seatunnel/commit/12d66b4247|2.3.0| |[HotFix][Core][API] Fix OptionValidation error code (#3439)|https://github.com/apache/seatunnel/commit/ace219f376|2.3.0| |[Improve][Connector-V2][My Hours]Add http method enum && Improve My Hours connector option rule (#3390)|https://github.com/apache/seatunnel/commit/a86c9d90f7|2.3.0| |[Feature][Connector-V2][Http] Add option rules && Improve Myhours sink connector (#3351)|https://github.com/apache/seatunnel/commit/cc8bb60c83|2.3.0| |[Feature][Connector-V2][My Hours] Add My Hours Source Connector (#3228)|https://github.com/apache/seatunnel/commit/4104a3e30e|2.3.0| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Bug][format][json] Fix jackson package conflict with spark (#2934)|https://github.com/apache/seatunnel/commit/1a92b8369b|2.3.0-beta| |[Bug][Connector-V2] Fix wechat sink data serialization (#2856)|https://github.com/apache/seatunnel/commit/3aee11fc16|2.3.0-beta| |[Improve][Connector-V2] Improve http connector (#2833)|https://github.com/apache/seatunnel/commit/5b3957bc52|2.2.0-beta| 
|[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Improve][build] Improved scope of maven-shade-plugin (#2665)|https://github.com/apache/seatunnel/commit/93bc8bd116|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Bug][Connector-V2] Fix the bug that set params by mistake (#2511) (#2513)|https://github.com/apache/seatunnel/commit/ead3d68b0e|2.2.0-beta| |[Improve][Connector-V2] Http source support user-defined schema (#2439)|https://github.com/apache/seatunnel/commit/793933b6b8|2.2.0-beta| | [Feature][Connector-V2] Add Enterprise Wechat sink connector (#2412)|https://github.com/apache/seatunnel/commit/3e200e0a38|2.2.0-beta| |[Improve][Connector-V2] Format SeaTunnelRow use seatunnel-format-json (#2435)|https://github.com/apache/seatunnel/commit/e4e8f7fbff|2.2.0-beta| |[Improve][Connector-V2] Make the attribute of http-connector from private to protected (#2418)|https://github.com/apache/seatunnel/commit/f3b00ef696|2.2.0-beta| |[Feature][Connector-V2] Add feishu sink (#2381)|https://github.com/apache/seatunnel/commit/0fec8ca438|2.2.0-beta| |[Feature][Connector-V2] Add http sink(Webhook) (#2348)|https://github.com/apache/seatunnel/commit/4b7207490a|2.2.0-beta| |[Improve][Http Connector-V2-Source] Refactor the code and make code more clearly (#2322)|https://github.com/apache/seatunnel/commit/a9a797ad85|2.2.0-beta| |[Improve][Connector-V2] Fix the log information (#2317)|https://github.com/apache/seatunnel/commit/736983a708|2.2.0-beta| |[Improve][Connector-V2] Http client provider improve (#2312)|https://github.com/apache/seatunnel/commit/cc950007c8|2.2.0-beta| |[Improve][Connector-V2] Fix 'Singleton' word error 
(#2309)|https://github.com/apache/seatunnel/commit/12ebcb4a0d|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-hudi.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-hugegraph.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-iceberg.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Chore] fix typos filed -> field (#9757)|https://github.com/apache/seatunnel/commit/e3e1c67d29|2.3.12| |[Improve][Core] Unify the aws-sdk-v2 version to 2.31.30 (#9698)|https://github.com/apache/seatunnel/commit/41c251cc8a|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Bug] [Connector-V2] Fix the issue of writing the ORC format Iceberg report "Illegal provider-class name" (#6754) (#9588)|https://github.com/apache/seatunnel/commit/74b193dd5a|2.3.12| |[Bug] [Connector-V2] Updates Iceberg version to 1.6.1 (#9387) (#9451)|https://github.com/apache/seatunnel/commit/7b92a6c5c1|2.3.12| |[Fix][Connector-Iceberg] Fix Time Zone Issue for Iceberg Timestamp Type (#9460)|https://github.com/apache/seatunnel/commit/60cd497610|2.3.12| |[Feature][Connector-V2] Iceberg add glue catalog support (#9247)|https://github.com/apache/seatunnel/commit/ecff2e8618|2.3.11| |[Improve] Remove useless iceberg sink config `iceberg.table.config` (#9307)|https://github.com/apache/seatunnel/commit/fbdf39ebf2|2.3.11| |[Improve][connector-iceberg] fix schema change event (#9217)|https://github.com/apache/seatunnel/commit/56669095b7|2.3.11| |[Feature][Transform] Support define sink column type (#9114)|https://github.com/apache/seatunnel/commit/ab7119e507|2.3.11| |[Feat][Connector-v2][Iceberg]support filter conditions in iceberg source (#9095)|https://github.com/apache/seatunnel/commit/0eb72780ee|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][API] Fixed not invoke the `SinkAggregatedCommitter`'s init method (#9070)|https://github.com/apache/seatunnel/commit/df0d11d632|2.3.11| |[Improve] iceberg options (#8967)|https://github.com/apache/seatunnel/commit/82a374ec87|2.3.10| |[Improve] 
restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Iceberg] Support read multi-table (#8524)|https://github.com/apache/seatunnel/commit/2bfb97e502|2.3.10| |[Improve][Iceberg] Filter catalog table primaryKey is empty (#8413)|https://github.com/apache/seatunnel/commit/857aab5e83|2.3.9| |[Improve][Connector-V2] Reduce the create times of iceberg sink writer (#8155)|https://github.com/apache/seatunnel/commit/45a7a715a2|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Iceberg] Support custom delete sql for sink savemode (#8094)|https://github.com/apache/seatunnel/commit/29ca928c36|2.3.9| |[Improve][Connector-V2] Reduce the request times of iceberg load table (#8149)|https://github.com/apache/seatunnel/commit/555f5eb404|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Improve][Iceberg] Support table comment for catalog (#7936)|https://github.com/apache/seatunnel/commit/72ab38f317|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connector-V2] Fix iceberg throw java: package sun.security.krb5 does not exist when use jdk 11 (#7734)|https://github.com/apache/seatunnel/commit/116af4febc|2.3.8| |[Hotfix][Connector-V2] Release resources when task is closed for iceberg sinkwriter (#7729)|https://github.com/apache/seatunnel/commit/ff281183bd|2.3.8| |[Fix][Connector-V2] Fixed iceberg sink can not handle uppercase fields (#7660)|https://github.com/apache/seatunnel/commit/b7be0cb4a1|2.3.8| |[Hotfix][CDC] Fix ddl duplicate execution error when config multi_table_sink_replica (#7634)|https://github.com/apache/seatunnel/commit/23ab3edbbb|2.3.8| |[Improve][Iceberg] Add savemode create table primaryKey testcase 
(#7641)|https://github.com/apache/seatunnel/commit/6b36f90f4d|2.3.8| |[Hotfix] Fix iceberg missing column comment when savemode create table (#7608)|https://github.com/apache/seatunnel/commit/b35bd94bfb|2.3.8| |[Improve][Connector-V2] Remove hard code iceberg table format version (#7500)|https://github.com/apache/seatunnel/commit/f49b263e65|2.3.8| |[Improve][API] Move catalog open to SaveModeHandler (#7439)|https://github.com/apache/seatunnel/commit/8c2c5c79a1|2.3.8| |[Feature][Connector-V2][Iceberg] Support Iceberg Kerberos (#7246)|https://github.com/apache/seatunnel/commit/e3001207c8|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Bug][Connector-Iceberg]fix create iceberg v2 table with pks (#6895)|https://github.com/apache/seatunnel/commit/40d2c1b213|2.3.6| |[Feature][Connector-V2] Iceberg-sink supports writing data to branches (#6697)|https://github.com/apache/seatunnel/commit/e3103535cc|2.3.6| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Improve] Add SaveMode log of process detail (#6375)|https://github.com/apache/seatunnel/commit/b0d70ce224|2.3.5| |[Improve][Zeta] Add classloader cache mode to fix metaspace leak (#6355)|https://github.com/apache/seatunnel/commit/9c3c2f183d|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Feature] Supports iceberg sink #6198 (#6265)|https://github.com/apache/seatunnel/commit/18d3e86194|2.3.5| |[Test][E2E] Add thread leak check for connector (#5773)|https://github.com/apache/seatunnel/commit/1f2f3fc5f0|2.3.4| |[Improve][Common] Introduce new error define rule 
(#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[BUG][Connector-V2] Iceberg source lost data with parallelism option (#5732)|https://github.com/apache/seatunnel/commit/7f3b4be075|2.3.4| |[Dependency]Bump org.apache.avro:avro in /seatunnel-connectors-v2/connector-iceberg (#5582)|https://github.com/apache/seatunnel/commit/13753a927b|2.3.4| |[Improve][Pom] Add junit4 to the root pom (#5611)|https://github.com/apache/seatunnel/commit/7b4f7db2a2|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Doc][Iceberg] Improved iceberg documentation (#5335)|https://github.com/apache/seatunnel/commit/659a68a0be|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Hotfix][Connector][Iceberg] Fix iceberg source stream mode init error (#4638)|https://github.com/apache/seatunnel/commit/64760eed4d|2.3.2| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Improve][SourceConnector] Unifie Iceberg source fields to schema (#3959)|https://github.com/apache/seatunnel/commit/20e1255fab|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][Connector-V2][Iceberg] Unified exception for iceberg source connector (#3677)|https://github.com/apache/seatunnel/commit/e24843515f|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2][Iceberg] Modify the scope of flink-shaded-hadoop-2 to provided to be compatible with hadoop3.x (#3046)|https://github.com/apache/seatunnel/commit/b38c50789f|2.3.0| |[Feature][Connector V2] expose configurable options in Iceberg (#3394)|https://github.com/apache/seatunnel/commit/bd9a313ded|2.3.0| |[Improve][Connector][Iceberg] Improve code. (#3065)|https://github.com/apache/seatunnel/commit/9f38e3da74|2.3.0-beta| |[Code-Improve][Iceberg] Use automatic resource management to replace 'try - finally' code block. (#2909)|https://github.com/apache/seatunnel/commit/b7f640724b|2.3.0-beta| |[Feature][Connector-V2] Add iceberg source connector (#2615)|https://github.com/apache/seatunnel/commit/ffc6088a79|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-influxdb.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve] influxdb options (#8966)|https://github.com/apache/seatunnel/commit/9f498b8133|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Improve some connectors prepare check error message (#7465)|https://github.com/apache/seatunnel/commit/6930a25edd|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |Support multi-table sink feature for influxdb (#6278)|https://github.com/apache/seatunnel/commit/56f13e920d|2.3.5| |[Improve][Zeta] Add classloader cache mode to fix metaspace leak (#6355)|https://github.com/apache/seatunnel/commit/9c3c2f183d|2.3.5| |[Test][E2E] Add thread leak check for connector (#5773)|https://github.com/apache/seatunnel/commit/1f2f3fc5f0|2.3.4| |[BugFix] [InfluxDBSource] Resolve invalid SQL in initColumnsIndex method caused by direct QUERY_LIMIT appendage with 'tz' function. 
(#4829)|https://github.com/apache/seatunnel/commit/deed9c62c3|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve] [Connector-V2] Remove scheduler in InfluxDB sink (#5271)|https://github.com/apache/seatunnel/commit/f459f500cb|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][SourceConnector] Unifie InfluxDB source fields to schema (#3897)|https://github.com/apache/seatunnel/commit/85a984a64f|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Influxdb] Unified exception for influxdb source & sink connector (#3558)|https://github.com/apache/seatunnel/commit/4686f35d68|2.3.0| |[Feature][Connector][influx] Expose configurable options in influx db (#3392)|https://github.com/apache/seatunnel/commit/b247ff0aef|2.3.0| |[Feature][Connector-V2] influxdb sink connector (#3174)|https://github.com/apache/seatunnel/commit/630e884791|2.3.0| |[Feature][Connector-V2] Add influxDB connector source (#2697)|https://github.com/apache/seatunnel/commit/1d70ea3084|2.3.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-iotdb.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[improve] iotdb options (#8965)|https://github.com/apache/seatunnel/commit/6e073935f4|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Doc] update iotdb document (#5404)|https://github.com/apache/seatunnel/commit/856aedb3c9|2.3.4| |[Improve] [Connector-V2] Remove scheduler in IoTDB sink (#5270)|https://github.com/apache/seatunnel/commit/299637868c|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Improve][SourceConnector] Unified schema parameter, update IoTDB sou… (#3896)|https://github.com/apache/seatunnel/commit/a0959c5fd1|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Iotdb] Unified exception for iotdb source & sink connector (#3557)|https://github.com/apache/seatunnel/commit/7353fed6d6|2.3.0| |[Feature][Connector V2] expose configurable options in IoTDB (#3387)|https://github.com/apache/seatunnel/commit/06359ea76a|2.3.0| |[Improve][Connector-V2][IotDB]Add IotDB sink parameter check (#3412)|https://github.com/apache/seatunnel/commit/91240a3dcb|2.3.0| |[Bug][Connector-v2] Fix IoTDB connector sink NPE (#3080)|https://github.com/apache/seatunnel/commit/e5edf02433|2.3.0-beta| |[Imporve][Connector-V2] Imporve iotdb connector (#2917)|https://github.com/apache/seatunnel/commit/3da11ce19b|2.3.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[chore][connector-common] Rename SeatunnelSchema to SeaTunnelSchema (#2538)|https://github.com/apache/seatunnel/commit/7dc2a27388|2.2.0-beta| |[Connectors-V2]Support IoTDB Source (#2431)|https://github.com/apache/seatunnel/commit/7b78d6c922|2.2.0-beta| |[Feature][Connector-V2] Support IoTDB sink (#2407)|https://github.com/apache/seatunnel/commit/c1bbbd59d5|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-jdbc.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Fix][Connector-xugu] Fix several bugs in the xugu connector (#9820)|https://github.com/apache/seatunnel/commit/75c9adb280| dev | |[Feature][Transform-V2] Support `AT TIME ZONE` statement for sql transform (#9784)|https://github.com/apache/seatunnel/commit/ad5278c5bb| dev | |[Feature][Transform-V2] Support vector series sql function (#9765)|https://github.com/apache/seatunnel/commit/a40114cf7a|2.3.12| |[Chore] fix typos filed -> field (#9757)|https://github.com/apache/seatunnel/commit/e3e1c67d29|2.3.12| |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[Improve][Core] Update apache common to apache common lang3 (#9694)|https://github.com/apache/seatunnel/commit/6e5737c1ec|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Fix] [connector-jdbc] prevent precision loss in Float to BigDecimal conversion (#9670)|https://github.com/apache/seatunnel/commit/6e11285bf6|2.3.12| |[Fix][Connector-Jdbc] Supports reading and writing Postgres network dress types (#9618)|https://github.com/apache/seatunnel/commit/3dc79c1ddf|2.3.12| |[improve] jdbc options (#9541)|https://github.com/apache/seatunnel/commit/d041e5fb32|2.3.12| |[Fix][Connector-Jdbc]Fixed Vertica data source cannot upsert data. 
(#9607)|https://github.com/apache/seatunnel/commit/7b4d05171b|2.3.12| |[Fix][Connectors-Jdbc] Postgres supports streaming and batch reading and writing of the `interval` data type (#9590)|https://github.com/apache/seatunnel/commit/58ab917024|2.3.12| |[Feature][Connectors-v2] Optimize the size of CDC JAR Files (#9546)|https://github.com/apache/seatunnel/commit/1dd19c6823|2.3.12| |[improve][Connector-jdbc] add comments when schema not include all columns (#9559)|https://github.com/apache/seatunnel/commit/02d2b69d85|2.3.12| |[Hotfix][Connector-Jdbc] Write MySQL to support set collection data type (#9553)|https://github.com/apache/seatunnel/commit/3836c97a62|2.3.12| |[Feature][Jdbc] Support read multiple tables by regular expressions (#9380)|https://github.com/apache/seatunnel/commit/670a52a918|2.3.12| |[bugfix][Connector-V2] Fixed the load driver inaccurate situation (#9468)|https://github.com/apache/seatunnel/commit/c6639e81fe|2.3.12| |[Fix][Connector-V2] Fix OceanBase Oracle create unsupported data type (#9383)|https://github.com/apache/seatunnel/commit/f4178c72f1|2.3.12| |[improve][Connector-V2] delete jdbc param support_upsert_by_query_primary_key_exist (#9408)|https://github.com/apache/seatunnel/commit/d247fe1d8d|2.3.12| |[Feature][Connector-V2] Jdbc mysql support read tinyint(1) to byte(tinyint) (#9373)|https://github.com/apache/seatunnel/commit/7b87aa6f12|2.3.12| |[Improve] JdbcInputFormat nextRecord Exception throw TableId (#9374)|https://github.com/apache/seatunnel/commit/484aef593d|2.3.12| |[Feature][Connector-V2][JDBC] Add presto/trino dialect (#9388)|https://github.com/apache/seatunnel/commit/3cac2bd126|2.3.12| |[Feature][Connector-JDBC] Supprot read Oracle BLOB data as string instead of bytes (#9305)|https://github.com/apache/seatunnel/commit/454a88f81a|2.3.11| |[Fix][Connector-jdbc] Fix postgresql sink trying to update unique key (#9293) (#9298)|https://github.com/apache/seatunnel/commit/d0c1de8357|2.3.11| |[Fix][Connector-V2] Fix oceanbase mysql jdbc 
sink create statement error (#9267)|https://github.com/apache/seatunnel/commit/79f8125ea6|2.3.11| |[Feature][Transform] Support define sink column type (#9114)|https://github.com/apache/seatunnel/commit/ab7119e507|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][API] Fixed not invoke the `SinkAggregatedCommitter`'s init method (#9070)|https://github.com/apache/seatunnel/commit/df0d11d632|2.3.11| |[Fix][Connector-V2] Fix SqlServer create table when database with dot (#9007)|https://github.com/apache/seatunnel/commit/e09445c789|2.3.11| |[Fix][Connector-V2][OceanBase] oceanbase vector support simple vector index (#9072)|https://github.com/apache/seatunnel/commit/4140cd1d8f|2.3.11| |[Improve][Connector-V2] Optimize dialect selection in jdbc (#8820)|https://github.com/apache/seatunnel/commit/92c62c5e63|2.3.11| |[Fix][JDBC] fix jdbc default connection parameter invalid (#8185)|https://github.com/apache/seatunnel/commit/f85eb78b37|2.3.11| |[Hotfix][Jdbc] Fix mysql tinyint(1) type mapping for TypeMapper (#9012)|https://github.com/apache/seatunnel/commit/5f85d7668a|2.3.11| |[Feature][Jdbc] Add String type column split Support by charset-based splitting algorithm (#9002)|https://github.com/apache/seatunnel/commit/dbe41e74cd|2.3.11| |[Fix][Paimon] nullable and comment attribute was lost during automatic table creation (#9020)|https://github.com/apache/seatunnel/commit/eb54fdd52c|2.3.11| |[Fix][Connector-JDBC] Fix JDBC driver selection for data source connections (#8986)|https://github.com/apache/seatunnel/commit/a5aafa7301|2.3.11| |[Improve][Jdbc] Upgrade sap-hana driver from 2.14.7 to 2.23.10 (#9013)|https://github.com/apache/seatunnel/commit/9ba9f169be|2.3.11| |[Feature][Jdbc] Support sink ddl for sqlserver #8114 (#8936)|https://github.com/apache/seatunnel/commit/30aa485b38|2.3.10| |[Fix][Connector-V2] Fix parse SqlServer JDBC Url error 
(#8784)|https://github.com/apache/seatunnel/commit/373d2162d3|2.3.10| |[Improve][Jdbc] Support upsert for opengauss (#8627)|https://github.com/apache/seatunnel/commit/56110bf392|2.3.10| |[Improve][Jdbc] Remove useless utils. (#8793)|https://github.com/apache/seatunnel/commit/36a7533e85|2.3.10| |[Improve][Jdbc] Improve catalog connection cache (#8626)|https://github.com/apache/seatunnel/commit/6205065b25|2.3.10| |[Fix][Connector-V2] Fix jdbc sink statement buffer wrong time to clear (#8653)|https://github.com/apache/seatunnel/commit/cf35eecdfc|2.3.10| |[Feature][Jdbc] Support sink ddl for dameng (#8380)|https://github.com/apache/seatunnel/commit/5ff3427428|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][Jdbc] Remove oracle 'v$database' query (#8571)|https://github.com/apache/seatunnel/commit/3cf09f61ca|2.3.10| |[Fix] [Connector-V2] Postgres support for multiple primary keys (#8526)|https://github.com/apache/seatunnel/commit/04db40d973|2.3.10| |[Feature][JDBC source] pg support char types (#8420)|https://github.com/apache/seatunnel/commit/776ac94478|2.3.9| |[Feature][Jdbc] Support sink ddl for postgresql (#8276)|https://github.com/apache/seatunnel/commit/353bbd21a1|2.3.9| |[Feature][Connector-V2] Support the jdbc connector for highgo db (#8282)|https://github.com/apache/seatunnel/commit/aa381cbfb4|2.3.9| |[Improve][Jdbc] Support nvarchar in dm (#8270)|https://github.com/apache/seatunnel/commit/2f1c54ee2e|2.3.9| |[Improve][Connector-v2] Use regex to match filedName placeholders in jdbc sink (#8222)|https://github.com/apache/seatunnel/commit/c02d4fed36|2.3.9| |[Improve][Connector-V2] Support read comment when jdbc dialect without catalog (#8196)|https://github.com/apache/seatunnel/commit/567cd54de5|2.3.9| |[Improve][Connector-V2] The interface supports jdbc respects the target database field type (#8031)|https://github.com/apache/seatunnel/commit/1de056a9a4|2.3.9| |[Improve][dist]add 
shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][Jdbc] Improve ddl write validate (#8158)|https://github.com/apache/seatunnel/commit/9cdaacddd9|2.3.9| |[Feature][Jdbc] Add Jdbc default dialect for all jdbc series database without dialect (#8132)|https://github.com/apache/seatunnel/commit/399eabcd3f|2.3.9| |[Improve][Jdbc] Refactor ddl change (#8134)|https://github.com/apache/seatunnel/commit/e1f0a238f7|2.3.9| |[Feature][Core] Rename `result_table_name`/`source_table_name` to `plugin_input/plugin_output` (#8072)|https://github.com/apache/seatunnel/commit/c7bbd322db|2.3.9| |[Improve][Connector-V2] Improve schema evolution on column insert after for mysql-jdbc (#8017)|https://github.com/apache/seatunnel/commit/3fb05da365|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Feature][transform] transform support explode (#7928)|https://github.com/apache/seatunnel/commit/132278c06a|2.3.9| |[Feature][Connector-v2] Support schema evolution for Oracle connector (#7908)|https://github.com/apache/seatunnel/commit/79406bcc2f|2.3.9| |[Improve][Connector-V2] Improve jdbc merge table from path and query when type is decimal (#7917)|https://github.com/apache/seatunnel/commit/8baa012ced|2.3.9| |[Fix][Connector-V2] Fix hana type loss of precision (#7912)|https://github.com/apache/seatunnel/commit/18dcca36cd|2.3.9| |[Feature][Connector-V2] Jdbc DB2 support upsert SQL (#7879)|https://github.com/apache/seatunnel/commit/139919334d|2.3.9| |[Improve][Jdbc] Optimize index name conflicts when create table for postgresql (#7875)|https://github.com/apache/seatunnel/commit/312ee866fb|2.3.9| |[Improve][Jdbc] Support postgresql inet type. 
(#7820)|https://github.com/apache/seatunnel/commit/25b68b3623|2.3.9| |[Fix][Connector-V2]Oceanbase vector database is added as the source server (#7832)|https://github.com/apache/seatunnel/commit/258f931765|2.3.9| |[Feature][connector-v2]Support opengauss jdbc connnector using opengauss driver. (#7622)|https://github.com/apache/seatunnel/commit/bbf643772e|2.3.9| |[Improve][Jdbc] Support save mode for the sink of jdbc-dm (#7814)|https://github.com/apache/seatunnel/commit/b87d732c81|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Connector-V2] SqlServer support user-defined type (#7706)|https://github.com/apache/seatunnel/commit/fb89033273|2.3.8| |[Hotfix][CDC] Fix ddl duplicate execution error when config multi_table_sink_replica (#7634)|https://github.com/apache/seatunnel/commit/23ab3edbbb|2.3.8| |[Feature][Connector-Paimon] Support dynamic bucket splitting improves Paimon writing efficiency (#7335)|https://github.com/apache/seatunnel/commit/bc0326cba8|2.3.8| |[Fix][Connector-V2] Fix jdbc test case failed (#7690)|https://github.com/apache/seatunnel/commit/4f5d27f625|2.3.8| |[Improve][Jdbc] Jdbc truncate table should check table not database (#7654)|https://github.com/apache/seatunnel/commit/0c0eb7e41b|2.3.8| |[Feature][Connector-V2] jdbc saphana source tablepath support view and synonym (#7670)|https://github.com/apache/seatunnel/commit/7e0c20a488|2.3.8| |[Fix][Connector-v2] Throw Exception in sql query for JdbcCatalog in table or db exists query (#7651)|https://github.com/apache/seatunnel/commit/70ec59ce0e|2.3.8| |[Fix][JDBC] Fix starrocks jdbc dialect catalog conflict with starrocks connector (#7578)|https://github.com/apache/seatunnel/commit/020aab422e|2.3.8| |[Feature] Support tidb cdc connector source #7199 (#7477)|https://github.com/apache/seatunnel/commit/87ec786bd6|2.3.8| |[bugfix] fix oracle query table length 
(#7627)|https://github.com/apache/seatunnel/commit/2e002ce09b|2.3.8| |[Hotfix][Connector-v2] Fix the NullPointerException for jdbc oracle which used the table_list (#7544)|https://github.com/apache/seatunnel/commit/555028217a|2.3.8| |[Improve][Connector-v2] Support mysql 8.1/8.2/8.3 for jdbc (#7530)|https://github.com/apache/seatunnel/commit/657fe69b26|2.3.8| |[Improve][Connector-v2] Release resource in closeStatements even exception occurred in executeBatch (#7533)|https://github.com/apache/seatunnel/commit/590f7d110d|2.3.8| |[Fix][Connector-V2] Fix jdbc query sql can not get table path (#7484)|https://github.com/apache/seatunnel/commit/8e0ca8f725|2.3.8| |[Feature][Connector-V2] Add `decimal_type_narrowing` option in jdbc (#7461)|https://github.com/apache/seatunnel/commit/696f2948fa|2.3.8| |[Improve][Connector-V2] update vectorType (#7446)|https://github.com/apache/seatunnel/commit/1bba72385b|2.3.8| |[Improve][API] Move catalog open to SaveModeHandler (#7439)|https://github.com/apache/seatunnel/commit/8c2c5c79a1|2.3.8| |[FIX][E2E]Modify the OceanBase test case to the latest imageChange image (#7452)|https://github.com/apache/seatunnel/commit/6abb83deab|2.3.8| |[Feature][Connector-V2][OceanBase] Support vector types on OceanBase (#7375)|https://github.com/apache/seatunnel/commit/a6b188d552|2.3.8| |[Improve][Connector-V2] Remove system table limit (#7391)|https://github.com/apache/seatunnel/commit/adf888e008|2.3.8| |[Fix] Fix oracle sample data from column error (#7340)|https://github.com/apache/seatunnel/commit/2130e0d5ad|2.3.8| |[Improve][Connector-V2] Close all ResultSet after used (#7389)|https://github.com/apache/seatunnel/commit/853e973212|2.3.8| |[Hotifx][Jdbc] Fix MySQL unsupport 'ZEROFILL' column type (#7407)|https://github.com/apache/seatunnel/commit/7130382123|2.3.8| |[Improvement] add starrocks jdbc dialect (#7294)|https://github.com/apache/seatunnel/commit/b5140f598e|2.3.8| |[Hotfix][Connector] Fix jdbc compile error 
(#7359)|https://github.com/apache/seatunnel/commit/2769ed5029|2.3.7| |[Fix][Connector-V2][OceanBase] Remove OceanBase catalog's dependency on mysql driver (#7311)|https://github.com/apache/seatunnel/commit/3130ae089e|2.3.7| |[Improve][Jdbc] Skip all index when auto create table to improve performance of write (#7288)|https://github.com/apache/seatunnel/commit/dc3c23981b|2.3.7| |[Improve][Jdbc] Remove MysqlType references in JdbcDialect (#7333)|https://github.com/apache/seatunnel/commit/16eeb1c123|2.3.7| |[Improve][Jdbc] Merge user config primary key when create table (#7313)|https://github.com/apache/seatunnel/commit/819c685651|2.3.7| |[Improve][Connector-v2] Optimize the way of databases and tables are checked for existence (#7261)|https://github.com/apache/seatunnel/commit/f012b2a6f0|2.3.7| |[Feature][Jdbc] Support hive compatibleMode add inceptor dialect (#7262)|https://github.com/apache/seatunnel/commit/31e59cdf82|2.3.6| |[Improve][Connector-v2] Optimize the count table rows for jdbc-oracle and oracle-cdc (#7248)|https://github.com/apache/seatunnel/commit/0d08b20061|2.3.6| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Fix] Fix Hana type converter decimal scale is 0 convert to int error (#7167)|https://github.com/apache/seatunnel/commit/6e33a97c86|2.3.6| |[Improve][Jdbc] Support write unicode text into sqlserver (#7159)|https://github.com/apache/seatunnel/commit/e44e8b93bc|2.3.6| |[Improve][Jdbc] Remove user info in catalog-table options (#7178)|https://github.com/apache/seatunnel/commit/4e001be25c|2.3.6| |[Improve][connector-v2-jdbc-mysql] Add support for MySQL 8.4 (#7151)|https://github.com/apache/seatunnel/commit/dbdbdf015b|2.3.6| |[Feature][Connector-V2] Support jdbc hana catalog and type convertor (#6950)|https://github.com/apache/seatunnel/commit/d663398739|2.3.6| |[Improve] Change catalog table log to debug level 
(#7136)|https://github.com/apache/seatunnel/commit/b111d2f843|2.3.6| |[Improve][Connector-V2] Support schema evolution for mysql-cdc and mysql-jdbc (#6929)|https://github.com/apache/seatunnel/commit/cf91e51fc7|2.3.6| |[connector-jdbc][bugfix] fix sqlServer create table comment special string bug (#7024)|https://github.com/apache/seatunnel/commit/403564db13|2.3.6| |[bugfix] fix pgsql create table comment special string bug (#7022)|https://github.com/apache/seatunnel/commit/9fe844f62a|2.3.6| |[connector-jdbc][bugfix] fix oracle create table comment special string bug (#7012)|https://github.com/apache/seatunnel/commit/a9e0f67873|2.3.6| |[bugfix] fix mysql create table comment special string bug (#6998)|https://github.com/apache/seatunnel/commit/904e9cf785|2.3.6| |[Improve][[Jdbc]sink sql support custom field.(#6515) (#6525)|https://github.com/apache/seatunnel/commit/ef3e61dbc4|2.3.6| |[Feature][Jdbc] Support redshift catalog (#6992)|https://github.com/apache/seatunnel/commit/8d5cbcee74|2.3.6| |[Improve][Connector-V2] Clean key name in catalog table (#6942)|https://github.com/apache/seatunnel/commit/a399ef48c6|2.3.6| |[Improve][Zeta] Move SaveMode behavior to master (#6843)|https://github.com/apache/seatunnel/commit/80cf91318d|2.3.6| |[Improve][Jdbc] Quotes the identifier for table path (#6951)|https://github.com/apache/seatunnel/commit/d70ec61f35|2.3.6| |[Hotfix][Jdbc] Fix oracle savemode create table (#6651)|https://github.com/apache/seatunnel/commit/4b6c13e8fc|2.3.6| |[Improve][JDBC Source] Fix Split can not be cancel (#6825)|https://github.com/apache/seatunnel/commit/ee3b7c3723|2.3.6| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Hotfix][Jdbc/CDC] Fix postgresql uuid type in jdbc read (#6684)|https://github.com/apache/seatunnel/commit/868ba4d7c7|2.3.6| |[Improve][Connector] Add some sqlserver IDENTITY type for catalog (#6822)|https://github.com/apache/seatunnel/commit/f698396555|2.3.6| 
|[Feature][Jdbc] Support the jdbc connector for InterSystems IRIS (#6797)|https://github.com/apache/seatunnel/commit/46600969bb|2.3.6| |[Fix][MySQL]: Fix MySqlTypeConverter could not be instantiated (#6781)|https://github.com/apache/seatunnel/commit/a5609d600e|2.3.6| |[Hotfix][Jdbc] Fix table/query columns order merge for jdbc catalog (#6771)|https://github.com/apache/seatunnel/commit/df1954d520|2.3.6| |[Fix] Fix Oracle type converter handle negative scale in number type (#6758)|https://github.com/apache/seatunnel/commit/6d710690c5|2.3.6| |[Improve][mysql-cdc] Support mysql 5.5 versions (#6710)|https://github.com/apache/seatunnel/commit/058f5594a3|2.3.6| |[Improve][Jdbc] Add quote identifier for sql (#6669)|https://github.com/apache/seatunnel/commit/849d748d3d|2.3.5| |[Improve][Jdbc] Increase tyepe converter when auto creating tables (#6617)|https://github.com/apache/seatunnel/commit/cc660206d8|2.3.5| |[feature][connector-v2] add xugudb connector (#6561)|https://github.com/apache/seatunnel/commit/80f392afbb|2.3.5| |[Hotfix] Fix DEFAULT TABLE problem (#6352)|https://github.com/apache/seatunnel/commit/cdb1856e84|2.3.5| |[Improve] Improve MultiTableSinkWriter prepare commit performance (#6495)|https://github.com/apache/seatunnel/commit/2086b0e8a6|2.3.5| |[Improve][JDBC] Optimized code style for getting jdbc field types (#6583)|https://github.com/apache/seatunnel/commit/ddca95f32c|2.3.5| |[Improve] Add SaveMode log of process detail (#6375)|https://github.com/apache/seatunnel/commit/b0d70ce224|2.3.5| |[Improve][Jdbc] Support custom case-sensitive config for dameng (#6510)|https://github.com/apache/seatunnel/commit/d6dcb03bf3|2.3.5| |feat: jdbc support copy in statement. 
(#6443)|https://github.com/apache/seatunnel/commit/ca4a65fc00|2.3.5| |[Improve][Jdbc] Using varchar2 datatype store string in oracle (#6392)|https://github.com/apache/seatunnel/commit/14405fa8d4|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |Fix Jdbc sink target table name error (#6269)|https://github.com/apache/seatunnel/commit/2f62235e38|2.3.4| |[Improve][JDBC] Use PreparedStatement to sample data from column (#6242)|https://github.com/apache/seatunnel/commit/bd0e66d533|2.3.4| |[Improve][JDBC-sink] Improve query Approximate Total Row Count of a Table (#5972)|https://github.com/apache/seatunnel/commit/8156036a2f|2.3.4| |[Feature][JDBC、CDC] Support Short and Byte Type in spliter (#6027)|https://github.com/apache/seatunnel/commit/6f8d0a5040|2.3.4| |[Improve] Support `int identity` type in sql server (#6186)|https://github.com/apache/seatunnel/commit/1a8da1c843|2.3.4| |[Bugfix][JDBC、CDC] Fix Spliter Error in Case of Extensive Duplicate Data (#6026)|https://github.com/apache/seatunnel/commit/635c24e8b2|2.3.4| | [Feature][Connector-V2][Postgres-cdc]Support for Postgres cdc (#5986)|https://github.com/apache/seatunnel/commit/97438b9402|2.3.4| |Add date type and float type column split support (#6160)|https://github.com/apache/seatunnel/commit/b9a62e5c3f|2.3.4| |[Improve] Extend `SupportResourceShare` to spark/flink (#5847)|https://github.com/apache/seatunnel/commit/c69da93b87|2.3.4| |[Feature] Support `uuid` in postgres jdbc (#6185)|https://github.com/apache/seatunnel/commit/f56855098b|2.3.4| |[Feature][Connector-V2][Oracle-cdc]Support for oracle cdc (#5196)|https://github.com/apache/seatunnel/commit/aaef22b31b|2.3.4| |[Feature][Connector] update pgsql catalog for save mode (#6080)|https://github.com/apache/seatunnel/commit/84ce516929|2.3.4| |[Hotfix][Jdbc] Fix dameng catalog query table sql (#6141)|https://github.com/apache/seatunnel/commit/413fa74500|2.3.4| |[improve][catalog-postgres] 
Improve get column sql compatibility (#5664)|https://github.com/apache/seatunnel/commit/23ce592ad2|2.3.4| |[Feature][Connector] update oracle catalog for save mode (#6092)|https://github.com/apache/seatunnel/commit/dfbf92769c|2.3.4| |[Feature][Connectors-V2][Jdbc] Supports Sqlserver Niche Data Types (#6122)|https://github.com/apache/seatunnel/commit/6673f6f771|2.3.4| |[Improve][Connector-V2][Jdbc] Shade hikari in jdbc connector (#6116)|https://github.com/apache/seatunnel/commit/dd698c95bf|2.3.4| |[Feature][Connector] update sqlserver catalog for save mode (#6086)|https://github.com/apache/seatunnel/commit/edcaacecb1|2.3.4| |[Feature][Connector-V2][PostgresSql] add JDBC source support string type as partition key (#6079)|https://github.com/apache/seatunnel/commit/3522eb157c|2.3.4| |[Hotfix][Jdbc] Fix jdbc setFetchSize error (#6005)|https://github.com/apache/seatunnel/commit/d41af8a6ed|2.3.4| |Support using multiple hadoop account (#5903)|https://github.com/apache/seatunnel/commit/d69d88d1aa|2.3.4| |[Feature] Add unsupported datatype check for all catalog (#5890)|https://github.com/apache/seatunnel/commit/b9791285a0|2.3.4| |[Hotfix][Split] Fix split key not support BigInteger type|https://github.com/apache/seatunnel/commit/5adf5d2b9a|2.3.4| |[Improve] Replace SeaTunnelRowType with TableSchema in the JdbcRowConverter|https://github.com/apache/seatunnel/commit/1cc1b1b8cd|2.3.4| |[Hotfix][Jdbc] Fix cdc updates were not filtering same primary key (#5923)|https://github.com/apache/seatunnel/commit/38d3b85814|2.3.4| |[Improve]Change System.out.println to log output. 
(#5912)|https://github.com/apache/seatunnel/commit/bbedb07a9c|2.3.4| |[Bug] Fix Hive-Jdbc use krb5 overwrite kerberosKeytabPath (#5891)|https://github.com/apache/seatunnel/commit/f0b6092c15|2.3.4| |Reduce the time cost of getCatalogTable in jdbc (#5908)|https://github.com/apache/seatunnel/commit/51a3737578|2.3.4| |[Improve] Improve Jdbc connector error message when datatype unsupported (#5864)|https://github.com/apache/seatunnel/commit/69f79af3a4|2.3.4| |[Improve] Rename `getCountSql` to `getExistDataSql` (#5838)|https://github.com/apache/seatunnel/commit/2233b3a381|2.3.4| |[Fix] Fix read from Oracle Date type value lose time (#5814)|https://github.com/apache/seatunnel/commit/2d704e36bd|2.3.4| |[Improve][JdbcSource] Optimize catalog-table metadata merge logic (#5828)|https://github.com/apache/seatunnel/commit/7d8028a60b|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Feature][Hive JDBC Source] Support Hive JDBC Source Connector (#5424)|https://github.com/apache/seatunnel/commit/a64e177d06|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][Connector] Add field name to `DataTypeConvertor` to improve error message (#5782)|https://github.com/apache/seatunnel/commit/ab60790f0d|2.3.4| |[Feature][Oracle] Support XMLTYPE data integration #5716 (#5723)|https://github.com/apache/seatunnel/commit/620f081adb|2.3.4| |[Fix] Fix Postgres create table test case failed (#5778)|https://github.com/apache/seatunnel/commit/b98b6bcee3|2.3.4| |[Improve][Jdbc] Fix database identifier (#5756)|https://github.com/apache/seatunnel/commit/dbfc8a670a|2.3.4| |[Fix] Fix PG will not create index when using auto create table #5721|https://github.com/apache/seatunnel/commit/e5fd88dbe7|2.3.4| |[Improve] Remove all useless `prepare`, `getProducedType` method 
(#5741)|https://github.com/apache/seatunnel/commit/ed94fffbb9|2.3.4| |[feature][connector-jdbc]Add Save Mode function and Connector-JDBC (MySQL) connector has been realized (#5663)|https://github.com/apache/seatunnel/commit/eff17ccbe5|2.3.4| |[Bug] [connector-jdbc] Nullable Column source have null data could be unexpected results. (#5560)|https://github.com/apache/seatunnel/commit/3f429e1f0a|2.3.4| |[Improve] Add default implement for `SeaTunnelSink::setTypeInfo` (#5682)|https://github.com/apache/seatunnel/commit/86cba87450|2.3.4| |[BUG][Connector-V2][Jdbc] support postgresql xml type (#5724)|https://github.com/apache/seatunnel/commit/5f5d4da13f|2.3.4| |[Improve][E2E][Jdbc] Enable IT case for Oceanbase Mysql mode (#5697)|https://github.com/apache/seatunnel/commit/879c2aa07c|2.3.4| |[Feature][Jdbc] Support read multiple tables (#5581)|https://github.com/apache/seatunnel/commit/33fa8ff248|2.3.4| |[Feature] Support multi-table sink (#5620)|https://github.com/apache/seatunnel/commit/81ac173189|2.3.4| |[Improve] Remove catalog tag for config file (#5645)|https://github.com/apache/seatunnel/commit/dc509aa080|2.3.4| |[Feature][Jdbc] Supporting more ways to configure connection parameters. 
(#5388)|https://github.com/apache/seatunnel/commit/d31e9478f7|2.3.4| |[Feature][Connector-V2][Jdbc] Add OceanBase catalog (#5439)|https://github.com/apache/seatunnel/commit/cd4b7ff7d2|2.3.4| |[BUGFIX][Catalog] oracle catalog create table repeat and oracle pg null point (#5517)|https://github.com/apache/seatunnel/commit/103da931f3|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve] Refactor CatalogTable and add `SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[Feature][Jdbc] Add Dameng catalog (#5451)|https://github.com/apache/seatunnel/commit/c23070919c|2.3.4| |[Feature] Add tidb datatype convertor (#5440)|https://github.com/apache/seatunnel/commit/61391bda9f|2.3.4| |[Feature][Connector-V2] jdbc connector supports Kingbase database (#4803)|https://github.com/apache/seatunnel/commit/9538567159|2.3.4| |[Feature][Catalog] Catalog add Case Conversion Definition (#5328)|https://github.com/apache/seatunnel/commit/7b5b28bdbe|2.3.4| |[Feature][Jdbc] Jdbc database support identifier (#5089)|https://github.com/apache/seatunnel/commit/38b6d6e4bb|2.3.4| |[Improve][Connector-v2][Jdbc] Refactor AbstractJdbcCatalog (#5096)|https://github.com/apache/seatunnel/commit/dde3104f76|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[bug][jdbc][oracle]Fix the Oracle number type mapping problem (#5209)|https://github.com/apache/seatunnel/commit/9d3c3de90d|2.3.3| |[BUG][Connector-V2][Jdbc] support postgresql json type (#5194)|https://github.com/apache/seatunnel/commit/7a862d14b7|2.3.3| |[Improve] [Connector-V2] Remove scheduler in JDBC sink #4736 (#5168)|https://github.com/apache/seatunnel/commit/3b0a393145|2.3.3| |[CI] Split updated modules integration test for part 5 (#5208)|https://github.com/apache/seatunnel/commit/18f14d6087|2.3.3| |[Bug] [connector-v2] PostgreSQL versions below 9.5 are compatible use cdc sync problem (#5120)|https://github.com/apache/seatunnel/commit/9af696a1dd|2.3.3| |[Improve][Connector-v2][Jdbc] check url not null throw friendly message (#5097)|https://github.com/apache/seatunnel/commit/b0815f2a95|2.3.3| |[Feature][Catalog] Add JDBC Catalog auto create table (#4917)|https://github.com/apache/seatunnel/commit/63eb137671|2.3.3| |[Feature][CDC] Support tables without primary keys (with unique keys) (#163) (#5150)|https://github.com/apache/seatunnel/commit/32b7f2b690|2.3.3| |[Hotfix][Connector][Jdbc] Fix the problem of JdbcOutputFormat database connection leak (#4802)|https://github.com/apache/seatunnel/commit/4cc10e83e7|2.3.3| |[Feature][JDBC Sink] Add DM upsert support (#5073)|https://github.com/apache/seatunnel/commit/5e8d982e25|2.3.3| |[Improve] Improve savemode api (#4767)|https://github.com/apache/seatunnel/commit/4acd370d48|2.3.3| |[Feature][Connector-V2] JDBC source support string type as partition key (#4947)|https://github.com/apache/seatunnel/commit/d1d2677658|2.3.3| |[Feature][Connector-V2][Jdbc] Add oceanbase dialect factory (#4989)|https://github.com/apache/seatunnel/commit/7ba11cecdf|2.3.3| |Fix XA Transaction bug 
(#5020)|https://github.com/apache/seatunnel/commit/852fe104bc|2.3.3| |[Improve][CDC]Remove driver for cdc connector (#4952)|https://github.com/apache/seatunnel/commit/b65f40c3c9|2.3.3| |[Improve] Documentation and partial word optimization. (#4936)|https://github.com/apache/seatunnel/commit/6e8de0e2a6|2.3.3| |[Improve][Connector-V2][Jdbc-Source] Support for Decimal types as splict keys (#4634)|https://github.com/apache/seatunnel/commit/d56bb1ba1c|2.3.3| |[Bugfix][zeta] Fix the deadlock issue with JDBC driver loading (#4878)|https://github.com/apache/seatunnel/commit/c30a2a1b1c|2.3.2| |[Hotfix][Jdbc] Fix XA DataSource crash(Oracle/Dameng/SqlServer) (#4866)|https://github.com/apache/seatunnel/commit/bde19b6377|2.3.2| |[Feature][Connector-v2] Add Snowflake Source&Sink connector (#4470)|https://github.com/apache/seatunnel/commit/06c59a25f3|2.3.2| |[Hotfix][Connector-V2][Jdbc] Fix the error of extracting primary key column in sink (#4815)|https://github.com/apache/seatunnel/commit/0eff3aeed0|2.3.2| |[Hotfix][Connector][Jdbc] Fix reconnect throw close statement exception (#4801)|https://github.com/apache/seatunnel/commit/ea3bc1a673|2.3.2| |[Hotfix][Connector][Jdbc] Fix sqlserver system table case sensitivity (#4806)|https://github.com/apache/seatunnel/commit/2ca7426d22|2.3.2| |[Hotfix][Jdbc][Oracle] Fix oracle sql table identifier (#4754)|https://github.com/apache/seatunnel/commit/84cb51ff83|2.3.2| |[Improve][Jdbc] Populate primary key when jdbc sink is created using CatalogTable (#4755)|https://github.com/apache/seatunnel/commit/4af3bf9015|2.3.2| |[Feature][PostgreSQL-jdbc] Supports GEOMETRY data type for PostgreSQL… (#4673)|https://github.com/apache/seatunnel/commit/a5af4d9b6e|2.3.2| |[Improve][Core] Add check of sink and source config to avoid null pointer exception. 
(#4734)|https://github.com/apache/seatunnel/commit/8f66ce96cb|2.3.2| |[Hotfix][JDBC-SINK] Fix TiDBCatalog without open (#4718)|https://github.com/apache/seatunnel/commit/34a7f3eaa4|2.3.2| |[Feature][E2E] Add mysql-cdc e2e testcase (#4639)|https://github.com/apache/seatunnel/commit/87001dfd16|2.3.2| |[Hotfix][JDBC Sink] Fix JDBC Sink oom bug (#4690)|https://github.com/apache/seatunnel/commit/08b6f992aa|2.3.2| |Improve the option rule for jdbc sink (#4694)|https://github.com/apache/seatunnel/commit/a6b3704414|2.3.2| |[feature][catalog] Support for multiplexing connections (#4550)|https://github.com/apache/seatunnel/commit/41277d7f78|2.3.2| |[Bugfix][Jdbc-Mysql Mysql-CDC] Fix MySQL BIT type incorrectly converted to Boolean type (#4671)|https://github.com/apache/seatunnel/commit/89b0099ff4|2.3.2| |[Hotfix][Jdbc[SqlServer] Fix sqlserver jdbc url parse (#4697)|https://github.com/apache/seatunnel/commit/b24c3226ec|2.3.2| |Revert "[Improve][Catalog] refactor catalog (#4540)" (#4628)|https://github.com/apache/seatunnel/commit/2d1933195d|2.3.2| |[Feature][Connector][Jdbc] Add DataTypeConvertor for JDBC-Postgres (#4575)|https://github.com/apache/seatunnel/commit/91f5125976|2.3.2| |[Improve][Catalog] refactor catalog (#4540)|https://github.com/apache/seatunnel/commit/b0a701cb83|2.3.2| |[Bug] [JDBC Source] fix split exception when source table is empty (#4570)|https://github.com/apache/seatunnel/commit/c73b9331ce|2.3.2| |[Feature][Connector][Jdbc] Add vertica connector. 
(#4303)|https://github.com/apache/seatunnel/commit/e6b4f98721|2.3.2| |[Hotfix][Catalog] Filter out unavailable constrain keys (#4557)|https://github.com/apache/seatunnel/commit/5e5859546a|2.3.2| |[Hotfix][Connector-V2][Jdbc] Simple sql has the highest priority (#4548)|https://github.com/apache/seatunnel/commit/74d4d24858|2.3.2| |[Improve][Connector-V2][Jdbc] Jdbc source supports factory SPI (#4264)|https://github.com/apache/seatunnel/commit/a97f33797d|2.3.2| |[Jdbc][Chore] improve the exception message when primary key not found in row (#4474)|https://github.com/apache/seatunnel/commit/06fa850da9|2.3.2| |[hotfix][JDBC] Fix the table name is not automatically obtained when multiple tables (#4514)|https://github.com/apache/seatunnel/commit/c84d6f8d11|2.3.2| |[Chore][Jdbc] add the log for sql and update some style (#4475)|https://github.com/apache/seatunnel/commit/a9e6503045|2.3.2| |[Hotfix][Connector-V2][Jdbc] Set default value to false of JdbcOption: generate_sink_sql (#4471)|https://github.com/apache/seatunnel/commit/7da11c2f44|2.3.2| |[feature][jdbc][TiDB] add TiDB catalog (#4438)|https://github.com/apache/seatunnel/commit/9a32db6fc0|2.3.2| |[Hotfix][Connector] Fix sqlserver catalog (#4441)|https://github.com/apache/seatunnel/commit/8540c7f9f3|2.3.2| |[Feature][CDC][SqlServer] Support multi-table read (#4377)|https://github.com/apache/seatunnel/commit/c4e3f2dc03|2.3.2| |[Improve][JdbcSink]Fix connection failure caused by connection timeout. (#4322)|https://github.com/apache/seatunnel/commit/e1f6d3b3fd|2.3.2| |[Hotfix][Connector-V2][Jdbc] Field aliases are not supported in the query of jdbc source. 
(#4158) (#4210)|https://github.com/apache/seatunnel/commit/3d7ff831f9|2.3.1| |Change file type to file_format_type in file source/sink (#4249)|https://github.com/apache/seatunnel/commit/973a2fae3c|2.3.1| |Change redshift type to lowercase (#4248)|https://github.com/apache/seatunnel/commit/10447ae103|2.3.1| |Add redshift datatype convertor (#4245)|https://github.com/apache/seatunnel/commit/b19011517f|2.3.1| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[Improve] Support MySqlCatalog Use JDBC URL With Custom Suffix|https://github.com/apache/seatunnel/commit/210d0ff1f8|2.3.1| |[hotfix] fixed jdbc IT error|https://github.com/apache/seatunnel/commit/dd20af0a9e|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][jdbc] use ReadonlyConfig instead of Config (#4236)|https://github.com/apache/seatunnel/commit/c90c58e243|2.3.1| |[Improve][Jdbc-sink] add database field to sink config (#4199)|https://github.com/apache/seatunnel/commit/ec368902f4|2.3.1| |[improve][jdbc] Reduce jdbc options configuration (#4218)|https://github.com/apache/seatunnel/commit/ddd8f808b5|2.3.1| |Fix mysql get default value (#4204)|https://github.com/apache/seatunnel/commit/6848434f2d|2.3.1| |[hotfix][zeta] fix zeta multi-table parser error (#4193)|https://github.com/apache/seatunnel/commit/98f2ad0c19|2.3.1| |[Improve] Remove AUTO_COMMIT To Optional In JDBC OptionRule (#4194)|https://github.com/apache/seatunnel/commit/9d088017a3|2.3.1| |[Improve] [Connector-V2] [StarRocks] Starrocks Support Auto Create Table (#4177)|https://github.com/apache/seatunnel/commit/7e0008e6fb|2.3.1| |[improve][catalog][jdbc] Add MySQL catalog factory (#4168)|https://github.com/apache/seatunnel/commit/95e3cbf875|2.3.1| |[Improve][build] Give the maven module a human readable name 
(#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |Add convertor factory (#4119)|https://github.com/apache/seatunnel/commit/cbdea45d95|2.3.1| |Add ElasticSearch catalog (#4108)|https://github.com/apache/seatunnel/commit/9ee4d8394c|2.3.1| |Add Kafka catalog (#4106)|https://github.com/apache/seatunnel/commit/34f1f21e48|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |Add DataTypeConvertor in Catalog (#4094)|https://github.com/apache/seatunnel/commit/840c3e5eb4|2.3.1| |[Feature] [Catalog] Support create/drop table, create/drop database in catalog (#4075)|https://github.com/apache/seatunnel/commit/d8a0be84ca|2.3.1| | [Bug][Connector-V2][Jdbc] Fixed no exception throwing problem (#3957)|https://github.com/apache/seatunnel/commit/6ab266e594|2.3.1| |[Bug][CDC] Fix jdbc sink generate update sql (#3940)|https://github.com/apache/seatunnel/commit/233465d4e4|2.3.1| |[Improve][JDBC] improve jdbc sink option (#3864)|https://github.com/apache/seatunnel/commit/768a9300e8|2.3.1| |Fix Source Class Support Parallelism judge & Add UT for it (#3878)|https://github.com/apache/seatunnel/commit/ce85a8c68b|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][Connector-V2] Jdbc connector support SAP HANA. 
(#3017)|https://github.com/apache/seatunnel/commit/fe0180fab2|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Improve][JDBC Connector]improve option rule (#3802)|https://github.com/apache/seatunnel/commit/139256741a|2.3.1| |[Hotfix][Jdbc Sink] fix xa transaction commit failure on pipeline restore (#3809)|https://github.com/apache/seatunnel/commit/39dae4cfd9|2.3.1| |[Improve][Connector-V2][JDBC] Add exactly-once for JDBC source connector (#3750)|https://github.com/apache/seatunnel/commit/5328e9d847|2.3.1| |[Improve][Connector-v2] Remove unused options for jdbc source factory (#3794)|https://github.com/apache/seatunnel/commit/861004d309|2.3.1| |[Feature][Connector-jdbc] Fix JDBC Connector Throw Exception Error. (#3796)|https://github.com/apache/seatunnel/commit/38646b11b8|2.3.1| |[hotfix][ST-Engine] fix jdbc connector exactly-once null pointer (#3730)|https://github.com/apache/seatunnel/commit/0c5986fbec|2.3.0| |[Improve][connector-jdbc] Add config item enable upsert by query (#3708)|https://github.com/apache/seatunnel/commit/e1f951f782|2.3.0| |[Hotfix][connector-v2] fix SemanticXidGenerator#generateXid indexOutOfBounds #3701 (#3705)|https://github.com/apache/seatunnel/commit/f351ceaf4b|2.3.0| |[Hotfix][Connector-V2][jdbc] fix jdbc connection reset bug (#3670)|https://github.com/apache/seatunnel/commit/6fe0e6aece|2.3.0| |[Improve][Connector-V2][JDBC] Unified exception for JDBC source & sink (#3598)|https://github.com/apache/seatunnel/commit/865ca2bba9|2.3.0| |[Connector][JDBC]Support Redshift sink and source (#3615)|https://github.com/apache/seatunnel/commit/8d9d8638d2|2.3.0| |[Improve][Connectors-V2][jdbc] Adapts to multiple versions of Flink #3589|https://github.com/apache/seatunnel/commit/e77fdbbef7|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| 
|[Feature][Connector-V2][Doris]Add Doris Source & Sink connector (#3586)|https://github.com/apache/seatunnel/commit/3d46b79614|2.3.0| |[Feature][Connector-V2][Teradata] Add Teradata Source And Sink Connector|https://github.com/apache/seatunnel/commit/3a095d30fd|2.3.0| |[Feature][Connector-V2][JDBC] support sqlite Source & Sink (#3089)|https://github.com/apache/seatunnel/commit/a73bb3e714|2.3.0| |Bump postgresql in /seatunnel-connectors-v2/connector-jdbc (#3559)|https://github.com/apache/seatunnel/commit/c8dfdf3e46|2.3.0| |[feature][connector][cdc] add SeaTunnelRowDebeziumDeserializeSchema (#3499)|https://github.com/apache/seatunnel/commit/ff44db116e|2.3.0| |[JDBC] [ORACLE] Improve Oracle Type to SeaTunnel Type Mapping (#3486)|https://github.com/apache/seatunnel/commit/8fe0dda6e2|2.3.0| |[JDBC] [Config] Add JDBC Fetch Size Config And Custom Postgres PrepareStatement (#3478)|https://github.com/apache/seatunnel/commit/d60a705f5d|2.3.0| |[feature][connector][jdbc] expose configurable options in JDBC (#3410)|https://github.com/apache/seatunnel/commit/72b8a73cab|2.3.0| |[feature][connector][jdbc] Support write cdc changelog event in jdbc sink (#3444)|https://github.com/apache/seatunnel/commit/b12a908f01|2.3.0| |[Improve][Connector-v2][Jdbc] Add AutoCommit to jdbcConfig (#3453)|https://github.com/apache/seatunnel/commit/cfb1e97853|2.3.0| |[Improve][Connector-v2] Unset AutoCommit default to true (#3451)|https://github.com/apache/seatunnel/commit/439f686d92|2.3.0| |[Feature][connector-v2] add tablestore source and sink (#3309)|https://github.com/apache/seatunnel/commit/ebebf0b633|2.3.0| |Close jdbc connection after use. (#3358)|https://github.com/apache/seatunnel/commit/219fea517c|2.3.0| |[Improve] [Engine] Improve Engine performance. 
(#3216)|https://github.com/apache/seatunnel/commit/7393c47327|2.3.0| |[Bug][Connector-V2][JDBC]fix jdbc split bug (#3220)|https://github.com/apache/seatunnel/commit/40d67ab902|2.3.0| |[Feature][Connector-V2][JDBC] Support DB2 Source & Sink (#2410)|https://github.com/apache/seatunnel/commit/bf1ef69e84|2.3.0| |update org.postgresql:postgresql 42.3.3 to 42.4.1 (#3097)|https://github.com/apache/seatunnel/commit/2852516490|2.3.0| |[Feature][Connector-V2][Jdbc] support gbase 8a (#3026)|https://github.com/apache/seatunnel/commit/dc6e85d06f|2.3.0-beta| |[Bug] [sqlserver] timestamp convert exception (#3024)|https://github.com/apache/seatunnel/commit/99ac1a655e|2.3.0-beta| |[Feature][Connector-V2] oracle connector (#2550)|https://github.com/apache/seatunnel/commit/384ece1913|2.3.0-beta| |[Improve][Connector-v2][jdbc] Support for specify number of partitions when parallel reading (#2950)|https://github.com/apache/seatunnel/commit/fc284ac32e|2.3.0-beta| |[Feature][Connector-V2] add sqlserver connector (#2646)|https://github.com/apache/seatunnel/commit/05d105dea3|2.3.0-beta| |[Improve][e2e] Unified e2e IT for DaMengDB (#2946)|https://github.com/apache/seatunnel/commit/15636bdea1|2.3.0-beta| |[Improve][e2e] modify DM-driver by downLoad and add the value comparison of all columns (#2772)|https://github.com/apache/seatunnel/commit/f3ff39bdfe|2.3.0-beta| |[Improve][e2e] Improve jdbc driver management (#2770)|https://github.com/apache/seatunnel/commit/f907927a35|2.3.0-beta| |[hotfix][connector][jdbc] fix JDBC split exception (#2904)|https://github.com/apache/seatunnel/commit/57342c6545|2.3.0-beta| |[Improve][connector-jdbc] Calculate splits only once in JdbcSourceSplitEnumerator (#2900)|https://github.com/apache/seatunnel/commit/7622f28999|2.3.0-beta| |[Feature] [Connector-V2 E2E] Add mysql and postgres e2e test and bug fix (#2838)|https://github.com/apache/seatunnel/commit/db434adc15|2.2.0-beta| |fix XAConnection being wrongly submitted 
(#2805)|https://github.com/apache/seatunnel/commit/d9a6039fd3|2.2.0-beta| |fix spark execute exception is not thrown (#2791)|https://github.com/apache/seatunnel/commit/b1711c984e|2.2.0-beta| |[Improve][e2e] Add driver-jar to lib (#2719)|https://github.com/apache/seatunnel/commit/d64d452c86|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Connector-V2][JDBC-connector] support Jdbc dm (#2377)|https://github.com/apache/seatunnel/commit/7278209ca2|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Bug] [connector-jdbc-v2] Fix transaction force commit when autoCommit is enabled (#2636)|https://github.com/apache/seatunnel/commit/8cd8cf7aa2|2.2.0-beta| | [Feature][Connector-V2] Add phoenix connector sink (#2499)|https://github.com/apache/seatunnel/commit/05ccf9d68c|2.2.0-beta| |[Connector-V2][JDBC] Support database: greenplum (#2429)|https://github.com/apache/seatunnel/commit/3561d3878f|2.2.0-beta| |Add jdbc connector e2e test (#2321)|https://github.com/apache/seatunnel/commit/5fbcb811c6|2.2.0-beta| |StateT of SeaTunnelSource should extend `Serializable` (#2214)|https://github.com/apache/seatunnel/commit/8c426ef850|2.2.0-beta| |update the condition to 1 = 0 about get table operation (#2186)|https://github.com/apache/seatunnel/commit/7c56d7143b|2.2.0-beta| |[SeaTunnel API] [Sink] remove useless context field (#2124)|https://github.com/apache/seatunnel/commit/a31fdeedcc|2.2.0-beta| |[bugfix] Check isOpen before closing (#2107)|https://github.com/apache/seatunnel/commit/7ec0ada2b9|2.2.0-beta| |[API-DRAFT] [MERGE] fix merge error|https://github.com/apache/seatunnel/commit/3c0e984648|2.2.0-beta| |merge dev to api-draft|https://github.com/apache/seatunnel/commit/d265597c64|2.2.0-beta| |[api-draft][Optimize] Optimize module name 
(#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-kafka.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Fix][Connector-V2] Optimize start mode of kafka recovery job (#9736)|https://github.com/apache/seatunnel/commit/bbde7f6339|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Fix][Connector-V2] Add Filter for Partitions to Prevent Blocking in KafkaConsumer StreamMode (#9598)|https://github.com/apache/seatunnel/commit/bd24fa77cb|2.3.12| |[Fix][Connecotr-kafka] Fix kafka IllegalArgumentException when offset is -1 (#9376)|https://github.com/apache/seatunnel/commit/142aca7b70|2.3.12| |[Feature][Connectors-V2] Add end_timestamp for timstamp start mode (#9318)|https://github.com/apache/seatunnel/commit/68b0504da9|2.3.11| |[Bugifx][kafka] Fix kafka enumerator assign split NPE (#9220)|https://github.com/apache/seatunnel/commit/7ca0c0c7e4|2.3.11| | [Fix][Connector-V2] Fix kafka database name (#9201)|https://github.com/apache/seatunnel/commit/79d9a937ee|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][Connector-V2] assign size for KafkaSource reader cache queue (#9041)|https://github.com/apache/seatunnel/commit/8a9db476bd|2.3.11| |[Feature][Kafka] Support native format read/write kafka record (#8724)|https://github.com/apache/seatunnel/commit/86e2d6fcfa|2.3.10| |[improve] update kafka source default schema from content<ROW<content STRING>> to content<STRING> (#8642)|https://github.com/apache/seatunnel/commit/db6e2994d4|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[improve] kafka connector options (#8616)|https://github.com/apache/seatunnel/commit/aadfe99f88|2.3.10| |[Fix] [Kafka Source] kafka source use topic as table name instead of fullName 
(#8401)|https://github.com/apache/seatunnel/commit/3d4f4bb33a|2.3.10| |[Feature][Kafka] Add `debezium_record_table_filter` and fix error (#8391)|https://github.com/apache/seatunnel/commit/b27a30a5aa|2.3.9| |[Bug][Kafka] kafka reads repeatedly (#8465)|https://github.com/apache/seatunnel/commit/f67f27279a|2.3.9| |[Hotfix][Connector-V2][kafka] fix kafka sink config exactly-once exception (#7857)|https://github.com/apache/seatunnel/commit/92b3253a5b|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][Kafka] Support custom topic for debezium compatible format (#8145)|https://github.com/apache/seatunnel/commit/deefe8762a|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Fix][Kafka] Fix in kafka streaming mode can not read incremental data (#7871)|https://github.com/apache/seatunnel/commit/a0eeeb9b62|2.3.9| |[Feature][Core] Support cdc task ddl restore for zeta (#7463)|https://github.com/apache/seatunnel/commit/8e322281ed|2.3.9| |[Fix][Connector-V2] Fix kafka `format_error_handle_way` not work (#7838)|https://github.com/apache/seatunnel/commit/63c7b4e9cc|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][kafka] Add arg poll.timeout for interval poll messages (#7606)|https://github.com/apache/seatunnel/commit/09d12fc40e|2.3.8| |[Improve][Kafka] kafka source refactored some reader read logic (#6408)|https://github.com/apache/seatunnel/commit/10598b6aec|2.3.8| |[Feature][connector-v2]Add Kafka Protobuf Data Parsing Support (#7361)|https://github.com/apache/seatunnel/commit/51c8e1a834|2.3.8| |[Hotfix][Connector] Fix kafka consumer log next startup offset (#7312)|https://github.com/apache/seatunnel/commit/891652399e|2.3.7| |[Fix][Connector kafka]Fix Kafka consumer stop fetching after TM node restarted 
(#7233)|https://github.com/apache/seatunnel/commit/7dc3fa8a13|2.3.6| |[Fix][Connector-V2] Fix kafka batch mode can not read all message (#7135)|https://github.com/apache/seatunnel/commit/1784c01a35|2.3.6| |[Feature][connector][kafka] Support read Maxwell format message from kafka #4415 (#4428)|https://github.com/apache/seatunnel/commit/4281b867ac|2.3.6| |[Hotfix][Connector-V2][kafka]Kafka consumer group automatically commits offset logic error fix (#6961)|https://github.com/apache/seatunnel/commit/181f01ee52|2.3.6| |[Improve][CDC] Bump the version of debezium to 1.9.8.Final (#6740)|https://github.com/apache/seatunnel/commit/c3ac953524|2.3.6| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Fix][Kafka-Sink] fix kafka sink factory option rule (#6657)|https://github.com/apache/seatunnel/commit/37578e103f|2.3.5| |[Feature][Connector-V2] Remove useless code for kafka connector (#6157)|https://github.com/apache/seatunnel/commit/0f286d1627|2.3.4| |[Feature] support avro format (#5084)|https://github.com/apache/seatunnel/commit/93a006156d|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Feature][formats][ogg] Support read ogg format message #4201 (#4225)|https://github.com/apache/seatunnel/commit/7728e241e8|2.3.4| |[Improve] Remove all useless `prepare`, `getProducedType` method (#5741)|https://github.com/apache/seatunnel/commit/ed94fffbb9|2.3.4| |[Improve] Add default implement for `SeaTunnelSink::setTypeInfo` (#5682)|https://github.com/apache/seatunnel/commit/86cba87450|2.3.4| |KafkaSource use Factory to create source (#5635)|https://github.com/apache/seatunnel/commit/1c6176e518|2.3.4| |[Improve] Refactor CatalogTable and add 
`SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. (#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Feature][Connector-V2] connector-kafka source support data conversion extracted by kafka connect source (#4516)|https://github.com/apache/seatunnel/commit/bd74989099|2.3.3| |[Feature][connector][kafka] Support read debezium format message from kafka (#5066)|https://github.com/apache/seatunnel/commit/53a1f0c6c1|2.3.3| |[hotfix][kafka] Fix the problem that the partition information cannot be obtained when kafka is restored (#4764)|https://github.com/apache/seatunnel/commit/c203ef5f8d|2.3.2| |Fix the processing bug of abnormal parsing method of kafkaSource format. (#4687)|https://github.com/apache/seatunnel/commit/228257b2e2|2.3.2| |[hotfix][e2e][kafka] Fix the job not stopping (#4600)|https://github.com/apache/seatunnel/commit/93471c9ade|2.3.2| |[Improve][connector][kafka] Set default value for partition option (#4524)|https://github.com/apache/seatunnel/commit/884f733c3d|2.3.2| |[chore] delete unavailable S3 & Kafka Catalogs (#4477)|https://github.com/apache/seatunnel/commit/e0aec5ecec|2.3.2| |[Feature][API] Add options check before create source and sink and transform in FactoryUtil (#4424)|https://github.com/apache/seatunnel/commit/38f1903be2|2.3.2| |[Feature][Connector-V2][Kafka] Kafka source supports data deserialization failure skipping (#4364)|https://github.com/apache/seatunnel/commit/e1ed22b153|2.3.2| |[Bug][Connector-v2][KafkaSource]Fix KafkaConsumerThread exit caused by commit offset error. (#4379)|https://github.com/apache/seatunnel/commit/71f4d0c784|2.3.2| |[Bug][Connector-v2][KafkaSink]Fix the permission problem caused by client.id. (#4246)|https://github.com/apache/seatunnel/commit/3cdb7cfa4d|2.3.2| |Fix KafkaProducer resources have never been released. 
(#4302)|https://github.com/apache/seatunnel/commit/f99f02caa2|2.3.2| |[Improve][CDC] Optimize options & add docs for compatible_debezium_json (#4351)|https://github.com/apache/seatunnel/commit/336f590498|2.3.1| |[Hotfix][Zeta] Fix TaskExecutionService Deploy Failed The Job Can't Stop (#4265)|https://github.com/apache/seatunnel/commit/cf55b070bb|2.3.1| |[Feature][CDC] Support export debezium-json format to kafka (#4339)|https://github.com/apache/seatunnel/commit/5817ec07bf|2.3.1| |[Improve]]Connector-V2\[Kafka] Set kafka consumer default group (#4271)|https://github.com/apache/seatunnel/commit/82c784a3ef|2.3.1| |[chore] Fix the words of `canal` & `kafka` (#4261)|https://github.com/apache/seatunnel/commit/077a8d27a7|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Improve] [Connector-V2] [StarRocks] Starrocks Support Auto Create Table (#4177)|https://github.com/apache/seatunnel/commit/7e0008e6fb|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Imprve][Connector-V2][Hive] Support read text table & Column projection (#4105)|https://github.com/apache/seatunnel/commit/717620f542|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |Add convertor factory (#4119)|https://github.com/apache/seatunnel/commit/cbdea45d95|2.3.1| |Add ElasticSearch catalog (#4108)|https://github.com/apache/seatunnel/commit/9ee4d8394c|2.3.1| |Add Kafka catalog (#4106)|https://github.com/apache/seatunnel/commit/34f1f21e48|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| | [Feature][Json-format][canal] Support read canal format message (#3950)|https://github.com/apache/seatunnel/commit/b80be72c85|2.3.1| |[Improve][Connector-V2][Kafka] Support extract topic from SeaTunnelRow field (#3742)|https://github.com/apache/seatunnel/commit/8aff807305|2.3.1| |[Feature][shade][Jackson] Add seatunnel-jackson module (#3947)|https://github.com/apache/seatunnel/commit/5d8862ec9c|2.3.1| |[Hotfix][Connector-V2][Kafka] Fix the bug that kafka consumer is not close. (#3836)|https://github.com/apache/seatunnel/commit/3447266427|2.3.1| |fix commit kafka offset bug. (#3933)|https://github.com/apache/seatunnel/commit/e60ad938be|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve] [Connector-V2] Change Connector Custom Config Prefix To Map (#3719)|https://github.com/apache/seatunnel/commit/ef1b8b1bb5|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Bug][KafkaSource]Fix the default value of commit_on_checkpoint. 
(#3831)|https://github.com/apache/seatunnel/commit/df969849f6|2.3.1| |[Bug][KafkaSource]Failed to parse offset format (#3810)|https://github.com/apache/seatunnel/commit/8e1196accf|2.3.1| |[Improve] [Connector-V2] Kafka client user configured clientid is preferred (#3783)|https://github.com/apache/seatunnel/commit/aacf0abc04|2.3.1| |[Improve] [Connector-V2] Fix Kafka sink can't run EXACTLY_ONCE semantics (#3724)|https://github.com/apache/seatunnel/commit/5e3f196e29|2.3.0| |[Improve] [Connector-V2] fix kafka admin client can't get property config (#3721)|https://github.com/apache/seatunnel/commit/74c3351700|2.3.0| |[Improve][Connector-V2][Kafka] Add text format for kafka sink connector (#3711)|https://github.com/apache/seatunnel/commit/74bbd76b65|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Kafka]Unified exception for Kafka source and sink connector (#3574)|https://github.com/apache/seatunnel/commit/3b573798db|2.3.0| |options in conditional need add to required or optional options (#3501)|https://github.com/apache/seatunnel/commit/51d5bcba10|2.3.0| |[Improve][Connector-V2-kafka] Support for dynamic discover topic & partition in streaming mode (#3125)|https://github.com/apache/seatunnel/commit/999cfd6069|2.3.0| |[Improve][Connector-V2][Kafka] Support to specify multiple partition keys (#3230)|https://github.com/apache/seatunnel/commit/f65f44f44c|2.3.0| |[Feature][Connector-V2][Kafka] Add Kafka option rules (#3388)|https://github.com/apache/seatunnel/commit/cc0cb8cdb8|2.3.0| |[Improve][Connector-V2][Kafka]Improve kafka metadata code format (#3397)|https://github.com/apache/seatunnel/commit/379da3097f|2.3.0| |[Improve][Connector-V2-kafka] Support setting read starting offset or time at startup config (#3157)|https://github.com/apache/seatunnel/commit/3da19d4444|2.3.0| |update (#3150)|https://github.com/apache/seatunnel/commit/2b44992750|2.3.0-beta| 
|[Feature][connectors-v2][kafka] Kafka supports custom schema #2371 (#2783)|https://github.com/apache/seatunnel/commit/6506e306eb|2.3.0-beta| |[feature][connector][kafka] Support extract partition from SeaTunnelRow fields (#3085)|https://github.com/apache/seatunnel/commit/385e1f42c0|2.3.0-beta| |[Improve][connector][kafka] sink support custom partition (#3041)|https://github.com/apache/seatunnel/commit/ebddc18c41|2.3.0-beta| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Imporve][Connector-V2]Parameter verification for connector V2 kafka sink (#2866)|https://github.com/apache/seatunnel/commit/254223fdb9|2.3.0-beta| |[Connector-V2] [Kafka] Fix Kafka Streaming problem (#2759)|https://github.com/apache/seatunnel/commit/e92e7b7283|2.2.0-beta| |[Improve][Connector-V2] Fix kafka connector (#2745)|https://github.com/apache/seatunnel/commit/90ce3851db|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |StateT of SeaTunnelSource should extend `Serializable` (#2214)|https://github.com/apache/seatunnel/commit/8c426ef850|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-kudu.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Chore] fix typos filed -> field (#9757)|https://github.com/apache/seatunnel/commit/e3e1c67d29|2.3.12| |[Improve][Core] Update apache common to apache common lang3 (#9694)|https://github.com/apache/seatunnel/commit/6e5737c1ec|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[Feature][connector-kudu] implement the filter (#9405)|https://github.com/apache/seatunnel/commit/2714dd1105|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve] kudu options (#9162)|https://github.com/apache/seatunnel/commit/e7edafdbac|2.3.11| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][Transform] Rename sql transform table name from 'fake' to 'dual' (#8298)|https://github.com/apache/seatunnel/commit/e6169684fb|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][API] Unified tables_configs and table_list (#8100)|https://github.com/apache/seatunnel/commit/84c0b8d660|2.3.9| |[Feature][Core] Rename `result_table_name`/`source_table_name` to `plugin_input/plugin_output` (#8072)|https://github.com/apache/seatunnel/commit/c7bbd322db|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |correct the typo of kudu kerberos config 
(#6905)|https://github.com/apache/seatunnel/commit/fcb8554972|2.3.6| |[Fix][KuduCatalogFactory]: Fix KuduCatalogFactory.optionRule() will throw an Exception (#6787)|https://github.com/apache/seatunnel/commit/45a4e1532d|2.3.6| |[Feature][Engine] Unify job env parameters (#6003)|https://github.com/apache/seatunnel/commit/2410ab38f0|2.3.4| |[Feature][Connector-V2] Support multi-table sink feature for kudu (#5951)|https://github.com/apache/seatunnel/commit/82460c0bf0|2.3.4| |[Feature] Add unsupported datatype check for all catalog (#5890)|https://github.com/apache/seatunnel/commit/b9791285a0|2.3.4| |[Feature][Kudu] Support multi-table source read (#5878)|https://github.com/apache/seatunnel/commit/8d9a0b7d11|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on kudu (#5789)|https://github.com/apache/seatunnel/commit/10e791d60a|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Feature][Kudu] Refactor Kudu functionality and Sink support CDC data. (#5437)|https://github.com/apache/seatunnel/commit/22110eb7b3|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][Connector-V2] Fix connector source snapshot state NPE (#4027)|https://github.com/apache/seatunnel/commit/e39c4988cc|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve] [Connector-V2] Bad smell ToArrayCallWithZeroLengthArrayArgument: (#3577)|https://github.com/apache/seatunnel/commit/cc448d98c4|2.3.0| |[Improve][Connector-V2][Kudu] Unified exception for kudu source & sink connector (#3564)|https://github.com/apache/seatunnel/commit/273418ddc9|2.3.0| |[Connector] [Dependency] Add Miss Dependency Cassandra And Change Kudu Plugin Name (#3432)|https://github.com/apache/seatunnel/commit/6ac6a0a0cd|2.3.0| |[Feature][Connector V2] expose configurable options in Kudu (#3365)|https://github.com/apache/seatunnel/commit/c422210e2c|2.3.0| |[Feature][Core][Connector-V2] Unified The way of setting JobName (#2908)|https://github.com/apache/seatunnel/commit/bf2c97484b|2.3.0-beta| |remove duplicate ExceptionUtil class (#3037)|https://github.com/apache/seatunnel/commit/c9dc7c50c2|2.3.0-beta| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Improve][Connector-V2]Kudu Sink Connector Support to upsert row|https://github.com/apache/seatunnel/commit/1ece805ab1|2.3.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Connector-V2] Add Kudu source and sink 
connector (#2254)|https://github.com/apache/seatunnel/commit/0483cbc2df|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-lance.md ================================================
    Change Log | Change | Commit | Version | |--------|--------|---------|
    ================================================ FILE: docs/zh/connectors/changelog/connector-maxcompute.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-milvus.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-mongodb.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[fix][connector-mango] fix split with avgSize zero error (#9255)|https://github.com/apache/seatunnel/commit/564863b933|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][MongoDB] The Long type cannot handle string values in scientific notation (#8783)|https://github.com/apache/seatunnel/commit/00f550e3d0|2.3.11| |[Improve] sink mongodb schema is not required (#8887)|https://github.com/apache/seatunnel/commit/3cfe8c12b9|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Fix][Connector-Mongodb] close MongodbClient when close MongodbReader (#8592)|https://github.com/apache/seatunnel/commit/06b2fc0e06|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Bug][connectors-v2] fix mongodb bson convert exception (#8044)|https://github.com/apache/seatunnel/commit/b222c13f2f|2.3.9| |[Hotfix][Connector-v2] Fix the ClassCastException for connector-mongodb (#7586)|https://github.com/apache/seatunnel/commit/dc43370e8c|2.3.8| |[Improve][Test][Connector-V2][MongoDB] Add few test cases for BsonToRowDataConverters (#7579)|https://github.com/apache/seatunnel/commit/a797041e5d|2.3.8| |[Improve][Connector-V2][MongoDB] A BsonInt32 will be convert to a long type (#7567)|https://github.com/apache/seatunnel/commit/adf26c20c5|2.3.8| |[Improve][Connector-V2][MongoDB] Support to convert to double from any numeric type (#6997)|https://github.com/apache/seatunnel/commit/c5159a2760|2.3.6| |[bugfix][connector-mongodb] fix mongodb null value write 
(#6967)|https://github.com/apache/seatunnel/commit/c5ecda50f8|2.3.6| |[Improve][MongoDB] Implement TableSourceFactory to create mongodb source (#5813)|https://github.com/apache/seatunnel/commit/59cccb6097|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[bugfix][mongodb] Fixed unsupported exception caused by bsonNull (#5659)|https://github.com/apache/seatunnel/commit/cab864aa4d|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Improve][Connector-v2][Mongodb]sink support transaction update/writing (#5034)|https://github.com/apache/seatunnel/commit/b1203c905e|2.3.3| |[Hotfix][Connector-V2][Mongodb] Compatible with historical parameters (#4997)|https://github.com/apache/seatunnel/commit/31db35bee7|2.3.3| |[Improve][Connector-v2][Mongodb]Optimize reading logic (#5001)|https://github.com/apache/seatunnel/commit/830196d8b7|2.3.3| |[Hotfix][Connector-V2][Mongodb] Fix document error content and remove redundant code (#4982)|https://github.com/apache/seatunnel/commit/526197af67|2.3.3| |[Feature][connector-v2][mongodb] mongodb support cdc sink (#4833)|https://github.com/apache/seatunnel/commit/cb651cd7f3|2.3.3| |[Feature][Connector-v2][Mongodb]Refactor mongodb connector (#4620)|https://github.com/apache/seatunnel/commit/5b1a843e40|2.3.2| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse 
(#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Improve] mongodb connector v2 add source query capability (#3697)|https://github.com/apache/seatunnel/commit/8a7fe6fcb6|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][MongoDB] Unified exception for MongoDB source & sink connector (#3522)|https://github.com/apache/seatunnel/commit/5af632e32b|2.3.0| |[Feature][Connector V2] expose configurable options in MongoDB (#3347)|https://github.com/apache/seatunnel/commit/ffd5778efc|2.3.0| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[Improve][Connector-V2] Improve mongodb connector (#2778)|https://github.com/apache/seatunnel/commit/efbf793fa5|2.2.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Feature][Connector-V2] Add mongodb connecter sink (#2694)|https://github.com/apache/seatunnel/commit/51c28a3387|2.2.0-beta| |[Feature][Connector-V2] Add mongodb connecter source (#2596)|https://github.com/apache/seatunnel/commit/3ee8a8a619|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-neo4j.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] neo4j options (#9164)|https://github.com/apache/seatunnel/commit/1eb81e7f88|2.3.11| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Feature][Core] Upgrade flink source translation (#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve] Documentation and partial word optimization. (#4936)|https://github.com/apache/seatunnel/commit/6e8de0e2a6|2.3.3| |[Improve][connector-V2-Neo4j]Supports neo4j sink batch write and update docs (#4841)|https://github.com/apache/seatunnel/commit/580276a8bd|2.3.3| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Neo4j] Unified exception for Neo4j source & sink connector (#3565)|https://github.com/apache/seatunnel/commit/58584eefb1|2.3.0| |[Feature][Connector][Neo4j] expose configurable options in Neo4j (#3342)|https://github.com/apache/seatunnel/commit/efa04b38fe|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Feature][Connector-v2] Neo4j source connector (#2777)|https://github.com/apache/seatunnel/commit/38b0daf8b7|2.3.0| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Feature][Connector-v2] Neo4j sink connector (#2434)|https://github.com/apache/seatunnel/commit/950b27d132|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-openmldb.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] openmldb options (#9166)|https://github.com/apache/seatunnel/commit/d324fc59a4|2.3.11| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Conenctor-V2] Add openmldb source connector (#3313)|https://github.com/apache/seatunnel/commit/e68ecf7bef|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-paimon.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Connectors-v2] Clean up temporary files for paimon sink (#9819)|https://github.com/apache/seatunnel/commit/c43d57de31| dev | |[Feature][Connector-v2] Support multi paimon source (#9759)|https://github.com/apache/seatunnel/commit/0d52102241|2.3.12| |[Chore] fix typos filed -> field (#9757)|https://github.com/apache/seatunnel/commit/e3e1c67d29|2.3.12| |[Feature][connector-paimon] Paimon connector supports paimon privilege (#9722)|https://github.com/apache/seatunnel/commit/b2bb2f8d78|2.3.12| |[Improve][Core] Update apache common to apache common lang3 (#9694)|https://github.com/apache/seatunnel/commit/6e5737c1ec|2.3.12| |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[fix][connectors-v2] repeated commit cause task exceptions (#9665)|https://github.com/apache/seatunnel/commit/085023ad0d|2.3.12| |[Improve][Connector-V2] Support like predicate pushdown in paimon (#9653)|https://github.com/apache/seatunnel/commit/9e01c84e76|2.3.12| |[Feature][Connectors-v2]Paimon version upgrade to 1.1.1 (#8074)|https://github.com/apache/seatunnel/commit/96b26a68dc|2.3.12| |[Fix][Connectors-v2] fix dynamic bucket for paimon sink (#9595)|https://github.com/apache/seatunnel/commit/d29a531a48|2.3.12| |[Feature][Connector-V2] Support like predicate pushdown in paimon (#9484)|https://github.com/apache/seatunnel/commit/a19720ccf6|2.3.12| |[Fix][Connector-V2] Update waitCompaction value for batch mode and writeonly (#9479)|https://github.com/apache/seatunnel/commit/63993a6197|2.3.12| |[Future][Connector-V2]Support the automatic creation of non-primary key table (#9219)|https://github.com/apache/seatunnel/commit/93e539cc9f|2.3.12| |[Fix][Connector-V2] Optimize Paimon DECIMAL type check to prevent precision loss (#9480)|https://github.com/apache/seatunnel/commit/c114682a6b|2.3.12| |[Bug][Connector-V2] 
fix NPE when decimal type precision is incompatible for Paimon (#9452)|https://github.com/apache/seatunnel/commit/37762c93f0|2.3.12| |[feature][connectors-v2] Support in predicate pushdown in paimon (#9379)|https://github.com/apache/seatunnel/commit/1ec43755d5|2.3.12| |[Improve][Connector-V2] Fix the word misspellings for paimon connector (#9332)|https://github.com/apache/seatunnel/commit/ba7f5c9e30|2.3.11| |[Feature][Transform] Support define sink column type (#9114)|https://github.com/apache/seatunnel/commit/ab7119e507|2.3.11| |[improve] paimon options (#9167)|https://github.com/apache/seatunnel/commit/b0889305c2|2.3.11| |[Fix][Paimon] nullable and comment attribute was lost during automatic table creation (#9020)|https://github.com/apache/seatunnel/commit/eb54fdd52c|2.3.11| |[Feature][Connector-V2] Support between predicate pushdown in paimon (#8962)|https://github.com/apache/seatunnel/commit/3b141cf621|2.3.10| |[Feature][Connector-V2] Suppor Time type in paimon connector (#8880)|https://github.com/apache/seatunnel/commit/9f1e590091|2.3.10| |[Feature][Paimon] Customize the hadoop user (#8888)|https://github.com/apache/seatunnel/commit/2657626f93|2.3.10| |[Improve][Connector-v2][Paimon]PaimonCatalog close error message update (#8640)|https://github.com/apache/seatunnel/commit/48253da8d6|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][Connector-v2] Support checkpoint in batch mode for paimon sink (#8333)|https://github.com/apache/seatunnel/commit/f22d4ebd4d|2.3.9| |[Feature][Connector-v2] Support schema evolution for paimon sink (#8211)|https://github.com/apache/seatunnel/commit/57190e2a3b|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Connector-v2] Support S3 filesystem of paimon connector (#8036)|https://github.com/apache/seatunnel/commit/e2a4772933|2.3.9| |[Feature][transform] transform support 
explode (#7928)|https://github.com/apache/seatunnel/commit/132278c06a|2.3.9| |[Feature][Connector-V2] Piamon Sink supports changelog-procuder is lookup and full-compaction mode (#7834)|https://github.com/apache/seatunnel/commit/c0f27c2f76|2.3.9| |[Fix][connector-v2]Fix Paimon table connector Error log information. (#7873)|https://github.com/apache/seatunnel/commit/a3b49e6354|2.3.9| |[Improve][Connector-v2] Use checkpointId as the commit's identifier instead of the hash for streaming write of paimon sink (#7835)|https://github.com/apache/seatunnel/commit/c7a384af2b|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Fix][Connecotr-V2] Fix paimon dynamic bucket tale in primary key is not first (#7728)|https://github.com/apache/seatunnel/commit/dc7f695537|2.3.8| |[Improve][Connector-v2] Remove useless code and add changelog doc for paimon sink (#7748)|https://github.com/apache/seatunnel/commit/846d876dc2|2.3.8| |[Hotfix][Connector-V2] Release resources even the task is crashed for paimon sink (#7726)|https://github.com/apache/seatunnel/commit/5ddf8d461e|2.3.8| |[Fix][Connector-V2] Fix paimon e2e error (#7721)|https://github.com/apache/seatunnel/commit/61d1964361|2.3.8| |[Feature][Connector-Paimon] Support dynamic bucket splitting improves Paimon writing efficiency (#7335)|https://github.com/apache/seatunnel/commit/bc0326cba8|2.3.8| |[Feature][Connector-v2] Support streaming read for paimon (#7681)|https://github.com/apache/seatunnel/commit/4a2e27291c|2.3.8| |[Hotfix][Seatunnel-common] Fix the CommonError msg for paimon sink (#7591)|https://github.com/apache/seatunnel/commit/d1f5db9257|2.3.8| |[Feature][CONNECTORS-V2-Paimon] Paimon Sink supported truncate table (#7560)|https://github.com/apache/seatunnel/commit/4f3df22124|2.3.8| |[Improve][Connector-v2] Improve the exception msg in case-sensitive case for paimon sink 
(#7549)|https://github.com/apache/seatunnel/commit/7d31e5668c|2.3.8| |[Hotfix][Connector-V2] Fixed lost data precision for decimal data types (#7527)|https://github.com/apache/seatunnel/commit/df210ea73d|2.3.8| |[Improve][API] Move catalog open to SaveModeHandler (#7439)|https://github.com/apache/seatunnel/commit/8c2c5c79a1|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |The isNullable attribute is true when the primary key field in the Paimon table converts the Column object. #7231 (#7242)|https://github.com/apache/seatunnel/commit/b0fe432e99|2.3.6| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Paimon]support projection for paimon source (#6343)|https://github.com/apache/seatunnel/commit/6c1577267f|2.3.6| |[Improve][Paimon] Add check for the base type between source and sink before write. 
(#6953)|https://github.com/apache/seatunnel/commit/d56d64fc04|2.3.6| |[Improve][Connector-V2] Improve the paimon source (#6887)|https://github.com/apache/seatunnel/commit/658643ae53|2.3.6| |[Hotfix][Connector-V2] Close the tableWrite when task is close (#6897)|https://github.com/apache/seatunnel/commit/23a744b9b2|2.3.6| |[Fix][Connector-V2] Field information lost during Paimon DataType and SeaTunnel Column conversion (#6767)|https://github.com/apache/seatunnel/commit/6cf6e41da7|2.3.6| |[Improve][Connector-V2] Support hive catalog for paimon sink (#6833)|https://github.com/apache/seatunnel/commit/4969c91dc4|2.3.6| |[Hotfix][Connector-V2] Fix the batch write with paimon (#6865)|https://github.com/apache/seatunnel/commit/9ec971d942|2.3.6| |[Feature][Doris] Add Doris type converter (#6354)|https://github.com/apache/seatunnel/commit/5189991843|2.3.6| |[Improve][Connector-V2] Support hadoop ha and kerberos for paimon sink (#6585)|https://github.com/apache/seatunnel/commit/20b62f3bf3|2.3.5| |[Feature][Paimon] Support specify paimon table write properties, partition keys and primary keys (#6535)|https://github.com/apache/seatunnel/commit/2b1234c7ae|2.3.5| |[Feature][Connector-V2] Support multi-table sink feature for paimon #5652 (#6449)|https://github.com/apache/seatunnel/commit/b0abbd2d89|2.3.5| |[Feature][Connectors-v2-Paimon] Adaptation Paimon 0.6 Version (#6061)|https://github.com/apache/seatunnel/commit/b32df930e9|2.3.4| |[Fix] [Connectors-v2-Paimon] Flink table store failed to prepare commit (#6057)|https://github.com/apache/seatunnel/commit/c8dcefc3be|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Hotfix][Connector-V2][Paimon] Bump paimon-bundle version to 0.4.0-incubating 
(#5219)|https://github.com/apache/seatunnel/commit/2917542bfa|2.3.3| |[Improve] Documentation and partial word optimization. (#4936)|https://github.com/apache/seatunnel/commit/6e8de0e2a6|2.3.3| |[Connector-V2][Paimon] Introduce paimon connector (#4178)|https://github.com/apache/seatunnel/commit/da507bbe0e|2.3.2|
    ================================================ FILE: docs/zh/connectors/changelog/connector-prometheus.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-pulsar.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][API] Optimize the enumerator API semantics and reduce lock calls at the connector level (#9671)|https://github.com/apache/seatunnel/commit/9212a77140|2.3.12| |[improve] pulsar options (#9180)|https://github.com/apache/seatunnel/commit/26a2160c80|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][API] Make sure the table name in TablePath not be null (#7252)|https://github.com/apache/seatunnel/commit/764d8b0bc8|2.3.7| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[PulsarSource]Improve pulsar throughput performance. (#6234)|https://github.com/apache/seatunnel/commit/37461f4f3e|2.3.4| |[Feature][Connector-v2][PulsarSink]Add Pulsar Sink Connector. (#4382)|https://github.com/apache/seatunnel/commit/543d2c5086|2.3.4| |[Chore] Remove useless DeserializationFormatFactory and its implement (#5880)|https://github.com/apache/seatunnel/commit/f0511544ff|2.3.4| |fix: update IDENTIFIER = Pulsar for pulsar-datasource on project:seatunnel-web (#5852)|https://github.com/apache/seatunnel/commit/3b6de3743e|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |[Feature][Json-format] support read format for pulsar (#4111)|https://github.com/apache/seatunnel/commit/7d61ae93e7|2.3.2| |[hotfix][pulsar] Fix the bug that can't consume messages all the time. (#4125)|https://github.com/apache/seatunnel/commit/a6705cc5bf|2.3.2| |[Feature] add cdc multiple table support & fix zeta bug|https://github.com/apache/seatunnel/commit/533ff2c2fa|2.3.1| |[hotfix][pulsar] PulsarSource consumer ack exception. (#4237)|https://github.com/apache/seatunnel/commit/9725d675da|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Improve][Connector-v2][Pulsar] Set the name of the pulsar consumption thread. (#4182)|https://github.com/apache/seatunnel/commit/e567203f7d|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Bug][Connector-v2][PulsarSource]Fix pulsar option topic-pattern bug. 
(#3989)|https://github.com/apache/seatunnel/commit/aee2c580ea|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Improve][Connector-V2][Pulsar] Unified exception for Pulsar source &… (#3590)|https://github.com/apache/seatunnel/commit/4fe9323419|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Hotfix][Connector-V2][Pulsar] fix conditional options (#3504)|https://github.com/apache/seatunnel/commit/0066affacf|2.3.0| |[Feature][Connector][pulsar] expose configurable options in Pulsar (#3341)|https://github.com/apache/seatunnel/commit/200faa7c29|2.3.0| |[Connector] [Dependency] Add Miss Dependency Cassandra And Change Kudu Plugin Name (#3432)|https://github.com/apache/seatunnel/commit/6ac6a0a0cd|2.3.0| |[chore] fix pulsar consumer comment error (#3356)|https://github.com/apache/seatunnel/commit/91e632c526|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[hotfix][connector][pulsar] Fix not being able to mark #noMoreNewSplits when restoring (#2945)|https://github.com/apache/seatunnel/commit/5ad69076b3|2.3.0-beta| |Move Handover to common module (#2877)|https://github.com/apache/seatunnel/commit/d94a874bcb|2.3.0-beta| |[hotfix][connector-v2] fix pulsar source exceptions (#2820)|https://github.com/apache/seatunnel/commit/8ff0ba7015|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[SeaTunnel]Simply seatunnel package pipeline. 
(#2563)|https://github.com/apache/seatunnel/commit/9d88b6221a|2.2.0-beta| |[Improve][Connector-V2] Pulsar support user-defined schema (#2436)|https://github.com/apache/seatunnel/commit/16cabe6a35|2.2.0-beta| |[improve][UT] Upgrade junit to 5.+ (#2305)|https://github.com/apache/seatunnel/commit/362319ff3e|2.2.0-beta| |StateT of SeaTunnelSource should extend `Serializable` (#2214)|https://github.com/apache/seatunnel/commit/8c426ef850|2.2.0-beta| |[doc][connector-v2] pulsar source options doc (#2128)|https://github.com/apache/seatunnel/commit/59ce8a2b32|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-qdrant.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-rabbitmq.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Fix][connector-rabbitmq] Set default value for durable, exclusive and auto-delete (#9631)|https://github.com/apache/seatunnel/commit/5f9492e62a|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve] rabbit mq options (#8740)|https://github.com/apache/seatunnel/commit/4eec9be012|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Feature][Rabbitmq] Allow configuration of queue durability and deletion policy (#7365)|https://github.com/apache/seatunnel/commit/aabfc8eb78|2.3.8| |[Hotfix][connector-v2-rabbit] fix rabbit checkpoint exception in Flink mode (#7108)|https://github.com/apache/seatunnel/commit/423a7b142b|2.3.6| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Bugfix][connector-v2][rabbitmq] Fix reduplicate ack msg bug and code style (#4842)|https://github.com/apache/seatunnel/commit/985fb6642a|2.3.2| |[Hotfix][E2E] Fix RabbitmqIT (#4593)|https://github.com/apache/seatunnel/commit/9bd5403d71|2.3.2| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless 
plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Improve] [Connector-V2] Change Connector Custom Config Prefix To Map (#3719)|https://github.com/apache/seatunnel/commit/ef1b8b1bb5|2.3.1| |[Feature][API & Connector & Doc] add parallelism and column projection interface (#3829)|https://github.com/apache/seatunnel/commit/b9164b8ba1|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| | [Feature][Connector-V2][RabbitMQ] Add RabbitMQ source & sink connector (#3312)|https://github.com/apache/seatunnel/commit/4b12691a8d|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-redis.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve][Connector-V2] Use key_field_name option when reading Redis hash data (#9642)|https://github.com/apache/seatunnel/commit/5d214a7305|2.3.12| |[Feature][Redis] Add redis key into the result record (#9574)|https://github.com/apache/seatunnel/commit/6e8b7c5da5|2.3.12| |[Fix][Connector-Redis] Redis did not write successfully, but the task did not fail (#9055)|https://github.com/apache/seatunnel/commit/07510ed937|2.3.11| |[hotfix][redis] fix npe cause by null host parameter (#8881)|https://github.com/apache/seatunnel/commit/7bd5865165|2.3.10| |[Improve][Redis] Optimized Redis connection params (#8841)|https://github.com/apache/seatunnel/commit/e56f06cdf0|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[improve] update Redis connector config option (#8631)|https://github.com/apache/seatunnel/commit/f1c313eea6|2.3.10| |[Feature][Redis] Flush data when the time reaches checkpoint.interval and update test case (#8308)|https://github.com/apache/seatunnel/commit/e15757bcd7|2.3.9| |Revert "[Feature][Redis] Flush data when the time reaches checkpoint interval" and "[Feature][CDC] Add 'schema-changes.enabled' options" (#8278)|https://github.com/apache/seatunnel/commit/fcb2938286|2.3.9| |[Feature][Redis] Flush data when the time reaches checkpoint.interval (#8198)|https://github.com/apache/seatunnel/commit/2e24941e6a|2.3.9| |[Hotfix] Fix redis sink NPE (#8171)|https://github.com/apache/seatunnel/commit/6b9074e769|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Feature] [Connector-Redis] Redis connector support delete data (#7994)|https://github.com/apache/seatunnel/commit/02a35c3979|2.3.9| |[Improve][Connector-V2] Redis support custom key and value (#7888)|https://github.com/apache/seatunnel/commit/ef2c3c7283|2.3.9| |[Feature][Restapi] Allow metrics information to be 
associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[improve][Redis]Redis scan command supports versions 5, 6, 7 (#7666)|https://github.com/apache/seatunnel/commit/6e70cbe334|2.3.8| |[Improve][Connector] Add multi-table sink option check (#7360)|https://github.com/apache/seatunnel/commit/2489f6446b|2.3.7| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Improve][Redis] Redis reader use scan cammnd instead of keys, single mode reader/writer support batch (#7087)|https://github.com/apache/seatunnel/commit/be37f05c07|2.3.6| |[Feature][Kafka] Support multi-table source read (#5992)|https://github.com/apache/seatunnel/commit/60104602d1|2.3.6| |[Improve][Connector-V2]Support multi-table sink feature for redis (#6314)|https://github.com/apache/seatunnel/commit/fed89ae3fc|2.3.5| |[Feature][Core] Upgrade flink source translation (#5100)|https://github.com/apache/seatunnel/commit/5aabb14a94|2.3.4| |[Feature][Connector-V2] Support TableSourceFactory/TableSinkFactory on redis (#5901)|https://github.com/apache/seatunnel/commit/e84dcb8c10|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][Connector-v2][Redis] Redis support select db (#5570)|https://github.com/apache/seatunnel/commit/77fbbbd0ee|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Feature][Connector-v2][RedisSink]Support redis to set expiration time. 
(#4975)|https://github.com/apache/seatunnel/commit/b5321ff1d2|2.3.3| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Redis] Unified exception for redis source & sink exception (#3517)|https://github.com/apache/seatunnel/commit/205f782585|2.3.0| |options in conditional need add to required or optional options (#3501)|https://github.com/apache/seatunnel/commit/51d5bcba10|2.3.0| |[feature][api] add option validation for the ReadonlyConfig (#3417)|https://github.com/apache/seatunnel/commit/4f824fea36|2.3.0| |[Feature][Redis Connector V2] Add Redis Connector Option Rules & Improve Redis Connector doc (#3320)|https://github.com/apache/seatunnel/commit/1c10aacb30|2.3.0| |[Connector-V2] [ElasticSearch] Add ElasticSearch Source/Sink Factory (#3325)|https://github.com/apache/seatunnel/commit/38254e3f26|2.3.0| |[Improve][Connector-V2][Redis] Support redis cluster connection & user authentication (#3188)|https://github.com/apache/seatunnel/commit/c7275a49cc|2.3.0| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[Feature][Connector-V2] Add redis sink connector 
(#2647)|https://github.com/apache/seatunnel/commit/71a9e4b019|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Feature][Connector-V2] Add redis source connector (#2569)|https://github.com/apache/seatunnel/commit/405f7d6f99|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-rocketmq.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-s3-redshift.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-selectdb-cloud.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-sensorsdata.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-sentry.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] sentry options (#9261)|https://github.com/apache/seatunnel/commit/4a2f3fa915|2.3.11| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Sentry] Unified exception for sentry sink connector (#3513)|https://github.com/apache/seatunnel/commit/94b472b806|2.3.0| |[Connector] [Dependency] Add Miss Dependency Cassandra And Change Kudu Plugin Name (#3432)|https://github.com/apache/seatunnel/commit/6ac6a0a0cd|2.3.0| |[Feature][Sentry Sink V2] Add Sentry Sink Option Rules (#3318)|https://github.com/apache/seatunnel/commit/850f483816|2.3.0| |[Feature][Connector-V2] Add sentry sink connector #2244 (#2584)|https://github.com/apache/seatunnel/commit/9fd40390a7|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-slack.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] Slack connector options (#8738)|https://github.com/apache/seatunnel/commit/eb706743fe|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Feature][Connector-V2][Slack] Add Slack sink connector (#3226)|https://github.com/apache/seatunnel/commit/7a836f2d44|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-sls.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-socket.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[improve] socket options (#9517)|https://github.com/apache/seatunnel/commit/af83a302cf|2.3.12| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector] add get source method to all source connector (#3846)|https://github.com/apache/seatunnel/commit/417178fb84|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][Socket] Unified exception for socket source & sink connector (#3511)|https://github.com/apache/seatunnel/commit/581292f210|2.3.0| |[feature][connector][socket] Add Socket Connector Option Rules (#3317)|https://github.com/apache/seatunnel/commit/b85317bcbe|2.3.0| |[Improve][all] change Log to @Slf4j (#3001)|https://github.com/apache/seatunnel/commit/6016100f12|2.3.0-beta| |[DEV][Api] Replace SeaTunnelContext with JobContext and remove singleton pattern (#2706)|https://github.com/apache/seatunnel/commit/cbf82f755c|2.2.0-beta| |[#2606]Dependency management split (#2630)|https://github.com/apache/seatunnel/commit/fc047be69b|2.2.0-beta| |[Feature][Connector-V2] Socket Connector Sink (#2549)|https://github.com/apache/seatunnel/commit/94f4600a4e|2.2.0-beta| |[api-draft][Optimize] Optimize module name (#2062)|https://github.com/apache/seatunnel/commit/f79e3112b1|2.2.0-beta|
    ================================================ FILE: docs/zh/connectors/changelog/connector-starrocks.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Feature][Core] Add plugin directory support for each connector (#9650)|https://github.com/apache/seatunnel/commit/4beb2b9336|2.3.12| |[Fix][Doc] Update StarRocks doc change schema necessity to true (#9656)|https://github.com/apache/seatunnel/commit/45f8ac6d1d|2.3.12| |[improve] jdbc options (#9541)|https://github.com/apache/seatunnel/commit/d041e5fb32|2.3.12| |[Fix][Connector-V2] Fix starrocks decimal column definition generation(#9470) (#9471)|https://github.com/apache/seatunnel/commit/64b8f1752e|2.3.12| |[Bugfix][Starrocks] Fix starrocks batch data exceeds the maximum limit (#9256)|https://github.com/apache/seatunnel/commit/84634a4d1f|2.3.11| |[Improve][Starrocks] Catch lable already exception (#9222)|https://github.com/apache/seatunnel/commit/b6fc222c0a|2.3.11| |[Feature][Transform] Support define sink column type (#9114)|https://github.com/apache/seatunnel/commit/ab7119e507|2.3.11| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Fix][Connector-V2] Fixed missing timestamp accuracy of starrocks connector (#9096)|https://github.com/apache/seatunnel/commit/02254b9c0e|2.3.11| |[Fix][Connector-V2] Fix StarRocksCatalogTest#testCatalog() NPE (#8987)|https://github.com/apache/seatunnel/commit/53f0a9eb52|2.3.10| |[Improve][Connector-V2] Random pick the starrocks fe address which can be connected (#8898)|https://github.com/apache/seatunnel/commit/bef76078f9|2.3.10| |[Feature][Connector-v2] Support multi starrocks source (#8789)|https://github.com/apache/seatunnel/commit/26b5529aaf|2.3.10| |[Fix][Connector-V2] Fix possible data loss in scenarios of request_tablet_size is less than the number of BUCKETS (#8768)|https://github.com/apache/seatunnel/commit/3c6f216135|2.3.10| |[Fix][Connector-V2]Fix Descriptions for CUSTOM_SQL in Connector 
(#8778)|https://github.com/apache/seatunnel/commit/96b610eb7e|2.3.10| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[improve] add StarRocks options (#8639)|https://github.com/apache/seatunnel/commit/da8d9cbd35|2.3.10| |[Fix][Connector-V2] fix starRocks automatically creates tables with comment (#8568)|https://github.com/apache/seatunnel/commit/c4cb1fc4a3|2.3.10| |[Fix][Connector-V2] Fixed adding table comments (#8514)|https://github.com/apache/seatunnel/commit/edca75b0d6|2.3.10| |[Feature][Connector-V2] Starrocks implements multi table sink (#8467)|https://github.com/apache/seatunnel/commit/55eebfa8af|2.3.9| |[Improve][Connector-V2] Add pre-check starrocks version before exeucte alter table field name (#8237)|https://github.com/apache/seatunnel/commit/c24e3b12ba|2.3.9| |[Fix][Connector-starrocks] Fix drop column bug for starrocks (#8216)|https://github.com/apache/seatunnel/commit/082814da1f|2.3.9| |[Feature][Core] Support read arrow data (#8137)|https://github.com/apache/seatunnel/commit/4710ea0f8d|2.3.9| |[Feature][Clickhouse] Support sink savemode (#8086)|https://github.com/apache/seatunnel/commit/e6f92fd79b|2.3.9| |[Feature][Connector-V2] StarRocks-sink support schema evolution (#8082)|https://github.com/apache/seatunnel/commit/d33b0da8ab|2.3.9| |[Improve][dist]add shade check rule (#8136)|https://github.com/apache/seatunnel/commit/51ef800016|2.3.9| |[Improve][Connector-V2] Add doris/starrocks create table with comment (#7847)|https://github.com/apache/seatunnel/commit/207b8c16fd|2.3.9| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| |[Improve][API] Move catalog open to SaveModeHandler (#7439)|https://github.com/apache/seatunnel/commit/8c2c5c79a1|2.3.8| |[Improve][Connector-V2] Reuse connection in StarRocksCatalog (#7342)|https://github.com/apache/seatunnel/commit/8ee129d20f|2.3.8| 
|[Improve][Connector-V2] Remove system table limit (#7391)|https://github.com/apache/seatunnel/commit/adf888e008|2.3.8| |[Improve][Connector-V2] Close all ResultSet after used (#7389)|https://github.com/apache/seatunnel/commit/853e973212|2.3.8| |[Feature][Core] Support using upstream table placeholders in sink options and auto replacement (#7131)|https://github.com/apache/seatunnel/commit/c4ca74122c|2.3.6| |[Fix][Connector-V2] Fix starrocks Content-Length header already present error (#7034)|https://github.com/apache/seatunnel/commit/a485a74eff|2.3.6| |[Feature][Connector-V2]Support StarRocks Fe Node HA|https://github.com/apache/seatunnel/commit/9c36c45819|2.3.6| |[Fix][Connector-v2] Fix the sql statement error of create table for doris and starrocks (#6679)|https://github.com/apache/seatunnel/commit/88263cd69f|2.3.6| |[Fix][StarRocks] Fix NPE when upstream catalogtable table path only have table name part (#6540)|https://github.com/apache/seatunnel/commit/5795b265cc|2.3.5| |[Fix][Connector-V2] Fixed doris/starrocks create table sql parse error (#6580)|https://github.com/apache/seatunnel/commit/f2ed1fbde0|2.3.5| |[Fix][Connector-V2] Fix connector support SPI but without no args constructor (#6551)|https://github.com/apache/seatunnel/commit/5f3c9c36a5|2.3.5| |[Improve] Add SaveMode log of process detail (#6375)|https://github.com/apache/seatunnel/commit/b0d70ce224|2.3.5| |[Improve][Connector-V2] Support TableSourceFactory on StarRocks (#6498)|https://github.com/apache/seatunnel/commit/aded56299c|2.3.5| |[Improve] StarRocksSourceReader use the existing client (#6480)|https://github.com/apache/seatunnel/commit/1a02c571a9|2.3.5| |[Improve][API] Unify type system api(data & type) (#5872)|https://github.com/apache/seatunnel/commit/b38c7edcc9|2.3.5| |[Feature][Connector] add starrocks save_mode (#6029)|https://github.com/apache/seatunnel/commit/66b0f1e1d2|2.3.4| |[Feature] Add unsupported datatype check for all catalog 
(#5890)|https://github.com/apache/seatunnel/commit/b9791285a0|2.3.4| |[Improve] StarRocks support create table template with unique key (#5905)|https://github.com/apache/seatunnel/commit/25b01125e4|2.3.4| |[Improve][StarRocksSink] add http socket timeout. (#5918)|https://github.com/apache/seatunnel/commit/febdb262b6|2.3.4| |[Improve] Support create varchar field type in StarRocks (#5911)|https://github.com/apache/seatunnel/commit/6025895167|2.3.4| |[Improve]Change System.out.println to log output. (#5912)|https://github.com/apache/seatunnel/commit/bbedb07a9c|2.3.4| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |[Improve][Connector] Add field name to `DataTypeConvertor` to improve error message (#5782)|https://github.com/apache/seatunnel/commit/ab60790f0d|2.3.4| |[feature][connector-jdbc]Add Save Mode function and Connector-JDBC (MySQL) connector has been realized (#5663)|https://github.com/apache/seatunnel/commit/eff17ccbe5|2.3.4| |[Improve] Add default implement for `SeaTunnelSink::setTypeInfo` (#5682)|https://github.com/apache/seatunnel/commit/86cba87450|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve] Refactor CatalogTable and add `SeaTunnelSource::getProducedCatalogTables` (#5562)|https://github.com/apache/seatunnel/commit/41173357f8|2.3.4| |[Hotfix][Connector-V2][StarRocks] fix starrocks template sql parser #5071 (#5332)|https://github.com/apache/seatunnel/commit/23d79b0d17|2.3.4| |[Improve] [Connector-V2] Remove scheduler in StarRocks sink (#5269)|https://github.com/apache/seatunnel/commit/cb7b794914|2.3.4| |[Improve][CheckStyle] Remove useless 'SuppressWarnings' annotation of checkstyle. 
(#5260)|https://github.com/apache/seatunnel/commit/51c0d709ba|2.3.4| |[Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284)|https://github.com/apache/seatunnel/commit/ed5eadcf73|2.3.3| |Fix StarRocksJsonSerializer will transform array/map/row to string (#5281)|https://github.com/apache/seatunnel/commit/f941953774|2.3.3| |[Improve] Improve savemode api (#4767)|https://github.com/apache/seatunnel/commit/4acd370d48|2.3.3| |[Improve] [Connector-V2] Improve StarRocks Auto Create Table To Support Use Primary Key Template In Field (#4487)|https://github.com/apache/seatunnel/commit/e601cd4c37|2.3.2| |Revert "[Improve][Catalog] refactor catalog (#4540)" (#4628)|https://github.com/apache/seatunnel/commit/2d1933195d|2.3.2| |[hotfix][starrocks] fix error on get starrocks source typeInfo (#4619)|https://github.com/apache/seatunnel/commit/f7b094f9eb|2.3.2| |[Improve][Catalog] refactor catalog (#4540)|https://github.com/apache/seatunnel/commit/b0a701cb83|2.3.2| |[Improve] [Connector-V2] Throw StarRocks Serialize Error To Client (#4484)|https://github.com/apache/seatunnel/commit/e2c107323b|2.3.2| |[Improve] [Connector-V2] Improve StarRocks Serialize Error Message (#4458)|https://github.com/apache/seatunnel/commit/465e75cbf5|2.3.2| |[Hotfix][Zeta] Adapt StarRocks With Multi-Table And Single-Table Mode (#4324)|https://github.com/apache/seatunnel/commit/c11c171d36|2.3.1| |[improve][zeta] fix zeta bugs|https://github.com/apache/seatunnel/commit/3a82e8b39f|2.3.1| |[Improve] [Zeta] Improve Client Job Info Message|https://github.com/apache/seatunnel/commit/56febf0118|2.3.1| |[Fix] [Connector-V2] Fix StarRocksSink Without Format Field In Header|https://github.com/apache/seatunnel/commit/463ae6437e|2.3.1| |[Improve] Support StarRocksCatalog Use JDBC URL With Custom Suffix|https://github.com/apache/seatunnel/commit/d00ced6ecd|2.3.1| |[Improve] Support MySqlCatalog Use JDBC URL With Custom Suffix|https://github.com/apache/seatunnel/commit/210d0ff1f8|2.3.1| 
|[Improve] Change StarRocks Sink Default Format To Json|https://github.com/apache/seatunnel/commit/8703357830|2.3.1| |[Fix] Fix StarRocks Default Url Can't Use|https://github.com/apache/seatunnel/commit/67c45d353a|2.3.1| |[hotfix] fixed schema options import error|https://github.com/apache/seatunnel/commit/656805f2df|2.3.1| |[chore] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/291214ad6f|2.3.1| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[Fix] Fix StarRocks Default Url Can't Use (#4229)|https://github.com/apache/seatunnel/commit/ed74d11090|2.3.1| |[Bug] Remove StarRocks Auto Creat Table Default Value (#4220)|https://github.com/apache/seatunnel/commit/80b5cd40ae|2.3.1| |[Feature] Add SaveMode For StarRocks (#4217)|https://github.com/apache/seatunnel/commit/0674f10a53|2.3.1| |[Improve] Improve StarRocks Catalog Base Url (#4215)|https://github.com/apache/seatunnel/commit/6632a40473|2.3.1| |[Improve] Improve StarRocks Sink Config (#4212)|https://github.com/apache/seatunnel/commit/8d5712c1db|2.3.1| |[Hotfix][Zeta] keep deleteCheckpoint method synchronized (#4209)|https://github.com/apache/seatunnel/commit/061f9b5872|2.3.1| |[Improve] Improve StarRocks Auto Create Table (#4208)|https://github.com/apache/seatunnel/commit/bc9cd6bf69|2.3.1| |[hotfix][zeta] fix zeta multi-table parser error (#4193)|https://github.com/apache/seatunnel/commit/98f2ad0c19|2.3.1| |[feature][starrocks] add StarRocks factories (#4191)|https://github.com/apache/seatunnel/commit/c485d887ec|2.3.1| |[Feature] Change StarRocks CreatTable Template (#4184)|https://github.com/apache/seatunnel/commit/4cf07f3beb|2.3.1| |[Feature][Connector-V2] StarRocks source connector (#3679)|https://github.com/apache/seatunnel/commit/9681173b10|2.3.1| |[Improve] [Connector-V2] [StarRocks] Starrocks Support Auto Create Table 
(#4177)|https://github.com/apache/seatunnel/commit/7e0008e6fb|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. (#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Feature][Connector-v2][StarRocks] Support write cdc changelog event(INSERT/UPDATE/DELETE) (#3865)|https://github.com/apache/seatunnel/commit/8e3d158c03|2.3.1| |[Improve] [Connector-V2] Change Connector Custom Config Prefix To Map (#3719)|https://github.com/apache/seatunnel/commit/ef1b8b1bb5|2.3.1| |[Improve][Connector-V2][StarRocks] Unified exception for StarRocks source and sink (#3593)|https://github.com/apache/seatunnel/commit/612d0297a0|2.3.0| |[Improve][Connector-V2][StarRocks] Delete the Mapper may not be used (#3579)|https://github.com/apache/seatunnel/commit/1e868ecf28|2.3.0| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][StarRocks]Add StarRocks connector option rules (#3402)|https://github.com/apache/seatunnel/commit/5d187f69b7|2.3.0| |[Bugfix][Connector-V2][StarRocks]Fix StarRocks StreamLoad retry bug and fix doc (#3406)|https://github.com/apache/seatunnel/commit/071f9aa055|2.3.0| |[Feature][Connector-V2] Starrocks sink connector (#3164)|https://github.com/apache/seatunnel/commit/3e6caf7053|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-tablestore.md ================================================
    Change Log | Change | Commit | Version | | --- | --- | --- | |[Improve] table_store options (#9515)|https://github.com/apache/seatunnel/commit/145b68793f|2.3.12| |[Feature][Checkpoint] Add check script for source/sink state class serialVersionUID missing (#9118)|https://github.com/apache/seatunnel/commit/4f5adeb1c7|2.3.11| |[Improve] restruct connector common options (#8634)|https://github.com/apache/seatunnel/commit/f3499a6eeb|2.3.10| |[Feature][Restapi] Allow metrics information to be associated to logical plan nodes (#7786)|https://github.com/apache/seatunnel/commit/6b7c53d03c|2.3.9| | [Feature][Connector-V2][Tablestore] Support Source connector for Tablestore #7448 (#7467)|https://github.com/apache/seatunnel/commit/a7ca51b585|2.3.8| |[Improve][Common] Introduce new error define rule (#5793)|https://github.com/apache/seatunnel/commit/9d1b2582b2|2.3.4| |[Improve] Remove use `SeaTunnelSink::getConsumedType` method and mark it as deprecated (#5755)|https://github.com/apache/seatunnel/commit/8de7408100|2.3.4| |Support config column/primaryKey/constraintKey in schema (#5564)|https://github.com/apache/seatunnel/commit/eac76b4e50|2.3.4| |[Improve] [Connector-V2] Remove scheduler in Tablestore sink (#5272)|https://github.com/apache/seatunnel/commit/8d6b07e466|2.3.3| |Merge branch 'dev' into merge/cdc|https://github.com/apache/seatunnel/commit/4324ee1912|2.3.1| |[Improve][Project] Code format with spotless plugin.|https://github.com/apache/seatunnel/commit/423b583038|2.3.1| |[improve][api] Refactoring schema parse (#4157)|https://github.com/apache/seatunnel/commit/b2f573a13e|2.3.1| |[Improve][build] Give the maven module a human readable name (#4114)|https://github.com/apache/seatunnel/commit/d7cd601051|2.3.1| |[Improve][Project] Code format with spotless plugin. 
(#4101)|https://github.com/apache/seatunnel/commit/a2ab166561|2.3.1| |[Hotfix][OptionRule] Fix option rule about all connectors (#3592)|https://github.com/apache/seatunnel/commit/226dc6a119|2.3.0| |[Improve][Connector-V2][TableStore] Unified excetion for TableStore sink connector (#3527)|https://github.com/apache/seatunnel/commit/7b264d7004|2.3.0| |[Feature][connector-v2] add tablestore source and sink (#3309)|https://github.com/apache/seatunnel/commit/ebebf0b633|2.3.0|
    ================================================ FILE: docs/zh/connectors/changelog/connector-tdengine.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-typesense.md ================================================ ================================================ FILE: docs/zh/connectors/changelog/connector-web3j.md ================================================ ================================================ FILE: docs/zh/connectors/common-options/sink-common-options.md ================================================ --- sidebar_position: 4 --- # Sink 常用选项 > Sink 连接器常用参数 :::caution 警告 旧的配置名称 `source_table_name` 已经过时,请尽快迁移到新名称 `plugin_input`。 ::: | 名称 | 类型 | 是否必填 | 默认值 | |--------------|--------|------|-----| | plugin_input | string | 否 | - | | parallelism | int | 否 | - | ### plugin_input [string] 当不指定 `plugin_input` 时,当前插件处理配置文件中上一个插件输出的数据集 `dataset`。 当指定了 `plugin_input` 时,当前插件处理该参数对应的数据集。 ### parallelism [int] 当没有指定 `parallelism` 时,默认使用 env 中的 `parallelism`。 当指定 `parallelism` 时,它将覆盖 env 中的 `parallelism`。 ## Examples ```bash source { FakeSourceStream { parallelism = 2 plugin_output = "fake" field_name = "name,age" } } transform { Filter { plugin_input = "fake" fields = [name] plugin_output = "fake_name" } Filter { plugin_input = "fake" fields = [age] plugin_output = "fake_age" } } sink { Console { plugin_input = "fake_name" } Console { plugin_input = "fake_age" } } ``` > 如果作业只有一个 source 和一个(或零个)transform 和一个 sink ,则不需要为连接器指定 `plugin_input` 和 `plugin_output`。 > 如果 source 、transform 和 sink 中任意一类插件的数量大于 1,则必须为作业中的每个连接器指定 `plugin_input` 和 `plugin_output`。 ================================================ FILE: docs/zh/connectors/common-options/source-common-options.md ================================================ --- sidebar_position: 3 --- # Source 常用选项 > Source connector 的常用参数 :::caution 警告 旧的配置名称 `result_table_name` 已经过时,请尽快迁移到新名称 `plugin_output`。 ::: | 名称 | 类型 |
必填 | 默认值 | 描述 | |---------------|--------|----|-----|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | plugin_output | String | 否 | - | 当未指定 `plugin_output` 时,此插件处理的数据将不会被注册为可由其他插件直接访问的数据集 `(dataStream/dataset)`,或称为临时表 `(table)`。
    当指定了 `plugin_output` 时,此插件处理的数据将被注册为可由其他插件直接访问的数据集 `(dataStream/dataset)`,或称为临时表 `(table)`。此处注册的数据集 `(dataStream/dataset)` 可通过指定 `plugin_input` 直接被其他插件访问。 | | parallelism | Int | 否 | - | 当未指定 `parallelism` 时,默认使用环境中的 `parallelism`。
    当指定了 `parallelism` 时,将覆盖环境中的 `parallelism` 设置。 | # 重要提示 在作业配置中使用 `plugin_output` 时,必须设置 `plugin_input` 参数。 ## 任务示例 ### 简单示例 > 注册一个流或批处理数据源,并在注册时返回表名 `fake_table` ```bash source { FakeSourceStream { plugin_output = "fake_table" } } ``` ### 复杂示例 > 这是将Fake数据源转换并写入到两个不同的目标中 ```bash env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { id = "int" name = "string" age = "int" c_timestamp = "timestamp" c_date = "date" c_map = "map" c_array = "array" c_decimal = "decimal(30, 8)" c_row = { c_row = { c_int = int } } } } } } transform { Sql { plugin_input = "fake" plugin_output = "fake1" # 查询表名必须与字段 'plugin_input' 相同 query = "select id, regexp_replace(name, '.+', 'b') as name, age+1 as age, pi() as pi, c_timestamp, c_date, c_map, c_array, c_decimal, c_row from dual" } # SQL 转换支持基本函数和条件操作 # 但不支持复杂的 SQL 操作,包括:多源表/行 JOIN 和聚合操作等 } sink { Console { plugin_input = "fake1" } Console { plugin_input = "fake" } } ``` ================================================ FILE: docs/zh/connectors/connector-isolated-dependency.md ================================================ # Connector 依赖隔离加载机制 SeaTunnel 提供了针对每个 connector 的依赖隔离加载机制,方便用户管理不同连接器单独的依赖,同时避免依赖冲突并提升系统的可扩展性。 当加载 connector 时,SeaTunnel 会从 `${SEATUNNEL_HOME}` 下的 `plugins/connector-xxx` 目录中,查找并加载该 connector 独立的依赖 jar。这种方式确保了不同 connector 所需的依赖不会相互影响,便于在复杂环境下管理大量 connector。 ## 实现原理 每个 connector 需要将自己的依赖 jar 放置在 `${SEATUNNEL_HOME}/plugins/connector-xxx` 目录下的独立子目录中(需要手动创建)。 子目录名称由 `plugin-mapping` 文件中的 value 值指定。SeaTunnel 启动并加载 connector 时,只会加载对应目录下的 jar,从而实现依赖的隔离。 目前,Zeta 引擎会保证同一个任务不同connector的jar分开加载。其他两个引擎仍然会将所有 connector 的依赖 jar 一起加载,同一个任务放置了不同版本的jar在Spark/Flink环境可能导致依赖冲突。 ## 目录结构示例 - 通过`${SEATUNNEL_HOME}/connectors/plugin-mapping.properties` 获取每个connector对应的文件夹目录命名。 以AmazonDynamodb为例,假设在 `plugin-mapping` 文件中有以下配置: ``` seatunnel.source.AmazonDynamodb = connector-amazondynamodb ``` 则对应的connector依赖目录就是value值 `connector-amazondynamodb`。 最终的目录结构如下所示: ``` SEATUNNEL_HOME/ plugins/ 
connector-amazondynamodb/ dependency1.jar dependency2.jar connector-xxx/ dependencyA.jar dependencyB.jar ``` ## 限制说明 - 在Zeta引擎中,请确保所有节点的 `${SEATUNNEL_HOME}/plugins/` 目录结构一致。都需要包含相同的子目录和依赖 jar。 - 任何没有以`connector-`开头的目录或者jar都将被当作通用依赖目录处理,所有引擎和connector都会加载此类jar。 - 在Zeta引擎中,可以通过将通用的jar放到 `${SEATUNNEL_HOME}/lib/` 目录下来实现所有 connector 的共享依赖。 ## 验证 - 通过追踪任务日志,确认每个 connector 只加载了其独立的依赖 jar。 ```log 2025-08-13T17:55:48.7732601Z [] 2025-08-13 17:55:47,270 INFO org.apache.seatunnel.plugin.discovery.AbstractPluginDiscovery - find connector jar and dependency for PluginIdentifier{engineType='seatunnel', pluginType='source', pluginName='Jdbc'}: [file:/tmp/seatunnel/plugins/Jdbc/lib/vertica-jdbc-12.0.3-0.jar, file:/tmp/seatunnel/connectors/connector-jdbc-3.0.0-SNAPSHOT-2.12.15.jar] ``` ================================================ FILE: docs/zh/connectors/formats/avro.md ================================================ # Avro 格式 Avro 在流式数据处理管道中非常流行。现在seatunnel在kafka连接器中支持Avro格式 # 怎样用 ## Kafka 使用示例 - 模拟随机生成数据源,并以 Avro 的格式 写入 Kafka 的实例 ```bash env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { row.num = 90 schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp } } } plugin_output = "fake" } } sink { Kafka { bootstrap.servers = "kafkaCluster:9092" topic = "test_avro_topic_fake_source" format = avro } } ``` - 从 kafka 读取 avro 格式的数据并打印到控制台的示例 ```bash env { parallelism = 1 job.mode = "BATCH" } source { Kafka { bootstrap.servers = "kafkaCluster:9092" topic = "test_avro_topic" 
plugin_output = "kafka_table" start_mode = "earliest" format = avro format_error_handle_way = skip schema = { fields { id = bigint c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(2, 1)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } sink { Console { plugin_input = "kafka_table" } } ``` ================================================ FILE: docs/zh/connectors/formats/canal-json.md ================================================ # Canal 格式 变更数据捕获格式: 序列化模式、反序列化模式 Canal是一款CDC(变更数据捕获)工具,能够实时捕获MySQL的数据变化并将其流式传输到其他系统中。Canal为变更日志提供了一种统一的格式,并支持使用 JSON 和 protobuf(Canal默认使用protobuf)进行消息的序列化 SeaTunnel 能够解析 Canal 的 JSON 消息,并将其转化为 INSERT/UPDATE/DELETE 消息,进而输入到 SeaTunnel 系统中。这个特性在很多场景下都显得非常有用,例如: 将增量数据从数据库同步到其他系统 审计日志 数据库的实时物化视图 关联维度数据库的变更历史,等等。 SeaTunnel 还支持将 SeaTunnel 中的 INSERT/UPDATE/DELETE 消息编码为 Canal JSON 消息,并将其发送到类似 Kafka 这样的存储中。然而,目前 SeaTunnel 无法将 UPDATE_BEFORE 和 UPDATE_AFTER 合并为一个单一的UPDATE消息。因此,SeaTunnel将 UPDATE_BEFORE 和 UPDATE_AFTER 编码为 Canal的 DELETE 和 INSERT 消息来进行 # 格式选项 | 选项 | 默认值 | 是否需要 | 描述 | |--------------------------------|--------|------|------------------------------------------------------------------------------------| | format | (none) | 是 | 指定要使用的格式,这里应该是 `canal_json` | | canal_json.ignore-parse-errors | false | 否 | 跳过解析错误的字段和行,而不是失败。出现错误的字段将被设置为null | | canal_json.database.include | (none) | 否 | 正则表达式,可选,通过正则匹配 Canal 记录中的`database`元字段来仅读取特定数据库变更日志行。此字符串Pattern模式与Java的Pattern兼容 | | canal_json.table.include | (none) | 否 | 正则表达式,可选,通过正则匹配 Canal 记录中的`table`元字段来仅读取特定数据库变更日志行。此字符串Pattern模式与Java的Pattern兼容 | # 如何使用 ## Kafka 使用示例 Canal为变更日志提供了一种统一的格式,以下是一个从MySQL products 表捕获的变更操作的简单示例 ```bash { "data": [ { "id": "111", "name": "scooter", "description": "Big 2-wheel scooter", "weight": "5.18" } ], "database": "inventory", "es": 1589373560000, "id": 9, "isDdl": false, "mysqlType": { "id": "INTEGER", 
"name": "VARCHAR(255)", "description": "VARCHAR(512)", "weight": "FLOAT" }, "old": [ { "weight": "5.15" } ], "pkNames": [ "id" ], "sql": "", "sqlType": { "id": 4, "name": 12, "description": 12, "weight": 7 }, "table": "products", "ts": 1589373560798, "type": "UPDATE" } ``` 注:请参考 [Canal 文档](https://github.com/alibaba/canal/wiki) 以了解每个字段的含义 MySQL 的 products 表有 4 列(id、name、description 和 weight) 上述 JSON 消息是产品表的一个更新变更事件,其中 id = 111 的行的 weight 值从 5.15 变为 5.18 假设此表的 binlog 的消息已经同步到 Kafka topic,那么我们可以使用下面的 SeaTunnel 示例来消费这个主题并体现变更事件 ```bash env { parallelism = 1 job.mode = "BATCH" } source { Kafka { bootstrap.servers = "kafkaCluster:9092" topic = "products_binlog" plugin_output = "kafka_name" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } }, format = canal_json } } transform { } sink { Kafka { bootstrap.servers = "localhost:9092" topic = "consume-binlog" format = canal_json } } ``` ================================================ FILE: docs/zh/connectors/formats/cdc-compatible-debezium-json.md ================================================ # CDC 兼容 Debezium-json SeaTunnel 支持将 cdc 记录解析为 Debezium-JSON 消息,并发布到 MQ (kafka) 等消息系统中 这个特性在很多场景下都非常实用,例如,它可以实现与 Debezium 生态系统的兼容性 # 如何使用 ## MySQL-CDC 流入 Kafka ```bash env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 15000 } source { MySQL-CDC { plugin_output = "table1" url="jdbc:mysql://localhost:3306/test" "startup.mode"=INITIAL table-names=[ "database1.t1", "database1.t2", "database2.t1" ] # compatible_debezium_json options format = compatible_debezium_json debezium = { # include schema into kafka message key.converter.schemas.enable = false value.converter.schemas.enable = false # topic prefix database.server.name = "mysql_cdc_1" } } } sink { Kafka { plugin_input = "table1" bootstrap.servers = "localhost:9092" topic = "${topic}" # compatible_debezium_json options format = compatible_debezium_json } } ``` 
================================================ FILE: docs/zh/connectors/formats/debezium-json.md ================================================ # Debezium 格式 变更数据捕获格式: 序列化模式、反序列化模式 Debezium 是一套分布式服务,用于捕获数据库中的变化,以便您的应用程序可以看到这些变化并对其做出响应。Debezium 在变更事件流中记录每个数据库表中的所有行级变化,应用程序只需读取这些流,就可以按照它们发生的顺序看到变更事件 SeaTunnel 支持将 Debezium JSON 消息解析为 INSERT/UPDATE/DELETE 消息并导入到 seatunnel 系统中。在许多情况下,利用这个特性是非常有用的,例如: 将增量数据从数据库同步到其他系统 审计日志 数据库的实时物化视图 关联维度数据库的变更历史,等等。 SeaTunnel 还支持将 SeaTunnel 中的 INSERT/UPDATE/DELETE 消息解析为 Debezium JSON 消息,并将其发送到类似 Kafka 这样的存储中 # 格式选项 | 选项 | 默认值 | 是否需要 | 描述 | |-----------------------------------|--------|------|--------------------------------------| | format | (none) | 是 | 指定要使用的格式,这里应该是 'debezium_json'. | | debezium-json.ignore-parse-errors | false | 否 | 跳过有解析错误的字段和行而不是失败。如果出现错误,字段将设置为 null | # 如何使用 ## Kafka 使用示例 Debezium 提供了一个统一的变更日志格式,下面是一个 MySQL products 表捕获的变更操作的简单示例 ```bash { "before": { "id": 111, "name": "scooter", "description": "Big 2-wheel scooter ", "weight": 5.18 }, "after": { "id": 111, "name": "scooter", "description": "Big 2-wheel scooter ", "weight": 5.17 }, "source": { "version": "1.1.1.Final", "connector": "mysql", "name": "dbserver1", "ts_ms": 1589362330000, "snapshot": "false", "db": "inventory", "table": "products", "server_id": 223344, "gtid": null, "file": "mysql-bin.000003", "pos": 2090, "row": 0, "thread": 2, "query": null }, "op": "u", "ts_ms": 1589362330904, "transaction": null } ``` 注:请参考 [Debezium 文档](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/mysql.adoc#data-change-events) 以了解每个字段的含义 MySQL 的 products 表有 4 列(id、name、description 和 weight) 上述 JSON 消息是产品表的一个更新变更事件,其中 id = 111 的行的 weight 值从 5.18 变为 5.17 假设消息已经同步到 Kafka 主题 products_binlog,那么我们可以使用以下的 SeaTunnel 配置来消费这个主题并通过 Debezium 格式解释变更事件。 在此配置中,您必须指定 `schema` 和 `debezium_record_include_schema` 选项: - `schema` 应与您的表格式相同 - 如果您的 json 数据包含 `schema` 字段,`debezium_record_include_schema` 应为 true,如果您的 json 数据不包含 `schema` 
字段,`debezium_record_include_schema` 应为 false - `{"schema" : {}, "payload": { "before" : {}, "after": {} ... } }` --> `true` - `{"before" : {}, "after": {} ... }` --> `false`" ```bash env { parallelism = 1 job.mode = "BATCH" } source { Kafka { bootstrap.servers = "kafkaCluster:9092" topic = "products_binlog" plugin_output = "kafka_name" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } } debezium_record_include_schema = false format = debezium_json } } transform { } sink { Kafka { bootstrap.servers = "kafkaCluster:9092" topic = "consume-binlog" format = debezium_json } } ``` ================================================ FILE: docs/zh/connectors/formats/kafka-compatible-kafkaconnect-json.md ================================================ # Kafka source 兼容 kafka-connect-json Seatunnel 的 Kafka 连接器支持解析通过 Kafka Connect Source 抽取的数据,特别是从 Kafka Connect JDBC 和 Kafka Connect Debezium 抽取的数据 # 如何使用 ## Kafka 流入 Mysql ```bash env { parallelism = 1 job.mode = "BATCH" } source { Kafka { bootstrap.servers = "localhost:9092" topic = "jdbc_source_record" plugin_output = "kafka_table" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } }, format = COMPATIBLE_KAFKA_CONNECT_JSON } } sink { Jdbc { driver = com.mysql.cj.jdbc.Driver url = "jdbc:mysql://localhost:3306/seatunnel" user = st_user password = seatunnel generate_sink_sql = true database = seatunnel table = jdbc_sink primary_keys = ["id"] } } ``` ================================================ FILE: docs/zh/connectors/formats/maxwell-json.md ================================================ # MaxWell 格式 [Maxwell](https://maxwells-daemon.io/) 是一个 CDC(变更数据捕获)工具,能够实时捕获 MySQL 的数据变化并将其流式传输到 Kafka、Kinesis 和其他流连接器中。Maxwell 为变更日志提供了一种统一的格式,并支持使用 JSON 进行消息的序列化。 SeaTunnel 能够解析 Maxwell 的 JSON 消息,并将其转化为 INSERT/UPDATE/DELETE 消息,进而输入到 SeaTunnel 系统中。这个特性在很多场景下都显得非常有用,例如: 从数据库同步增量数据到其他系统 审计日志 数据库的实时物化视图 关联维度数据库的变更历史,等等。 
SeaTunnel 还支持将 SeaTunnel 中的 INSERT/UPDATE/DELETE 消息编码为 Maxwell JSON 消息,并将其发送到类似 Kafka 这样的存储中。然而,目前 SeaTunnel 无法将 UPDATE_BEFORE 和 UPDATE_AFTER 合并为一个单一的 UPDATE 消息。因此,SeaTunnel 将 UPDATE_BEFORE 和 UPDATE_AFTER 编码为 Maxwell 的 DELETE 和 INSERT 消息。 # 格式选项 | 选项 | 默认值 | 是否需要 | 描述 | |------|--------|--------|------| | format | (none) | 是 | 指定要使用的格式,这里应该是 `maxwell_json`。 | | maxwell_json.ignore-parse-errors | false | 否 | 跳过解析错误的字段和行,而不是失败。出现错误的字段将被设置为 null。 | | maxwell_json.database.include | (none) | 否 | 正则表达式,可选,通过正则匹配 Maxwell 记录中的 `database` 元字段来仅读取特定数据库变更日志行。此字符串 Pattern 模式与 Java 的 Pattern 兼容。 | | maxwell_json.table.include | (none) | 否 | 正则表达式,可选,通过正则匹配 Maxwell 记录中的 `table` 元字段来仅读取特定表的变更日志行。此字符串 Pattern 模式与 Java 的 Pattern 兼容。 | # 如何使用 Maxwell 格式 ## Kafka 使用示例 Maxwell 为变更日志提供了一种统一的格式,以下是一个从 MySQL products 表捕获的变更操作的简单示例: ```bash { "database":"test", "table":"product", "type":"insert", "ts":1596684904, "xid":7201, "commit":true, "data":{ "id":111, "name":"scooter", "description":"Big 2-wheel scooter ", "weight":5.18 }, "primary_key_columns":[ "id" ] } ``` 注意:请参考 Maxwell 文档了解每个字段的含义。 MySQL products 表有 4 列(id、name、description 和 weight)。 上面的 JSON 消息是 products 表上的一个更新变更事件,其中 id = 111 的行的 weight 值从 5.18 更改为 5.15。 假设消息已同步到 Kafka 主题 products_binlog,那么我们可以使用以下 SeaTunnel 来消费此主题并解释变更事件。 ```bash env { execution.parallelism = 1 job.mode = "BATCH" } source { Kafka { bootstrap.servers = "kafkaCluster:9092" topic = "products_binlog" plugin_output = "kafka_name" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } }, format = maxwell_json } } transform { } sink { Kafka { bootstrap.servers = "localhost:9092" topic = "consume-binlog" format = maxwell_json } } ``` ================================================ FILE: docs/zh/connectors/formats/ogg-json.md ================================================ # Ogg 格式 [Oracle GoldenGate](https://www.oracle.com/integration/goldengate/) (a.k.a ogg) 
是一项托管服务,提供实时数据网格平台,该平台使用复制来保持数据高度可用,并支持实时分析。客户可以设计、执行和监控其数据复制和流数据处理解决方案,而无需分配或管理计算环境。 Ogg 为变更日志提供了统一的格式结构,并支持使用 JSON 序列化消息。 SeaTunnel 支持将 Ogg JSON 消息解释为 SeaTunnel 系统中的 INSERT/UPDATE/DELETE 消息。在许多情况下,这个特性带来了很多便利,例如 将增量数据从数据库同步到其他系统 审计日志 数据库的实时物化视图 关联维度数据库的变更历史,等等。 SeaTunnel 还支持将 SeaTunnel 中的 INSERT/UPDATE/DELETE 消息转化为 Ogg JSON 消息,并将其发送到类似 Kafka 这样的存储中。然而,目前 SeaTunnel 无法将 UPDATE_BEFORE 和 UPDATE_AFTER 组合成单个 UPDATE 消息。因此,SeaTunnel 将 UPDATE_BEFORE 和 UPDATE_AFTER 转化为 Ogg 的 DELETE 和 INSERT 消息来实现。 # 格式选项 | 选项 | 默认值 | 是否需要 | 描述 | |------------------------------|--------|------|------------------------------------------------------------------------------------| | format | (none) | 是 | 指定要使用的格式,这里应该是 `ogg_json` | | ogg_json.ignore-parse-errors | false | 否 | 跳过有解析错误的字段和行而不是失败。如果出现错误,字段将设置为 null | | ogg_json.database.include | (none) | 否 | 正则表达式,可选,通过正则匹配 Ogg 记录中的`database`元字段来仅读取特定数据库变更日志行。此字符串Pattern模式与Java的Pattern兼容 | | ogg_json.table.include | (none) | 否 | 正则表达式,可选,通过正则匹配 Ogg 记录中的 `table` 元字段来仅读取特定表的更改日志行。此字符串Pattern模式与Java的Pattern兼容 | # 如何使用 Ogg 格式 ## Kafka 使用示例 Ogg 为变更日志提供了统一的格式,下面是从 Oracle PRODUCTS 表捕获变更操作的简单示例: ```bash { "before": { "id": 111, "name": "scooter", "description": "Big 2-wheel scooter", "weight": 5.18 }, "after": { "id": 111, "name": "scooter", "description": "Big 2-wheel scooter", "weight": 5.15 }, "op_type": "U", "op_ts": "2020-05-13 15:40:06.000000", "current_ts": "2020-05-13 15:40:07.000000", "primary_keys": [ "id" ], "pos": "00000000000000000000143", "table": "PRODUCTS" } ``` 注:各字段含义请参考 [Debezium 文档](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/oracle.adoc#data-change-events) 此 Oracle PRODUCTS 表有 4 列 (id, name, description 和 weight) 上面的 JSON 消息是 products 表上的更新更改事件,其中 id = 111 的行的字段 `weight` 的值从 5.18 更改为 5.15。 假设此表的 binlog 的消息已经同步到 Kafka topic,那么我们可以使用下面的 SeaTunnel 示例来消费这个 topic 并体现变更事件。 ```bash env { parallelism = 1 job.mode = "STREAMING" } source { Kafka { bootstrap.servers = "127.0.0.1:9092" topic = 
"ogg" plugin_output = "kafka_name" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "double" } }, format = ogg_json } } sink { jdbc { url = "jdbc:mysql://127.0.0.1/test" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "12345678" table = "ogg" primary_keys = ["id"] } } ``` ================================================ FILE: docs/zh/connectors/formats/protobuf.md ================================================ # Protobuf 格式 Protobuf(Protocol Buffers)是一种由Google开发的语言中立、平台无关的数据序列化格式。它提供了一种高效的方式来编码结构化数据,同时支持多种编程语言和平台。 目前支持在 Kafka 中使用 protobuf 格式。 ## Kafka 使用示例 - 模拟随机生成数据源,并以 protobuf 的格式 写入 kafka 的实例 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { c_int32 = int c_int64 = long c_float = float c_double = double c_bool = boolean c_string = string c_bytes = bytes Address { city = string state = string street = string } attributes = "map" phone_numbers = "array" } } } } sink { kafka { topic = "test_protobuf_topic_fake_source" bootstrap.servers = "kafkaCluster:9092" format = protobuf kafka.request.timeout.ms = 60000 kafka.config = { acks = "all" request.timeout.ms = 60000 buffer.memory = 33554432 } protobuf_message_name = Person protobuf_schema = """ syntax = "proto3"; package org.apache.seatunnel.format.protobuf; option java_outer_classname = "ProtobufE2E"; message Person { int32 c_int32 = 1; int64 c_int64 = 2; float c_float = 3; double c_double = 4; bool c_bool = 5; string c_string = 6; bytes c_bytes = 7; message Address { string street = 1; string city = 2; string state = 3; string zip = 4; } Address address = 8; map attributes = 9; repeated string phone_numbers = 10; } """ } } ``` - 从 kafka 读取 protobuf 格式的数据并打印到控制台的示例 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Kafka { topic = "test_protobuf_topic_fake_source" format = protobuf protobuf_message_name = Person protobuf_schema = """ syntax = 
"proto3"; package org.apache.seatunnel.format.protobuf; option java_outer_classname = "ProtobufE2E"; message Person { int32 c_int32 = 1; int64 c_int64 = 2; float c_float = 3; double c_double = 4; bool c_bool = 5; string c_string = 6; bytes c_bytes = 7; message Address { string street = 1; string city = 2; string state = 3; string zip = 4; } Address address = 8; map attributes = 9; repeated string phone_numbers = 10; } """ schema = { fields { c_int32 = int c_int64 = long c_float = float c_double = double c_bool = boolean c_string = string c_bytes = bytes Address { city = string state = string street = string } attributes = "map" phone_numbers = "array" } } bootstrap.servers = "kafkaCluster:9092" start_mode = "earliest" plugin_output = "kafka_table" } } sink { Console { plugin_input = "kafka_table" } } ``` ================================================ FILE: docs/zh/connectors/sink/Activemq.md ================================================ import ChangeLog from '../changelog/connector-activemq.md'; # Activemq > Activemq 接收器连接器 ## 描述 用于将数据写入 Activemq. ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必需 | 默认值 | |-------------------------------------|---------|-----|--------------| | host | string | 否 | - | | port | int | 否 | - | | virtual_host | string | 否 | - | | username | string | 否 | - | | password | string | 否 | - | | queue_name | string | 是 | - | | uri | string | 是 | - | | check_for_duplicate | boolean | 否 | - | | client_id | boolean | 否 | - | | copy_message_on_send | boolean | 否 | - | | disable_timeStamps_by_default | boolean | 否 | - | | use_compression | boolean | 否 | - | | always_session_async | boolean | 否 | - | | dispatch_async | boolean | 否 | - | | nested_map_and_list_enabled | boolean | 否 | - | | warnAboutUnstartedConnectionTimeout | boolean | 否 | - | | closeTimeout | int | 否 | - | ### host [string] 用于连接的默认主机. 
### port [int] 用于连接的默认端口 ### username [string] 连接到代理时使用的用户名 ### password [string] 连接到代理时使用的密码 ### uri [string] 用于设置 AMQP URI 中字段(主机、端口、用户名、密码和虚拟主机)的便捷方法 ### queue_name [string] 写入消息的队列 ### check_for_duplicate [boolean] 将检查重复消息 ### client_id [string] 客户端ID ### copy_message_on_send [boolean] 如果为true,则启用新的JMS消息对象作为发送方法的一部分 ### disable_timeStamps_by_default [boolean] 禁用时间戳以获得轻微的性能提升. ### use_compression [boolean] 允许对消息正文使用压缩. ### always_session_async [boolean] 当为true时,将使用单独的线程为连接中的每个会话分派消息. ### always_sync_send [boolean] 当为true时,MessageProducer在发送消息时将始终使用同步发送 ### close_timeout [int] 设置关闭完成前的超时时间(以毫秒为单位). ### dispatch_async [boolean] 代理是否应该异步地向消费者发送消息 ### nested_map_and_list_enabled [boolean] 控制是否支持结构化消息属性和MapMessages ### warn_about_unstarted_connection_timeout [int] 从创建连接到生成警告的超时时间(毫秒) ## 示例 简单: ```hocon sink { ActiveMQ { uri="tcp://localhost:61616" username = "admin" password = "admin" queue_name = "test1" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Aerospike.md ================================================ import ChangeLog from '../changelog/connector-aerospike.md'; # Aerospike > Aerospike 数据写入连接器 ## 许可证兼容性通知 此连接器依赖于根据AGPL 3.0许可的Aerospike客户端库。 使用此连接器时,您需要遵守AGPL 3.0许可条款。 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [CDC](../../introduction/concepts/connector-v2-features.md) ## 描述 用于向 Aerospike 数据库写入数据的连接器。 ## 支持的数据源 | 数据源 | 支持版本 | Maven 依赖 | |------------|---|-------------------------------------------------------------------------| | Aerospike | 4.4.17+ | [下载](https://mvnrepository.com/artifact/com.aerospike/aerospike-client) | ## 数据类型映射 | SeaTunnel 数据类型 | Aerospike 数据类型 | 存储格式 | |----------------|--------------------|------------------------------------------------------------------------------| | STRING | STRING | 直接存储字符串 | | INT | INTEGER | 32位整型 | | BIGINT | LONG | 64位整型 | | DOUBLE | DOUBLE | 64位浮点数 | | BOOLEAN | BOOLEAN | 存储为 true/false 值 | | ARRAY | BYTEARRAY | 仅支持字节数组类型 | | LIST | LIST | 支持泛型列表类型 | | DATE | LONG | 转换为纪元时间毫秒数 | | TIMESTAMP | LONG | 转换为纪元时间毫秒数 | 注意事项: - 使用ARRAY类型时,SeaTunnel数组元素必须是byte类型 - LIST类型支持可序列化的任意元素类型 - DATE/TIMESTAMP转换使用系统默认时区 ## 配置选项 | 参数名称 | 类型 | 必填 | 默认值 | 说明 | |----------------|---------|------|---------|---------------------------------------------------------------------| | host | string | 是 | - | Aerospike 服务器主机名或IP地址 | | port | int | 否 | 3000 | Aerospike 服务器端口 | | namespace | string | 是 | - | Aerospike 命名空间 | | set | string | 是 | - | Aerospike 集合名称 | | username | string | 否 | - | 认证用户名 | | password | string | 否 | - | 认证密码 | | key | string | 是 | - | 用作 Aerospike 主键的字段名称 | | bin_name | string | 否 | - | 数据存储的 bin 名称 | | data_format | string | 否 | string | 数据存储格式:map/string/kv | | write_timeout | int | 否 | 200 | 写入操作超时时间(毫秒) | | schema.field | map | 否 | {} | 字段类型映射(示例:{"name":"STRING","age":"INTEGER"}) | ### data_format 选项说明 - **map**: 以JSON对象格式存储 - **string**: 以JSON字符串格式存储 - **kv**: 每个字段存储为独立的bin ## 任务示例 ### 简单示例 ```hocon env { parallelism = 2 job.mode = "BATCH" } source { FakeSource { row.num = 10 schema = { fields { id = "int" name = "string" age = "int" address = "string" } } } } sink { Aerospike { host = "localhost" port = 3000 namespace = 
"test_namespace" set = "user_data" key = "id" data_format = "map" write_timeout = 300 schema.field = { id = "INTEGER" name = "STRING" age = "INTEGER" address = "STRING" } } } ``` ## Changelog ================================================ FILE: docs/zh/connectors/sink/Airtable.md ================================================ import ChangeLog from '../changelog/connector-http-airtable.md'; # Airtable > Airtable Sink 连接器 ## 描述 用于将数据写入 Airtable。 ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) - [ ] [支持多表写入](../../introduction/concepts/connector-v2-features.md) ## 选项 | 参数名 | 类型 | 必须 | 默认值 | |--------|------|------|--------| | token | String | 是 | - | | base_id | String | 是 | - | | table | String | 是 | - | | api_base_url | String | 否 | https://api.airtable.com | | typecast | boolean | 否 | false | | batch_size | int | 否 | 10 | | request_interval_ms | int | 否 | 220 | | rate_limit_backoff_ms | int | 否 | 30000 | | rate_limit_max_retries | int | 否 | 3 | | common-options | | 否 | - | ### token [String] Airtable 个人访问令牌。可在 https://airtable.com/create/tokens 创建。 ### base_id [String] Airtable Base ID(以 `app` 开头)。 ### table [String] 要写入的表名或表 ID。 ### api_base_url [String] Airtable API 基础 URL,默认 `https://api.airtable.com`。 ### typecast [boolean] 如果为 true,Airtable 会自动将值转换为匹配的字段类型。默认 false。 ### batch_size [int] 每次 API 请求的记录数,受 Airtable API 限制最大为 10。默认 10。 ### request_interval_ms [int] API 请求之间的最小间隔(毫秒),默认 220ms。 ### rate_limit_backoff_ms [int] 收到 429(限流)响应时的基础退避时间(毫秒),默认 30000ms。 ### rate_limit_max_retries [int] 收到 429 响应后的最大重试次数,默认 3。 ### common options 汇插件通用参数,请参考 [Sink Common Options](../common-options/sink-common-options.md)。 ## 示例 ```hocon sink { Airtable { token = "patXXXXXXXX.XXXXXXXX" base_id = "appXXXXXXXX" table = "Shipments" typecast = true batch_size = 10 } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/AmazonDynamoDB.md 
================================================ import ChangeLog from '../changelog/connector-amazondynamodb.md'; # AmazonDynamoDB > Amazon DynamoDB 接收器连接器 ## 描述 将数据写入 Amazon DynamoDB ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必需 | 默认值 | |-------------------|--------|----|---------------| | url | string | 是 | - | | region | string | 是 | - | | access_key_id | string | 是 | - | | secret_access_key | string | 是 | - | | table | string | 是 | - | | batch_size | string | 否 | 25 | | common-options | | 否 | - | ### url [string] 要写入Amazon DynamoDB的URL. ### region [string] Amazon DynamoDB 的分区. ### access_key_id [string] Amazon DynamoDB的访问id. ### secret_access_key [string] Amazon DynamoDB的访问密钥. ### table [string] Amazon DynamoDB 的表名. ### 常见选项 Sink插件常用参数,请参考 [Sink Common Options](../common-options/sink-common-options.md) 了解详细信息. ## 示例 ```bash Amazondynamodb { url = "http://127.0.0.1:8000" region = "us-east-1" access_key_id = "dummy-key" secret_access_key = "dummy-secret" table = "TableName" } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/AmazonSqs.md ================================================ import ChangeLog from '../changelog/connector-amazonsqs.md'; # AmazonSqs > Amazon SQS 接收器连接器 ## 支持以下引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 将数据写入 Amazon SQS ## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列映射](../../introduction/concepts/connector-v2-features.md) - [ ] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户定义的拆分](../../introduction/concepts/connector-v2-features.md) ## 参数和选项 | 名称 | 类型 | 必需 | 默认值 | Description | |-------------------------|--------|--|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | 从Amazon SQS读取的队列URL. | | region | String | 否 | - | SQS服务的AWS区域 | | format | String | 否 | json | 数据格式。默认格式为json。可选文本格式,canal json和debezium json。如果你使用json或文本格式。默认字段分隔符为“,”。如果自定义分隔符,请添加“field_delimiter”选项。如果您使用canal格式,请参阅[canal-json](../formats/canal-json.md)了解详细信息。如果您使用debezium格式,请参阅[debezium json](../formats/debezium json.md)了解详细信息. | | format_error_handle_way | String | 否 | fail | 数据格式错误的处理方法。默认值为fail,可选值为(fail,skip)。当选择失败时,数据格式错误将被阻止,并引发异常。当选择跳过时,数据格式错误将跳过此行数据. | | field_delimiter | String | 否 | , | 自定义数据格式的字段分隔符. 
| ## 任务示例 ```bash source { FakeSource { schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp } } } plugin_output = "fake" } } sink { AmazonSqs { url = "http://127.0.0.1:8000" region = "us-east-1" queue = "queueName" format = text field_delimiter = "|" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Assert.md ================================================ import ChangeLog from '../changelog/connector-assert.md'; # Assert > Assert 数据接收器 ## 描述 Assert 数据接收器是一个用于断言数据是否符合用户定义规则的数据接收器。用户可以通过配置规则来断言数据是否符合预期,如果数据不符合规则,将会抛出异常。 ## 核心特性 - [ ] [精准一次](../../introduction/concepts/connector-v2-features.md) ## 配置 | Name | Type | Required | Default | |------------------------------------------------------------------------------------------------|-------------------------------------------------|----------|---------| | rules | ConfigMap | yes | - | | rules.field_rules | string | yes | - | | rules.field_rules.field_name | string\|ConfigMap | yes | - | | rules.field_rules.field_type | string | no | - | | rules.field_rules.field_value | ConfigList | no | - | | rules.field_rules.field_value.rule_type | string | no | - | | rules.field_rules.field_value.rule_value | numeric | no | - | | rules.field_rules.field_value.equals_to | boolean\|numeric\|string\|ConfigList\|ConfigMap | no | - | | rules.row_rules | string | yes | - | | rules.row_rules.rule_type | string | no | - | | rules.row_rules.rule_value | string | no | - | | rules.catalog_table_rule | 
ConfigMap | no | - | | rules.catalog_table_rule.primary_key_rule | ConfigMap | no | - | | rules.catalog_table_rule.primary_key_rule.primary_key_name | string | no | - | | rules.catalog_table_rule.primary_key_rule.primary_key_columns | ConfigList | no | - | | rules.catalog_table_rule.constraint_key_rule | ConfigList | no | - | | rules.catalog_table_rule.constraint_key_rule.constraint_key_name | string | no | - | | rules.catalog_table_rule.constraint_key_rule.constraint_key_type | string | no | - | | rules.catalog_table_rule.constraint_key_rule.constraint_key_columns | ConfigList | no | - | | rules.catalog_table_rule.constraint_key_rule.constraint_key_columns.constraint_key_column_name | string | no | - | | rules.catalog_table_rule.constraint_key_rule.constraint_key_columns.constraint_key_sort_type | string | no | - | | rules.catalog_table_rule.column_rule | ConfigList | no | - | | rules.catalog_table_rule.column_rule.name | string | no | - | | rules.catalog_table_rule.column_rule.type | string | no | - | | rules.catalog_table_rule.column_rule.column_length | int | no | - | | rules.catalog_table_rule.column_rule.nullable | boolean | no | - | | rules.catalog_table_rule.column_rule.default_value | string | no | - | | rules.catalog_table_rule.column_rule.comment | comment | no | - | | rules.table-names | ConfigList | no | - | | rules.tables_configs | ConfigList | no | - | | rules.tables_configs.table_path | String | no | - | | common-options | | no | - | ### rules [ConfigMap] 规则定义用户可用数据的规则。每个规则代表一个字段验证或行数量验证。 ### field_rules [ConfigList] 字段规则用于字段验证 ### field_name [string] 字段名 ### field_type [string | ConfigMap] 字段类型。字段类型应符合此[指南](../../introduction/concepts/schema-feature.md#如何声明支持的类型)。 ### field_value [ConfigList] 字段值规则定义数据值验证 ### rule_type [string] 规则类型。目前支持以下规则 - NOT_NULL `值不能为空` - NULL `值可以为空` - MIN `定义数据的最小值` - MAX `定义数据的最大值` - MIN_LENGTH `定义字符串数据的最小长度` - MAX_LENGTH `定义字符串数据的最大长度` - MIN_ROW `定义最小行数` - MAX_ROW `定义最大行数` ### rule_value [numeric] 
与规则类型相关的值。当`rule_type`为`MIN`、`MAX`、`MIN_LENGTH`、`MAX_LENGTH`、`MIN_ROW`或`MAX_ROW`时,用户需要为`rule_value`分配一个值。 ### equals_to [boolean | numeric | string | ConfigList | ConfigMap] `equals_to`用于比较字段值是否等于配置的预期值。用户可以将所有类型的值分配给`equals_to`。这些类型在[这里](../../introduction/concepts/schema-feature.md#目前支持哪些类型)有详细说明。 例如,如果一个字段是一个包含三个字段的行,行类型的声明是`{a = array, b = map, c={c_0 = int, b = string}}`,用户可以将值`[["a", "b"], { k0 = 9999.99, k1 = 111.11 }, [123, "abcd"]]`分配给`equals_to`。 > 定义字段值的方式与[FakeSource](../source/FakeSource.md#自定义数据内容简单示例)一致。 > > `equals_to`不能应用于`null`类型字段。但是,用户可以使用规则类型`NULL`进行验证,例如`{rule_type = NULL}`。 ### catalog_table_rule [ConfigMap] catalog_table_rule用于断言Catalog表是否与用户定义的表相同。 ### table-names [ConfigList] 用于断言表是否在数据中。 ### tables_configs [ConfigList] 用于断言多个表是否在数据中。 ### table_path [String] 表的路径。 ### common options Sink 插件的通用参数,请参考 [Sink Common Options](../common-options/sink-common-options.md) 了解详情 ## 示例 ### 简单 整个Config遵循`hocon`风格 ```hocon Assert { rules = { row_rules = [ { rule_type = MAX_ROW rule_value = 10 }, { rule_type = MIN_ROW rule_value = 5 } ], field_rules = [{ field_name = name field_type = string field_value = [ { rule_type = NOT_NULL }, { rule_type = MIN_LENGTH rule_value = 5 }, { rule_type = MAX_LENGTH rule_value = 10 } ] }, { field_name = age field_type = int field_value = [ { rule_type = NOT_NULL equals_to = 23 }, { rule_type = MIN rule_value = 32767 }, { rule_type = MAX rule_value = 2147483647 } ] } ] catalog_table_rule { primary_key_rule = { primary_key_name = "primary key" primary_key_columns = ["id"] } constraint_key_rule = [ { constraint_key_name = "unique_name" constraint_key_type = UNIQUE_KEY constraint_key_columns = [ { constraint_key_column_name = "id" constraint_key_sort_type = ASC } ] } ] column_rule = [ { name = "id" type = bigint }, { name = "name" type = string }, { name = "age" type = int } ] } } } ``` ### 复杂 这里有一个更复杂的例子,涉及到`equals_to`。 ```hocon source { FakeSource { row.num = 1 schema = { fields { c_null = "null" c_string = string c_boolean 
= boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_date = date c_timestamp = timestamp c_time = time c_bytes = bytes c_array = "array" c_map = "map" c_map_nest = "map" c_row = { c_null = "null" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_date = date c_timestamp = timestamp c_time = time c_bytes = bytes c_array = "array" c_map = "map" } } } rows = [ { kind = INSERT fields = [ null, "AAA", false, 1, 1, 333, 323232, 3.1, 9.33333, 99999.99999999, "2012-12-21", "2012-12-21T12:34:56", "12:34:56", "bWlJWmo=", [0, 1, 2], "{ 12:01:26 = v0 }", { k1 = [123, "BBB-BB"]}, [ null, "AAA", false, 1, 1, 333, 323232, 3.1, 9.33333, 99999.99999999, "2012-12-21", "2012-12-21T12:34:56", "12:34:56", "bWlJWmo=", [0, 1, 2], { k0 = v0 } ] ] } ] plugin_output = "fake" } } sink{ Assert { plugin_input = "fake" rules = { row_rules = [ { rule_type = MAX_ROW rule_value = 1 }, { rule_type = MIN_ROW rule_value = 1 } ], field_rules = [ { field_name = c_null field_type = "null" field_value = [ { rule_type = NULL } ] }, { field_name = c_string field_type = string field_value = [ { rule_type = NOT_NULL equals_to = "AAA" } ] }, { field_name = c_boolean field_type = boolean field_value = [ { rule_type = NOT_NULL equals_to = false } ] }, { field_name = c_tinyint field_type = tinyint field_value = [ { rule_type = NOT_NULL equals_to = 1 } ] }, { field_name = c_smallint field_type = smallint field_value = [ { rule_type = NOT_NULL equals_to = 1 } ] }, { field_name = c_int field_type = int field_value = [ { rule_type = NOT_NULL equals_to = 333 } ] }, { field_name = c_bigint field_type = bigint field_value = [ { rule_type = NOT_NULL equals_to = 323232 } ] }, { field_name = c_float field_type = float field_value = [ { rule_type = NOT_NULL equals_to = 3.1 } ] }, { field_name = c_double 
field_type = double field_value = [ { rule_type = NOT_NULL equals_to = 9.33333 } ] }, { field_name = c_decimal field_type = "decimal(30, 8)" field_value = [ { rule_type = NOT_NULL equals_to = 99999.99999999 } ] }, { field_name = c_date field_type = date field_value = [ { rule_type = NOT_NULL equals_to = "2012-12-21" } ] }, { field_name = c_timestamp field_type = timestamp field_value = [ { rule_type = NOT_NULL equals_to = "2012-12-21T12:34:56" } ] }, { field_name = c_time field_type = time field_value = [ { rule_type = NOT_NULL equals_to = "12:34:56" } ] }, { field_name = c_bytes field_type = bytes field_value = [ { rule_type = NOT_NULL equals_to = "bWlJWmo=" } ] }, { field_name = c_array field_type = "array" field_value = [ { rule_type = NOT_NULL equals_to = [0, 1, 2] } ] }, { field_name = c_map field_type = "map" field_value = [ { rule_type = NOT_NULL equals_to = "{ 12:01:26 = v0 }" } ] }, { field_name = c_map_nest field_type = "map" field_value = [ { rule_type = NOT_NULL equals_to = { k1 = [123, "BBB-BB"] } } ] }, { field_name = c_row field_type = { c_null = "null" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_date = date c_timestamp = timestamp c_time = time c_bytes = bytes c_array = "array" c_map = "map" } field_value = [ { rule_type = NOT_NULL equals_to = [ null, "AAA", false, 1, 1, 333, 323232, 3.1, 9.33333, 99999.99999999, "2012-12-21", "2012-12-21T12:34:56", "12:34:56", "bWlJWmo=", [0, 1, 2], { k0 = v0 } ] } ] } ] } } } ``` ### 验证多表 验证多个表 ```hocon env { parallelism = 1 job.mode = BATCH } source { FakeSource { tables_configs = [ { row.num = 16 schema { table = "test.table1" fields { c_int = int c_bigint = bigint } } }, { row.num = 17 schema { table = "test.table2" fields { c_string = string c_tinyint = tinyint } } } ] } } transform { } sink { Assert { rules = { tables_configs = [ { table_path = "test.table1" row_rules = [ { 
rule_type = MAX_ROW rule_value = 16 }, { rule_type = MIN_ROW rule_value = 16 } ], field_rules = [{ field_name = c_int field_type = int field_value = [ { rule_type = NOT_NULL } ] }, { field_name = c_bigint field_type = bigint field_value = [ { rule_type = NOT_NULL } ] }] }, { table_path = "test.table2" row_rules = [ { rule_type = MAX_ROW rule_value = 17 }, { rule_type = MIN_ROW rule_value = 17 } ], field_rules = [{ field_name = c_string field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = c_tinyint field_type = tinyint field_value = [ { rule_type = NOT_NULL } ] }] } ] } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Cassandra.md ================================================ import ChangeLog from '../changelog/connector-cassandra.md'; # Cassandra > Cassandra 接收器连接器 ## 描述 将数据写入 Apache Cassandra. ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必需 | 默认值 | |-------------------|---------|----|---------------| | host | String | 是 | - | | keyspace | String | 是 | - | | table | String | 是 | - | | username | String | 否 | - | | password | String | 否 | - | | datacenter | String | 否 | datacenter1 | | consistency_level | String | 否 | LOCAL_ONE | | fields | Array | 否 | - | | batch_size | int | 否 | 5000 | | batch_type | String | 否 | UNLOGGED | | async_write | boolean | 否 | true | ### host [string] `Cassandra` 的集群地址,格式为 `host:port` , 允许指定多个 `hosts` . 例如 `"cassandra1:9042,cassandra2:9042"`. ### keyspace [string] `Cassandra` 键空间. ### table [String] `Cassandra` 的表名. ### username [string] `Cassandra` 用户的用户名. ### password [string] `Cassandra` 用户的密码. ### datacenter [String] `Cassandra` 的数据中心, 默认为 `datacenter1`. ### consistency_level [String] `Cassandra` 写入一致性级别, 默认为 `LOCAL_ONE`. ### fields [array] 需要输出到 `Cassandra` 的数据字段, 如果未配置, 如果未配置,它将自动适应 sink 表 `schema`. 
### batch_size [number] 通过 [Cassandra-Java-Driver](https://github.com/datastax/java-driver) 每次写入的行数, 默认值 `5000`. ### batch_type [String] `Cassandra` 批处理模式, 默认值 `UNLOGGED`. ### async_write [boolean] `Cassandra` 是否以异步模式写入, 默认值 `true`. ## 示例 ```hocon sink { Cassandra { host = "localhost:9042" username = "cassandra" password = "cassandra" datacenter = "datacenter1" keyspace = "test" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Clickhouse.md ================================================ import ChangeLog from '../changelog/connector-clickhouse.md'; # Clickhouse > Clickhouse 数据连接器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 核心特性 - [ ] [精准一次](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) > Clickhouse sink 插件通过实现幂等写入可以达到精准一次,需要配合 aggregating merge tree 支持重复数据删除的引擎。 - [x] [支持多表写入](../../introduction/concepts/connector-v2-features.md) ## 描述 用于将数据写入 Clickhouse。 ## 支持的数据源信息 为了使用 Clickhouse 连接器,需要以下依赖项。它们可以通过 install-plugin.sh 或从 Maven 中央存储库下载。 | 数据源 | 支持的版本 | 依赖 | |------------|-----------|------------------------------------------------------------------------------------| | Clickhouse | universal | [下载](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-clickhouse) | ## 数据类型映射 | SeaTunnel 数据类型 | Clickhouse 数据类型 | |----------------|-----------------------------------------------------------------------------------------------------------------------------------------------| | STRING | String / Int128 / UInt128 / Int256 / UInt256 / Point / Ring / Polygon MultiPolygon | | INT | Int8 / UInt8 / Int16 / UInt16 / Int32 | | BIGINT | UInt64 / Int64 / IntervalYear / IntervalQuarter / IntervalMonth / IntervalWeek / IntervalDay / IntervalHour / IntervalMinute / IntervalSecond | | DOUBLE | Float64 | | DECIMAL | Decimal | | FLOAT | Float32 | | DATE | Date | | TIME | DateTime | | ARRAY | Array | | MAP | Map | ## Sink 选项 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |---------------------------------------|---------|------|-------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | host | String | Yes | - | `ClickHouse` 集群地址, 格式是`host:port` , 允许多个`hosts`配置. 例如 `"host1:8123,host2:8123"`. | | database | String | Yes | - | `ClickHouse` 数据库名称. | | table | String | Yes | - | 表名称. | | username | String | Yes | - | `ClickHouse` 用户账号. | | password | String | Yes | - | `ClickHouse` 用户密码. 
| | clickhouse.config | Map | No | | 除了上述必须由 `clickhouse-jdbc` 指定的必填参数外,用户还可以指定多个可选参数,这些参数涵盖了 `clickhouse-jdbc` 提供的所有[参数](https://github.com/ClickHouse/clickhouse-jdbc/tree/master/clickhouse-client#configuration). | | bulk_size | String | No | 20000 | 每次通过[Clickhouse-jdbc](https://github.com/ClickHouse/clickhouse-jdbc) 写入的行数,即默认是20000. | | split_mode | String | No | false | 此模式仅支持引擎为`Distributed`的 `clickhouse` 表。选项 `internal_replication` 应该是 `true` 。他们将在 seatunnel 中拆分分布式表数据,并直接对每个分片进行写入。分片权重定义为 `clickhouse` 将计算在内。 | | sharding_key | String | No | - | 使用 `split_mode` 时,将数据发送到哪个节点是个问题,默认为随机选择,但可以使用`sharding_key`参数来指定分片算法的字段。此选项仅在`split_mode`为 `true` 时有效. | | primary_key | String | No | - | 标记`clickhouse`表中的主键列,并根据主键执行INSERT/UPDATE/DELETE到`clickhouse`表. | | support_upsert | Boolean | No | false | 支持按查询主键更新插入行. | | allow_experimental_lightweight_delete | Boolean | No | false | 允许基于`MergeTree`表引擎实验性轻量级删除. | | schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | schema保存模式,请参考下面的`schema_save_mode` | | data_save_mode | Enum | no | APPEND_DATA | 数据保存模式,请参考下面的`data_save_mode`。 | | custom_sql | String | no | - | 当data_save_mode设置为CUSTOM_PROCESSING时,必须同时设置CUSTOM_SQL参数。CUSTOM_SQL的值为可执行的SQL语句,在同步任务开启前SQL将会被执行 | | save_mode_create_template | string | no | see below | 见下文。 | | common-options | | No | - | Sink插件查用参数,详见[Sink常用选项](../common-options/sink-common-options.md). 
| ### schema_save_mode [Enum] 在开启同步任务之前,针对现有的表结构选择不同的处理方案。 选项介绍: `RECREATE_SCHEMA` :表不存在时创建,表保存时删除并重建。 `CREATE_SCHEMA_WHEN_NOT_EXIST` :表不存在时会创建,表存在时跳过。 `ERROR_WHEN_SCHEMA_NOT_EXIST` :表不存在时会报错。 `IGNORE` :忽略对表的处理。 ### data_save_mode [Enum] 在开启同步任务之前,针对目标端已有的数据选择不同的处理方案。 选项介绍: `DROP_DATA`: 保留数据库结构并删除数据。 `APPEND_DATA`:保留数据库结构,保留数据。 `CUSTOM_PROCESSING`:用户自定义处理。 `ERROR_WHEN_DATA_EXISTS`:有数据时报错。 ### save_mode_create_template 使用模板自动创建 Clickhouse 表, 会根据上游数据类型和schema类型创建相应的建表语句, 默认模板可以根据情况进行修改。 默认模板: ```sql CREATE TABLE IF NOT EXISTS `${database}`.`${table}` ( ${rowtype_primary_key}, ${rowtype_fields} ) ENGINE = MergeTree() ORDER BY (${rowtype_primary_key}) PRIMARY KEY (${rowtype_primary_key}) SETTINGS index_granularity = 8192 COMMENT '${comment}'; ``` 如果模板中填写了自定义字段,例如添加 id 字段 ```sql CREATE TABLE IF NOT EXISTS `${database}`.`${table}` ( id, ${rowtype_fields} ) ENGINE = MergeTree() ORDER BY (${rowtype_primary_key}) PRIMARY KEY (${rowtype_primary_key}) SETTINGS index_granularity = 8192 COMMENT '${comment}'; ``` 连接器会自动从上游获取对应类型完成填充, 并从“rowtype_fields”中删除 id 字段。 该方法可用于自定义字段类型和属性的修改。 可以使用以下占位符: - database:用于获取上游schema中的数据库。 - table_name:用于获取上游schema中的表名。 - rowtype_fields:用于获取上游schema中的所有字段,自动映射到 Clickhouse 的字段描述。 - rowtype_primary_key:用于获取上游模式中的主键(可能是列表)。 - rowtype_unique_key:用于获取上游模式中的唯一键(可能是列表)。 - comment:用于获取上游模式中的表注释。 ## 示例配置与案例 ### 如何创建一个clickhouse 同步任务 以下示例演示如何创建将随机生成的数据写入Clickhouse数据库的数据同步作业。 ```bash # Set the basic configuration of the task to be performed env { parallelism = 1 job.mode = "BATCH" checkpoint.interval = 1000 } source { FakeSource { row.num = 2 bigint.min = 0 bigint.max = 10000000 split.num = 1 split.read-interval = 300 schema { fields { c_bigint = bigint } } } } sink { Clickhouse { host = "127.0.0.1:9092" database = "default" table = "test" username = "xxxxx" password = "xxxxx" } } ``` > 小提示: > > 1.[SeaTunnel 部署文档](../../getting-started/locally/deployment.md).
    > 2.需要在同步前提前创建要写入的表.
    > 3.当写入 ClickHouse 表,无需设置其结构,因为连接器会在写入前向 ClickHouse 查询当前表的结构信息.
    ### Clickhouse 接收器配置 ```hocon sink { Clickhouse { host = "localhost:8123" database = "default" table = "fake_all" username = "xxxxx" password = "xxxxx" clickhouse.config = { max_rows_to_read = "100" read_overflow_mode = "throw" } } } ``` ### 切分模式 ```hocon sink { Clickhouse { host = "localhost:8123" database = "default" table = "fake_all" username = "xxxxx" password = "xxxxx" # split mode options split_mode = true sharding_key = "age" } } ``` ### CDC(Change data capture) Sink ```hocon sink { Clickhouse { host = "localhost:8123" database = "default" table = "fake_all" username = "xxxxx" password = "xxxxx" # cdc options primary_key = "id" support_upsert = true } } ``` ### CDC(Change data capture) for *MergeTree engine ```hocon sink { Clickhouse { host = "localhost:8123" database = "default" table = "fake_all" username = "xxxxx" password = "xxxxx" # cdc options primary_key = "id" support_upsert = true allow_experimental_lightweight_delete = true } } ``` ### 多表写入案例 在ClickHouse中提前创建下面两张数据表: ``` create table if not exists `default`.multi_sink_table1( `c_string` String, `c_boolean` Boolean, `c_tinyint` Int8, `c_smallint` Int16, `c_int` Int32, `c_bigint` Int64, `c_float` Float32, `c_double` Float64, `c_decimal` Decimal(30, 8), `c_date` Date, `c_time` DateTime64, `c_map` Map(String, Int32), `c_array` Array(Int32) )engine=Memory comment '''N''-N'; create table if not exists `default`.multi_sink_table2 as `default`.multi_sink_table1; ``` 然后使用的配置参考如下: ``` env { parallelism = 1 job.mode = "BATCH" job.name = "fake_to_clickhouse_with_multi_table" } source { FakeSource { tables_configs = [ { schema = { table = "multi_sink_table1" fields { c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_date = date c_time = timestamp c_map = "map" c_array = "array" } } row.num = 100 }, { schema = { table = "multi_sink_table2" fields { c_string = string c_boolean = 
boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_date = date c_time = timestamp c_map = "map" c_array = "array" } } row.num = 100 } ] plugin_output = "multi_sink_table" } } sink { Clickhouse { plugin_input = "multi_sink_table" host = "clickhouse:8123" database = "default" table = "${table_name}" username = "default" password = "" } } ``` 提交作业并执行成功后,我们可以看到 ClickHouse 数据表 `multi_sink_table1` 和 `multi_sink_table2` 的数据量都为100. ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/ClickhouseFile.md ================================================ import ChangeLog from '../changelog/connector-clickhouse.md'; # ClickhouseFile > Clickhouse文件数据接收器 ## 描述 该接收器使用clickhouse-local程序生成clickhouse数据文件,随后将其发送至clickhouse服务器,这个过程也称为bulkload。该接收器仅支持表引擎为 'Distributed'的表,且`internal_replication`选项需要设置为`true`。支持批和流两种模式。 ## 主要特性 - [ ] [精准一次](../../introduction/concepts/connector-v2-features.md) :::tip 提示 你也可以采用JDBC的方式将数据写入Clickhouse。 ::: ## 接收器选项 | 名称 | 类型 | 是否必须 | 默认值 | |------------------------|---------|------|----------------------------------------| | host | string | yes | - | | database | string | yes | - | | table | string | yes | - | | username | string | yes | - | | password | string | yes | - | | clickhouse_local_path | string | yes | - | | sharding_key | string | no | - | | copy_method | string | no | scp | | node_free_password | boolean | no | false | | node_pass | list | no | - | | node_pass.node_address | string | no | - | | node_pass.username | string | no | "root" | | node_pass.password | string | no | - | | compatible_mode | boolean | no | false | | file_fields_delimiter | string | no | "\t" | | file_temp_path | string | no | "/tmp/seatunnel/clickhouse-local/file" | | key_path | string | no | "/tmp/id_rsa" | | common-options | | no | - | ### host [string] `ClickHouse`集群地址,格式为`host:port`,允许同时指定多个`hosts`。例如`"host1:8123,host2:8123"`。 ### database 
[string] `ClickHouse`数据库名。 ### table [string] 表名称。 ### username [string] 连接`ClickHouse`的用户名。 ### password [string] 连接`ClickHouse`的用户密码。 ### sharding_key [string] 当ClickhouseFile需要拆分数据时,需要考虑的问题是当前数据需要发往哪个节点,默认情况下采用的是随机算法,我们也可以使用'sharding_key'参数为某字段指定对应的分片算法。 ### clickhouse_local_path [string] 在spark节点上的clickhouse-local程序路径。由于每个任务都会被调用,所以每个spark节点上的clickhouse-local程序路径必须相同。 ### copy_method [string] 为文件传输指定方法,默认为scp,可选值为scp和rsync。 ### node_free_password [boolean] 由于seatunnel需要使用scp或者rsync进行文件传输,因此seatunnel需要clickhouse服务端访问权限。如果每个spark节点与clickhouse服务端都配置了免密登录,则可以将此选项配置为true,否则需要在node_pass参数中配置对应节点的密码。 ### node_pass [list] 用来保存所有clickhouse服务器地址及其对应的访问密码。 ### node_pass.node_address [string] clickhouse服务器节点地址。 ### node_pass.username [string] clickhouse服务器节点用户名,默认为root。 ### node_pass.password [string] clickhouse服务器节点的访问密码。 ### compatible_mode [boolean] 在低版本的Clickhouse中,clickhouse-local程序不支持`--path`参数,需要设置该参数来采用其他方式实现`--path`参数功能。 ### file_fields_delimiter [string] ClickHouseFile使用CSV格式来临时保存数据。但如果数据中包含CSV的分隔符,可能会导致程序异常。使用此配置可以避免该情况。配置的值必须正好为一个字符的长度。 ### file_temp_path [string] ClickhouseFile本地存储临时文件的目录。 ### key_path [string] 用于scp或rsync传输文件的私钥路径。 ### common options Sink插件常用参数,请参考[Sink常用选项](../common-options/sink-common-options.md)获取更多细节信息。 ## 示例 ```hocon ClickhouseFile { host = "192.168.0.1:8123" database = "default" table = "fake_all" username = "default" password = "" clickhouse_local_path = "/Users/seatunnel/Tool/clickhouse local" sharding_key = "age" node_free_password = false node_pass = [{ node_address = "192.168.0.1" password = "seatunnel" }] } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Cloudberry.md ================================================ import ChangeLog from '../changelog/connector-cloudberry.md'; # Cloudberry > JDBC Cloudberry Sink 连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 通过 JDBC 写入数据。Cloudberry 目前没有自己的原生驱动程序。它使用 PostgreSQL 的驱动程序进行连接,并遵循 PostgreSQL 的实现。 支持批处理模式和流模式,支持并发写入,支持精确一次语义(使用 XA 事务保证)。 ## 需要的依赖项 ### 对于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/org.postgresql/postgresql) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 ### 对于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/org.postgresql/postgresql) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 ## 主要特性 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) > 使用 `XA 事务` 来确保 `精确一次`。因此,只有支持 `XA 事务` 的数据库才支持 `精确一次`。您可以设置 `is_exactly_once=true` 来启用它。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动程序 | URL | Maven | |--------|-----------|---------|-----|-------| | Cloudberry | 使用 PostgreSQL 驱动程序实现 | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [下载](https://mvnrepository.com/artifact/org.postgresql/postgresql) | ## 数据库依赖 > 请下载 PostgreSQL 驱动程序 jar 并将其复制到 '$SEATUNNEL_HOME/plugins/jdbc/lib/' 工作目录
    > 例如:cp postgresql-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## 数据类型映射 Cloudberry 使用 PostgreSQL 的数据类型实现。请参考 PostgreSQL 文档了解数据类型兼容性和映射。 ## 选项 Cloudberry 连接器使用与 PostgreSQL 相同的选项。有关详细的配置选项,请参考 PostgreSQL 文档。 关键选项包括: - url(必需):JDBC 连接 URL - driver(必需):驱动程序类名(org.postgresql.Driver) - user/password:身份验证凭证 - query 或 database/table 组合:要写入的数据和方式 - is_exactly_once:使用 XA 事务启用精确一次语义 - batch_size:控制批量写入行为 ## 任务示例 ### 简单示例 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } sink { jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" query = "insert into test_table(name,age) values(?,?)" } } ``` ### 生成 Sink SQL ```hocon sink { Jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" generate_sink_sql = true database = "mydb" table = "public.test_table" } } ``` ### 精确一次 ```hocon sink { jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" query = "insert into test_table(name,age) values(?,?)" is_exactly_once = "true" xa_data_source_class_name = "org.postgresql.xa.PGXADataSource" } } ``` ### CDC(变更数据捕获)事件 ```hocon sink { jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" generate_sink_sql = true database = "mydb" table = "sink_table" primary_keys = ["id","name"] field_ide = UPPERCASE } } ``` ### 保存模式功能 ```hocon sink { Jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" generate_sink_sql = true database = "mydb" table = "public.test_table" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` 有关更多详细的示例和选项,请参考 PostgreSQL 连接器文档。 
## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Console.md ================================================ import ChangeLog from '../changelog/connector-console.md'; # Console > Console 数据接收器 ## 支持连接器版本 - 所有版本 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 接收Source端传入的数据并打印到控制台。支持批同步和流同步两种模式。 > 例如,来自上游的数据为 [`age: 12, name: jared`] ,则发送到控制台的内容为: `{"name":"jared","age":17}` ## 主要特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) ## 接收器选项 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |--------------------|---------|------|-----|-----------------------------------------------------------| | common-options | | 否 | - | Sink插件常用参数,请参考 [Sink常用选项](../common-options/sink-common-options.md) 了解详情 | | log.print.data | boolean | 否 | - | 确定是否应在日志中打印数据的标志。默认值为`true` | | log.print.delay.ms | int | 否 | - | 将每个数据项打印到日志之间的延迟(以毫秒为单位)。默认值为`0` | ## 任务示例 ### 简单示例 > 随机生成的数据,包含两个字段,即 `name`(字符串类型)和 `age`(整型),写入控制台,并行度为 `1` ``` env { parallelism = 1 job.mode = "STREAMING" } source { FakeSource { plugin_output = "fake" schema = { fields { name = "string" age = "int" } } } } sink { Console { plugin_input = "fake" } } ``` ### 多数据源示例 > 多数据源示例,通过配置可以指定数据源写入指定接收器 ``` env { parallelism = 1 job.mode = "STREAMING" } source { FakeSource { plugin_output = "fake1" schema = { fields { id = "int" name = "string" age = "int" sex = "string" } } } FakeSource { plugin_output = "fake2" schema = { fields { name = "string" age = "int" } } } } sink { Console { plugin_input = "fake1" } Console { plugin_input = "fake2" } } ``` ## 控制台示例数据 控制台打印的输出: ``` 2022-12-19 11:01:45,417 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - output rowType: name, age 2022-12-19 11:01:46,489 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=1: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: CpiOd, 8520946 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=2: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: eQqTs, 1256802974 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=3: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: UsRgO, 
2053193072 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=4: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jDQJj, 1993016602 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=5: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: rqdKp, 1392682764 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=6: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: wCoWN, 986999925 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=7: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: qomTU, 72775247 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=8: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jcqXR, 1074529204 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=9: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: AkWIO, 1961723427 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=10: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: hBoib, 929089763 ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/CosFile.md ================================================ import ChangeLog from '../changelog/connector-file-cos.md'; # CosFile > Cos 文件接收器连接器 ## 描述 将数据输出到cos文件系统. :::提示 如果你使用spark/flink,为了使用这个连接器,你必须确保你的spark/flilk集群已经集成了hadoop。测试的hadoop版本是2.x 如果你使用SeaTunnel Engine,当你下载并安装SeaTunnel引擎时,它会自动集成hadoop jar。您可以在${SEATUNNEL_HOME}/lib下检查jar包以确认这一点. 
要使用此连接器,您需要将hadoop cos-{hadoop.version}-{version}.jar和cos_api-bundle-{version}.jar位于${SEATUNNEL_HOME}/lib目录中,下载:[Hoop cos发布](https://github.com/tencentyun/hadoop-cos/releases). 它只支持hadoop 2.6.5+和8.0.2版本+. ::: ## 关键特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 默认情况下,我们使用2PC commit来确保 `精确一次` - [x] 文件格式类型 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] canal_json - [x] debezium_json - [x] maxwell_json ## 选项 | 名称 | 类型 | 必需 | 默认值 | 描述 | |---------------------------------------|---------|----|--------------------------------------------|-----------------------------------------------------------------| | path | string | 是 | - | | | tmp_path | string | 否 | /tmp/seatunnel | 结果文件将首先写入tmp路径,然后使用“mv”将tmp目录提交到目标目录。需要一个COS目录. | | bucket | string | 是 | - | | | secret_id | string | 是 | - | | | secret_key | string | 是 | - | | | region | string | 是 | - | | | custom_filename | boolean | 否 | false | 是否需要自定义文件名 | | file_name_expression | string | 否 | "${transactionId}" | 仅在custom_filename为true时使用 | | filename_time_format | string | 否 | "yyyy.MM.dd" | 仅在custom_filename为true时使用 | | file_format_type | string | 否 | "csv" | | | filename_extension | string | 否 | - | 使用自定义的文件扩展名覆盖默认的文件扩展名。 例如:`.xml`, `.json`, `dat`, `.customtype` | | field_delimiter | string | 否 | '\001' | 仅在file_format为text时使用 | | row_delimiter | string | 否 | "\n" | 仅在file_format为 `text`、`csv`、`json` 时使用 | | have_partition | boolean | 否 | false | 是否需要处理分区. 
| | partition_by | array | 否 | - | 只有在have_partition为true时才使用 | | partition_dir_expression | string | 否 | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | 只有在have_partition为true时才使用 | | is_partition_field_write_in_file | boolean | 否 | false | 只有在have_partition为true时才使用 | | sink_columns | array | 否 | | 当此参数为空时,所有字段都是接收列 | | is_enable_transaction | boolean | 否 | true | | | batch_size | int | 否 | 1000000 | | | compress_codec | string | 否 | none | | | common-options | object | 否 | - | | | max_rows_in_memory | int | 否 | - | 仅在file_format为excel时使用. | | sheet_name | string | 否 | Sheet${Random number} | 仅在file_format为excel时使用. | | csv_string_quote_mode | enum | 否 | MINIMAL | 仅在file_format为csv时使用. | | xml_root_tag | string | 否 | RECORDS | 仅在file_format为xml时使用. | | xml_row_tag | string | 否 | RECORD | 仅在file_format为xml时使用. | | xml_use_attr_format | boolean | 否 | - | 仅在file_format为xml时使用. | | single_file_mode | boolean | 否 | false | 每个并行处理只会输出一个文件。启用此参数后,batch_size将不会生效。输出文件名没有文件块后缀. | | create_empty_file_when_no_data | boolean | 否 | false | 当上游没有数据同步时,仍然会生成相应的数据文件. | | parquet_avro_write_timestamp_as_int96 | boolean | 否 | false | 仅在file_format为parquet时使用. | | parquet_avro_write_fixed_as_int96 | array | 否 | - | 仅在file_format为parquet时使用. | | encoding | string | 否 | "UTF-8" | 仅当file_format_type为json、text、csv、xml时使用. | | merge_update_event | boolean | 否 | false | 仅当file_format_type为canal_json、debezium_json、maxwell_json. | ### path [string] 目标目录路径是必需的. ### bucket [string] cos文件系统的bucket地址,例如:`cosn://seatunnel-test-1259587829` ### secret_id [string] cos文件系统的密钥id. ### secret_key [string] cos文件系统的密钥. ### region [string] cos文件系统的分区. ### custom_filename [boolean] 是否自定义文件名 ### file_name_expression [string] 仅在 `custom_filename` 为 `true`时使用 `file_name_expression`描述了将在`path`中创建的文件表达式。我们可以在`file_name_expression`中添加变量`${now}`或`${uuid}`,类似于`test_${uuid}_${now}`, `${now}`表示当前时间,其格式可以通过指定选项`filename_time_format`来定义. 
请注意,如果`is_enable_transaction`为`true`,我们将自动添加`${transactionId}_`在文件的开头 ### filename_time_format [string] 仅在 `custom_filename` 为 `true` 时使用 当 `file_name_expression` 参数中的格式为 `xxxx-${now}` 时,`filename_time_format` 可以指定路径的时间格式,默认值为 `yyyy.MM.dd`。常用的时间格式如下: | 符号| 描述 | |--------|----------| | y | 年 | | M | 月 | | d | 日 | | H | 时 (0-23) | | m | 分 | | s | 秒 | ### file_format_type [string] 我们支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `canal_json` `debezium_json` `maxwell_json` 请注意,最终文件名将以 file_format 的后缀结尾, 文本文件的后缀为 `txt`. ### field_delimiter [string] 数据行中列之间的分隔符. 仅 `text` 文件格式需要. ### row_delimiter [string] 文件中行之间的分隔符. 仅 `text`、`csv`、`json` 文件格式需要. ### have_partition [boolean] 是否需要处理分区. ### partition_by [array] 仅在 `have_partition` 为 `true` 时使用. 基于选定字段对数据进行分区. ### partition_dir_expression [string] 仅在 `have_partition` 为 `true` 时使用. 如果指定了 `partition_by` ,我们将根据分区信息生成相应的分区目录,并将最终文件放置在分区目录中。 默认的 `partition_dir_expression` 是 `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` 是第一个分区字段 , `v0` 是第一个分区字段的值. ### is_partition_field_write_in_file [boolean] 仅在 `have_partition` 为 `true` 时使用. 如果 `is_partition_field_write_in_file` 为 `true`, 分区字段及其值将写入数据文件. 例如,如果你想写一个Hive数据文件,它的值应该是 `false`. ### sink_columns [array] 哪些列需要写入文件,默认值是从 `Transform` 或 `Source` 获取的所有列. 字段的顺序决定了文件实际写入的顺序. ### is_enable_transaction [boolean] 如果 `is_enable_transaction` 为 `true`, 我们将确保数据在写入目标目录时不会丢失或重复. 请注意,如果 `is_enable_transaction` 为 `true`, 我们将自动添加 `${transactionId}_` 在文件的开头. 现在只支持 `true` . ### batch_size [int] 文件中的最大行数。对于SeaTunnel引擎,文件中的行数由 `batch_size` 和 `checkpoint.interval` 共同决定. 如果 `checkpoint.interval` 的值足够大, 接收器写入程序将在文件中写入行,直到文件中的行大于 `batch_size`. 如果 `checkpoint.interval` 较小, 则接收器写入程序将在新的检查点触发时创建一个新文件. 
### compress_codec [string] 文件的压缩编解码器和支持的详细信息如下所示: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc: `lzo` `snappy` `lz4` `zlib` `none` - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` Tips: excel 类型不支持任何压缩格式 ### common options 接收器写入插件常用参数,请参考 [Sink Common Options](../common-options/sink-common-options.md) 了解详细信息. ### max_rows_in_memory [int] 当文件格式为Excel时,内存中可以缓存的最大数据项数. ### sheet_name [string] 写入工作簿的工作表名称 ### csv_string_quote_mode [string] 当文件格式为CSV时,CSV的字符串引号模式. - ALL: 所有字符串字段都将加引号. - MINIMAL: 仅对包含特殊字符(如字段分隔符、引号字符或行分隔符字符串中的任何字符)的字段加引号. - NONE: 从不给字段加引号。当分隔符出现在数据中时,输出时会用转义符作为前缀。如果未设置转义符,格式验证将抛出异常. ### xml_root_tag [string] 指定XML文件中根元素的标记名. ### xml_row_tag [string] 指定XML文件中数据行的标记名称. ### xml_use_attr_format [boolean] 指定是否使用标记属性格式处理数据. ### parquet_avro_write_timestamp_as_int96 [boolean] 支持从时间戳写入Parquet INT96,仅适用于Parquet文件. ### parquet_avro_write_fixed_as_int96 [array] 支持从12字节字段写入Parquet INT96,仅适用于Parquet文件. ### encoding [string] 仅当file_format_type为json、text、csv、xml时使用. 要写入的文件的编码。此参数将由`Charset.forName(encoding)` 解析. ### merge_update_event [boolean] 仅当file_format_type为canal_json、debezium_json、maxwell_json时使用. 
设置成true,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 会合并成 UPDATE; 设置成false,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 不会合并; ## 示例 对于具有 `have_partition` 、 `custom_filename` 和 `sink_columns` 的文本文件格式 ```hocon CosFile { path="/sink" bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true } ``` 适用于带有 `have_partition` 和 `sink_columns` 的 parquet 文件格式 ```hocon CosFile { path="/sink" bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true file_format_type = "parquet" sink_columns = ["name","age"] } ``` 对于orc文件格式的简单配置 ```bash CosFile { path="/sink" bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" file_format_type = "orc" } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/DB2.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # DB2 > JDBC DB2接收器连接器 ## 支持以下引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 通过jdbc写入数据。支持批处理模式和流模式,支持并发写入,只支持一次 语义(使用XA事务保证). ## 使用依赖关系 ### 适用于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc driver jar package](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) 已放置在目录 `${SEATUNNEL_HOME}/plugins/`. ### 适用于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc driver jar package](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) 已放置在目录 `${SEATUNNEL_HOME}/lib/`. ## 关键特性 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) > 使用 `Xa transactions` 来确保 `精确一次`. 因此,数据库只支持 `exactly-once` 即 > 支持 `Xa transactions`. 您可以设置 `is_exactly_once=true` 来启用它. ## 支持的数据源信息 | 数据库 | 支持版本 | 驱动 | Url | Maven | |------------|---------------------------------------------------------|--------------------------------|-----------------------------------|-----------------------------------------------------------------------| | DB2 | Different dependency version has different driver class. | com.ibm.db2.jdbc.app.DB2Driver | jdbc:db2://127.0.0.1:50000/dbname | [Download](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) | ## 数据类型映射 | DB2数据类型 | SeaTunnel 数据类型 | |------------------------------------------------------------------------------------------------------|---------------------| | BOOLEAN | BOOLEAN | | SMALLINT | SHORT | | INT
    INTEGER
    | INTEGER | | BIGINT | LONG | | DECIMAL
    DEC
    NUMERIC
    NUM | DECIMAL(38,18) | | REAL | FLOAT | | FLOAT
    DOUBLE
    DOUBLE PRECISION
    DECFLOAT | DOUBLE | | CHAR
    VARCHAR
    LONG VARCHAR
    CLOB
    GRAPHIC
    VARGRAPHIC
    LONG VARGRAPHIC
    DBCLOB | STRING | | BLOB | BYTES | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | ROWID
    XML | Not supported yet | ## 选项 | 名称 | 类型 | 必需 | 默认值 | 描述 | |------------------------------|---------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | JDBC连接的URL。请参考案例 : jdbc:db2://127.0.0.1:50000/dbname | | driver | String | Yes | - | 用于连接到远程数据源的jdbc类名,
    如果使用DB2,则值为 `com.ibm.db2.jdbc.app.DB2Driver`. | | username | String | No | - | 连接实例用户名 | | password | String | No | - | 连接实例密码 | | query | String | No | - | 使用此sql将上游输入数据写入数据库。例如 `INSERT ...`,`query` 具有更高的优先级 | | database | String | No | - | 使用这个 `database` 和 `table-name` 自动生成sql并接收上游输入数据写入数据库.
    此选项与 `query` 互斥,具有更高的优先级. | | table | String | No | - | 使用数据库和此表名自动生成sql并接收上游输入数据写入数据库.
    此选项与 `query` 互斥,具有更高的优先级. | | primary_keys | Array | No | - | 此选项用于在自动生成sql时支持 `insert`, `delete`, 和 `update` 等操作. | | connection_check_timeout_sec | Int | No | 30 | 等待用于验证连接的数据库操作完成的时间(秒). | | max_retries | Int | No | 0 | 提交失败的重试次数 (执行批处理) | | batch_size | Int | No | 1000 | 对于批量写入,当缓冲记录的数量达到 `batch_size` 的数量或时间达到 `checkpoint.interval` 时
    , 数据将被刷新到数据库中 | | is_exactly_once | Boolean | No | false | 是否启用精确一次语义,这将使用 Xa 事务. 如果启用,则需要
    设置 `xa_data_source_class_name`. | | generate_sink_sql | Boolean | No | false | 根据要写入的数据库表生成sql语句 | | xa_data_source_class_name | String | No | - | 数据库Driver的 xa 数据源类名,例如 DB2 是 `com.db2.cj.jdbc.Db2XADataSource`,
    其他数据来源请参考附录 | | max_commit_attempts | Int | No | 3 | 事务提交失败的重试次数 | | transaction_timeout_sec | Int | No | -1 | 事务打开后的超时,默认值为-1(永不超时). 请注意,设置超时可能会影响<br/>精确一次语义 | | auto_commit | Boolean | No | true | 默认情况下启用自动事务提交 | | properties | Map | No | - | 附加连接配置参数,当属性和URL具有相同的参数时,优先级由驱动程序的特定实现决定. 例如,在MySQL中,属性优先于URL. | | common-options | | no | - | Sink插件常用参数,详见 [Sink Common Options](../common-options/sink-common-options.md) | ### 小贴士 > 如果未设置partition_column,它将以单并发运行,如果设置了partition_column,它将根据任务的并发性并行执行. ## 任务示例 ### 简单 > 此示例定义了一个SeaTunnel同步任务,该任务通过FakeSource自动生成数据并将其发送到JDBC Sink。FakeSource总共生成16行数据(row.num=16),每行有两个字段,name(字符串类型)和age(int类型)。最终的目标表是test_table,表中也将有16行数据。在运行此作业之前,您需要在DB2中创建数据库测试和表test_table。如果您尚未安装和部署SeaTunnel,则需要按照[Install SeaTunnel](../../getting-started/locally/deployment.md)中的说明安装和部署SeaTunnel。然后按照[Quick Start With SeaTunnel Engine](../../getting-started/locally/quick-start-seatunnel-engine.md) 中的说明运行此作业. ``` # 定义运行时环境 env { parallelism = 1 job.mode = "BATCH" } source { # 这是一个示例源插件 **仅用于测试和演示功能源插件** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } # 如果你想了解更多关于如何配置seatunnel的信息,并查看完整的源插件列表, # 请前往 https://seatunnel.apache.org/docs/connectors/source } transform { # 如果你想了解更多关于如何配置seatunnel的信息,并查看转换插件的完整列表 # 请前往 https://seatunnel.apache.org/docs/transforms } sink { jdbc { url = "jdbc:db2://127.0.0.1:50000/dbname" driver = "com.ibm.db2.jdbc.app.DB2Driver" username = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" } # 如果你想了解更多关于如何配置seatunnel的信息,并查看完整的接收插件列表, # 请前往 https://seatunnel.apache.org/docs/connectors/sink } ``` ### 生成 Sink SQL > 此示例不需要编写复杂的sql语句,您可以配置数据库名称表名以自动为您生成add语句 ``` sink { jdbc { url = "jdbc:db2://127.0.0.1:50000/dbname" driver = "com.ibm.db2.jdbc.app.DB2Driver" username = "root" password = "123456" # Automatically generate sql statements based on database table names generate_sink_sql = true database = test table = test_table } } ``` ### 精确一次 
> 对于需要精确一次写入的场景,我们保证精确一次语义 ``` sink { jdbc { url = "jdbc:db2://127.0.0.1:50000/dbname" driver = "com.ibm.db2.jdbc.app.DB2Driver" max_retries = 0 username = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" is_exactly_once = "true" xa_data_source_class_name = "com.db2.cj.jdbc.Db2XADataSource" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Databend.md ================================================ import ChangeLog from '../changelog/connector-databend.md'; # Databend > Databend sink 连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [ ] [支持多表写入](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) ## 描述 用于向 Databend 写入数据的 sink 连接器。支持批处理和流处理模式。 Databend sink 内部通过 stage attachment 实现数据的批量导入。 ## 依赖 ### 对于 Spark/Flink > 1. 你需要下载 [Databend JDBC driver jar package](https://github.com/databendlabs/databend-jdbc/) 并添加到目录 `${SEATUNNEL_HOME}/plugins/`. ### 对于 SeaTunnel Zeta > 1. 你需要下载 [Databend JDBC driver jar package](https://github.com/databendlabs/databend-jdbc/) 并添加到目录 `${SEATUNNEL_HOME}/lib/`. ## Sink 选项 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |---------------------|------|----------|--------|------------------------------------| | url | String | 是 | - | Databend JDBC 连接 URL | | username | String | 是 | - | Databend 数据库用户名 | | password | String | 是 | - | Databend 数据库密码 | | database | String | 否 | - | Databend 数据库名称,默认使用连接 URL 中指定的数据库名 | | table | String | 否 | - | Databend 表名称 | | batch_size | Integer | 否 | 1000 | 批量写入的记录数 | | auto_commit | Boolean | 否 | true | 是否自动提交事务 | | max_retries | Integer | 否 | 3 | 写入失败时的最大重试次数 | | schema_save_mode | Enum | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | 保存 Schema 的模式 | | data_save_mode | Enum | 否 | APPEND_DATA | 保存数据的模式 | | custom_sql | String | 否 | - | 自定义写入 SQL,通常用于复杂的写入场景 | | execute_timeout_sec | Integer | 否 | 300 | 执行SQL的超时时间(秒) | | jdbc_config | Map | 否 | - | 额外的 JDBC 连接配置,如连接超时参数等 | | conflict_key | String | 否 | - | cdc 模式下的冲突键,用于确定冲突解决的主键 | | enable_delete | Boolean | 否 | false | cdc 模式下是否允许删除操作 | ### schema_save_mode [Enum] 在开启同步任务之前,针对现有的表结构选择不同的处理方案。 选项介绍: `RECREATE_SCHEMA` :表不存在时创建,表存在时删除并重建。 `CREATE_SCHEMA_WHEN_NOT_EXIST` :表不存在时会创建,表存在时跳过。 `ERROR_WHEN_SCHEMA_NOT_EXIST` :表不存在时会报错。 `IGNORE` :忽略对表的处理。 ### data_save_mode [Enum] 在开启同步任务之前,针对目标端已有的数据选择不同的处理方案。 选项介绍: `DROP_DATA`: 保留数据库结构并删除数据。 `APPEND_DATA`:保留数据库结构,保留数据。 `CUSTOM_PROCESSING`:用户自定义处理。 
`ERROR_WHEN_DATA_EXISTS`:有数据时报错。 ## 数据类型映射 | SeaTunnel 数据类型 | Databend 数据类型 | |-----------------|---------------| | BOOLEAN | BOOLEAN | | TINYINT | TINYINT | | SMALLINT | SMALLINT | | INT | INT | | BIGINT | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | STRING | STRING | | BYTES | VARBINARY | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | ## 任务示例 ### 简单示例 ```hocon env { execution.parallelism = 1 job.mode = "BATCH" } source { FakeSource { row.num = 10 schema = { fields { name = string age = int score = double } } } } sink { Databend { url = "jdbc:databend://localhost:8000" username = "root" password = "" database = "default" table = "target_table" batch_size = 1000 } } ``` ### 使用自定义 SQL 写入 ```hocon sink { Databend { url = "jdbc:databend://localhost:8000" username = "root" password = "" database = "default" table = "target_table" custom_sql = "INSERT INTO default.target_table(name, age, score) VALUES(?, ?, ?)" } } ``` ### 使用 Schema 保存模式 ```hocon sink { Databend { url = "jdbc:databend://localhost:8000" username = "root" password = "" database = "default" table = "target_table" schema_save_mode = "RECREATE_SCHEMA" data_save_mode = "APPEND_DATA" } } ``` ### CDC mode ```hocon sink { Databend { url = "jdbc:databend://databend:8000/default?ssl=false" username = "root" password = "" database = "default" table = "sink_table" # Enable CDC mode batch_size = 1 interval = 3 conflict_key = "id" enable_delete = true } } ``` ## 相关链接 - [Databend 官方网站](https://databend.rs/) - [Databend JDBC 驱动](https://github.com/databendlabs/databend-jdbc/) ## Changelog ================================================ FILE: docs/zh/connectors/sink/Datahub.md ================================================ import ChangeLog from '../changelog/connector-datahub.md'; # DataHub > DataHub 接收器连接器 ## 描述 一个使用向 DataHub 发送消息的接收器插件 ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必需 | 默认值 | |----------------|--------|-----|------| | 
endpoint | string | 是 | - | | accessId | string | 是 | - | | accessKey | string | 是 | - | | project | string | 是 | - | | topic | string | 是 | - | | timeout | int | 否 | 3000 | | retryTimes | int | 否 | 3 | | common-options | | 否 | - | ### endpoint [string] 您的 DataHub 端点,以 http 开头 ### accessId [string] 您的 DataHub accessId,可从阿里云获取 ### accessKey [string] 您的 DataHub accessKey,可从阿里云获取 ### project [string] 您在阿里云中创建的DataHub项目 ### topic [string] 您的DataHub主题 ### timeout [int] 最大连接超时 ### retryTimes [int] 客户端写入记录失败时的最大重试次数 ### common options 接收器插件常用参数,详见 [Sink Common Options](../common-options/sink-common-options.md) ## 示例 ```hocon sink { DataHub { endpoint="yourendpoint" accessId="xxx" accessKey="xxx" project="projectname" topic="topicname" timeout=3000 retryTimes=3 } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/DingTalk.md ================================================ import ChangeLog from '../changelog/connector-dingtalk.md'; # 钉钉 > 钉钉 数据接收器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) ## 描述 一个使用钉钉机器人发送消息的Sink插件。 ## Options | 名称 | 类型 | 是否必须 | 默认值 | |----------------|--------|------|-----| | url | String | 是 | - | | secret | String | 是 | - | | common-options | | 否 | - | ### url [String] 钉钉机器人地址格式为 https://oapi.dingtalk.com/robot/send?access_token=XXXXXX(String) ### secret [String] 钉钉机器人的密钥 (String) ### common options Sink插件的通用参数,请参考 [Sink Common Options](../common-options/sink-common-options.md) 了解详情 ## 任务示例 ```hocon sink { DingTalk { url="https://oapi.dingtalk.com/robot/send?access_token=ec646cccd028d978a7156ceeac5b625ebd94f586ea0743fa501c100007890" secret="SEC093249eef7aa57d4388aa635f678930c63db3d28b2829d5b2903fc1e5c10000" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Doris.md ================================================ import ChangeLog from '../changelog/connector-doris.md'; # Doris > Doris sink 连接器 ## 支持的doris版本 - exactly-once & cdc 支持 `Doris version is >= 1.1.x` - 支持数组数据类型 `Doris version is >= 1.2.x` - 将支持Map数据类型 `Doris version is 2.x` ## 引擎支持 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) ## 描述 用于发送数据到doris. 同时支持流模式和批模式处理. Doris Sink连接器的内部实现是通过stream load批量缓存和导入的。 ## 依赖 ### 对于 Spark/Flink > 1. 你需要下载 [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 并添加到目录 `${SEATUNNEL_HOME}/plugins/`. ### 对于 SeaTunnel Zeta > 1. 你需要下载 [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 并添加到目录 `${SEATUNNEL_HOME}/lib/`. ## Sink 选项 | Name | Type | Required | Default | Description | |--------------------------------|---------|----------|------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------| | fenodes | String | Yes | - | `Doris` 集群 fenodes 地址, 格式是 `"fe_ip:fe_http_port, ..."` | | query-port | int | No | 9030 | `Doris` Fenodes mysql协议查询端口 | | username | String | Yes | - | `Doris` 用户名 | | password | String | Yes | - | `Doris` 密码 | | database | String | Yes | - | `Doris`数据库名称 , 使用 `${database_name}` 表示上游数据库名称。 | | table | String | Yes | - | `Doris` 表名, 使用 `${table_name}` 表示上游表名。 | | table.identifier | String | Yes | - | `Doris` 表的名称,2.3.5 版本后将弃用,请使用 `database` 和 `table` 代替。 | | sink.label-prefix | String | Yes | - | stream load导入使用的标签前缀。 在2pc场景下,需要全局唯一性来保证SeaTunnel的EOS语义。 | | sink.enable-2pc | bool | No | false | 是否启用两阶段提交(2pc),默认为 false。 对于两阶段提交,请参考[此处](https://doris.apache.org/docs/data-operate/transaction?_highlight=two&_highlight=phase#stream-load-2pc)。 | | sink.enable-delete | bool | No | - | 是否启用删除。 该选项需要Doris表开启批量删除功能(0.15+版本默认开启),且仅支持Unique模型。 您可以在此[link](https://doris.apache.org/docs/dev/data-operate/delete/batch-delete-manual/)获得更多详细信息 | | sink.check-interval | int | No | 10000 | 加载过程中检查异常时间间隔。 | | sink.max-retries | int | No | 3 | 向数据库写入记录失败时的最大重试次数。 | | sink.buffer-size | int | No | 256 * 1024 | 用于缓存stream 
load数据的缓冲区大小。 | | sink.buffer-count | int | No | 3 | 用于缓存stream load数据的缓冲区计数。 | | doris.batch.size | int | No | 1024 | 每次http请求写入doris的批量大小,当row达到该大小或者执行checkpoint时,缓存的数据就会写入服务器。 | | needs_unsupported_type_casting | boolean | No | false | 是否启用不支持的类型转换,例如 Decimal64 到 Double。 | | case_sensitive | boolean | No | true | 是否保留表名和字段名的原始大小写。当设置为 false 时,表名和字段名将被转换为小写。 | | schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | schema保存模式,请参考下面的`schema_save_mode` | | data_save_mode | Enum | no | APPEND_DATA | 数据保存模式,请参考下面的`data_save_mode`。 | | save_mode_create_template | string | no | see below | 见下文。 | | custom_sql | String | no | - | 当data_save_mode选择CUSTOM_PROCESSING时,需要填写CUSTOM_SQL参数。 该参数通常填写一条可以执行的SQL。 SQL将在同步任务之前执行。 | | doris.config | map | yes | - | 该选项用于支持自动生成sql时的insert、delete、update等操作,以及支持的格式。 | ### schema_save_mode [Enum] 在开启同步任务之前,针对现有的表结构选择不同的处理方案。 选项介绍: `RECREATE_SCHEMA` :表不存在时创建,表保存时删除并重建。 `CREATE_SCHEMA_WHEN_NOT_EXIST` :表不存在时会创建,表存在时跳过。 `ERROR_WHEN_SCHEMA_NOT_EXIST` :表不存在时会报错。 `IGNORE` :忽略对表的处理。 ### data_save_mode [Enum] 在开启同步任务之前,针对目标端已有的数据选择不同的处理方案。 选项介绍: `DROP_DATA`: 保留数据库结构并删除数据。 `APPEND_DATA`:保留数据库结构,保留数据。 `CUSTOM_PROCESSING`:用户自定义处理。 `ERROR_WHEN_DATA_EXISTS`:有数据时报错。 ### save_mode_create_template 使用模板自动创建Doris表, 会根据上游数据类型和schema类型创建相应的建表语句, 默认模板可以根据情况进行修改。 默认模板: ```sql CREATE TABLE IF NOT EXISTS `${database}`.`${table_name}` ( ${rowtype_primary_key}, ${rowtype_fields} ) ENGINE=OLAP UNIQUE KEY (${rowtype_primary_key}) COMMENT '${comment}' DISTRIBUTED BY HASH (${rowtype_primary_key}) PROPERTIES ( "replication_allocation" = "tag.location.default: 1", "in_memory" = "false", "storage_format" = "V2", "disable_auto_compaction" = "false" ) ``` 如果模板中填写了自定义字段,例如添加 id 字段 ```sql CREATE TABLE IF NOT EXISTS `${database}`.`${table_name}` ( id, ${rowtype_fields} ) ENGINE = OLAP UNIQUE KEY (${rowtype_primary_key}) COMMENT '${comment}' DISTRIBUTED BY HASH (${rowtype_primary_key}) PROPERTIES ( "replication_num" = "1" ); ``` 连接器会自动从上游获取对应类型完成填充, 
并从"rowtype_fields"中删除 id 字段。 该方法可用于自定义字段类型和属性的修改。 可以使用以下占位符: - database:用于获取上游schema中的数据库。 - table_name:用于获取上游schema中的表名。 - rowtype_fields:用于获取上游schema中的所有字段,自动映射到Doris的字段描述。 - rowtype_primary_key:用于获取上游模式中的主键(可能是列表)。 - rowtype_unique_key:用于获取上游模式中的唯一键(可能是列表)。 - comment:用于获取上游模式中的表注释。 ## 数据类型映射 | Doris 数据类型 | SeaTunnel 数据类型 | |----------------|-----------------------------------------| | BOOLEAN | BOOLEAN | | TINYINT | TINYINT | | SMALLINT | SMALLINT
    TINYINT | | INT | INT
    SMALLINT
    TINYINT | | BIGINT | BIGINT
    INT
    SMALLINT
    TINYINT | | LARGEINT | BIGINT
    INT
    SMALLINT
    TINYINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE
    FLOAT | | DECIMAL | DECIMAL
    DOUBLE
    FLOAT | | DATE | DATE | | DATETIME | TIMESTAMP | | CHAR | STRING | | VARCHAR | STRING | | STRING | STRING | | ARRAY | ARRAY | | MAP | MAP | | JSON | STRING | | HLL | 尚不支持 | | BITMAP | 尚不支持 | | QUANTILE_STATE | 尚不支持 | | STRUCT | 尚不支持 | #### 支持的导入数据格式 支持的格式包括 CSV 和 JSON。 ## 调优指南 适当增加`sink.buffer-size`和`doris.batch.size`的值可以提高写性能。 在流模式下,如果`doris.batch.size`和`checkpoint.interval`都配置为较大的值,最后到达的数据可能会有较大的延迟(延迟的时间就是检查点间隔的时间)。 这是因为最后到达的数据总量可能不会超过doris.batch.size指定的阈值。因此,在接收到数据的数据量没有超过该阈值之前只有检查点才会触发提交操作。因此,需要选择一个合适的检查点间隔。 此外,如果你通过`sink.enable-2pc=true`属性启用2pc。`sink.buffer-size`将会失去作用,只有检查点才能触发提交。 ## 任务示例 ### 简单示例 > 下面的例子描述了向Doris写入多种数据类型,用户需要在下游创建对应的表。 ```hocon env { parallelism = 1 job.mode = "BATCH" checkpoint.interval = 10000 } source { FakeSource { row.num = 10 map.size = 10 array.size = 10 bytes.length = 10 string.length = 10 schema = { fields { c_map = "map<string, array<int>>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(16, 1)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } sink { Doris { fenodes = "doris_cdc_e2e:8030" username = root password = "" database = "test" table = "e2e_table_sink" sink.label-prefix = "test-cdc" sink.enable-2pc = "true" sink.enable-delete = "true" doris.config { format = "json" read_json_by_line = "true" } } } ``` ### CDC(监听数据变更捕获)事件 > 本示例定义了一个SeaTunnel同步任务,通过FakeSource自动生成数据并发送给Doris Sink,FakeSource使用schema、score(int类型)模拟CDC数据,Doris需要创建一个名为test.e2e_table_sink的sink任务及其对应的表 。 ```hocon env { parallelism = 1 job.mode = "BATCH" checkpoint.interval = 10000 } source { FakeSource { schema = { fields { pk_id = bigint name = string score = int sex = boolean number = tinyint height = float sight = double create_time = date update_time = timestamp } } rows = [ { kind = INSERT fields = [1, "A", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] }, { kind = INSERT fields = [2, "B", 100, 
true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] }, { kind = INSERT fields = [3, "C", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] }, { kind = UPDATE_BEFORE fields = [1, "A", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] }, { kind = UPDATE_AFTER fields = [1, "A_1", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] }, { kind = DELETE fields = [2, "B", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] } ] } } sink { Doris { fenodes = "doris_cdc_e2e:8030" username = root password = "" database = "test" table = "e2e_table_sink" sink.label-prefix = "test-cdc" sink.enable-2pc = "true" sink.enable-delete = "true" doris.config { format = "json" read_json_by_line = "true" } } } ``` ### 使用JSON格式导入数据 ``` sink { Doris { fenodes = "e2e_dorisdb:8030" username = root password = "" database = "test" table = "e2e_table_sink" sink.enable-2pc = "true" sink.label-prefix = "test_json" doris.config = { format="json" read_json_by_line="true" } } } ``` ### 使用CSV格式导入数据 ``` sink { Doris { fenodes = "e2e_dorisdb:8030" username = root password = "" database = "test" table = "e2e_table_sink" sink.enable-2pc = "true" sink.label-prefix = "test_csv" doris.config = { format = "csv" column_separator = "," } } } ``` ### 大小写敏感配置 ```hocon sink { Doris { fenodes = "e2e_dorisdb:8030" username = root password = "" database = "Test_DB" # 保留原始大小写 table = "Test_Table" # 保留原始大小写 case_sensitive = true # 默认值,保留原始大小写 sink.enable-2pc = "true" sink.label-prefix = "test_case_sensitive" doris.config = { format = "json" read_json_by_line = "true" } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Druid.md ================================================ import ChangeLog from '../changelog/connector-druid.md'; # Druid > Druid 接收器连接器 ## 描述 一个使用向 Druid 发送消息的接收器插件 ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [支持多表写入](../../introduction/concepts/connector-v2-features.md) 
## 数据类型映射 | SeaTunnel 数据类型 | Druid 数据类型 | |----------------|-----------------| | TINYINT | LONG | | SMALLINT | LONG | | INT | LONG | | BIGINT | LONG | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DOUBLE | | STRING | STRING | | BOOLEAN | STRING | | TIMESTAMP | STRING | ## 选项 | 名称 | 类型 | 必需 | 默认值 | |----------------|--------|----|---------------| | coordinatorUrl | string | 是 | - | | datasource | string | 是 | - | | batchSize | int | 否 | 10000 | | common-options | | 否 | - | ### coordinatorUrl [string] Druid的协调器URL主机和端口,示例: "myHost:8888" ### datasource [string] 要写入的数据源名称,示例: "seatunnel" ### batchSize [int] 每批刷新到 Druid 的行数。默认值为 `10000`. ### common options Sink插件常用参数,详见 [Sink Common Options](../common-options/sink-common-options.md) ## 示例 简单的例子: ```hocon sink { Druid { coordinatorUrl = "testHost:8888" datasource = "seatunnel" } } ``` 使用占位符获取上游表元数据示例: ```hocon sink { Druid { coordinatorUrl = "testHost:8888" datasource = "${table_name}_test" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/DuckDB.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # DuckDB > JDBC DuckDB Sink 连接器 ## 支持 DuckDB 版本 - 0.8.x/0.9.x/0.10.x/1.x ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 通过 jdbc 写入数据。支持批处理模式和流处理模式,支持并发写入,支持精确一次语义(使用 XA 事务保证)。 ## 需要的依赖项 ### 对于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/org.duckdb/duckdb_jdbc) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 ### 对于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/org.duckdb/duckdb_jdbc) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 ## 主要功能 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [CDC](../../introduction/concepts/connector-v2-features.md) > 使用 `Xa 事务` 来确保 `精确一次`。因此只支持支持 `Xa 事务` 的数据库的 `精确一次`。您可以设置 `is_exactly_once=true` 来启用它。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动器 | 网址 | Maven下载链接 | |--------|--------------------|-------------------------|----------------------------------|-----------------------------------------------------------------| | DuckDB | 不同的依赖版本具有不同的驱动程序类。 | org.duckdb.DuckDBDriver | jdbc:duckdb:/path/to/database.db | [下载](https://mvnrepository.com/artifact/org.duckdb/duckdb_jdbc) | ## 数据类型映射 | SeaTunnel 数据类型 | DuckDB 数据类型 | |---------------------------------|----------------| | BOOLEAN | BOOLEAN | | TINYINT
    SMALLINT
    INT | INTEGER | | BIGINT | BIGINT | | DECIMAL(x,y)(获取指定列的指定列大小.<38) | DECIMAL(x,y) | | DECIMAL(x,y)(获取指定列的指定列大小.>38) | DECIMAL(38,18) | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | STRING | VARCHAR | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | BYTES
    ARRAY
    ROW
    MAP | BLOB | ## Sink 选项 | 名称 | 类型 | 是否必需 | 默认值 | 描述 | |------------------------------|---------|------|------------------------------|---------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC 连接的 URL。参考案例:jdbc:duckdb:/path/to/database.db | | driver | String | 是 | - | 用于连接到远程数据源的 jdbc 类名,
    如果您使用 DuckDB,值为 `org.duckdb.DuckDBDriver`。 | | username | String | 否 | - | 连接实例用户名 | | password | String | 否 | - | 连接实例密码 | | query | String | 否 | - | 使用此 sql 将上游输入数据写入数据库。例如 `INSERT ...`,`query` 具有更高的优先级 | | database | String | 否 | main | 使用此 `database` 和 `table-name` 自动生成 sql 并接收上游输入数据写入数据库。
    此选项与 `query` 互斥且具有更高的优先级。 | | table | String | 否 | - | 使用数据库和此表名自动生成 sql 并接收上游输入数据写入数据库。
    此选项与 `query` 互斥且具有更高的优先级。 | | primary_keys | Array | 否 | - | 此选项用于在自动生成 sql 时支持 `insert`、`delete` 和 `update` 等操作。 | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(以秒为单位)。 | | max_retries | Int | 否 | 0 | 提交失败(executeBatch)的重试次数 | | batch_size | Int | 否 | 1000 | 对于批量写入,当缓冲记录数达到 `batch_size` 数量或时间达到 `checkpoint.interval`
    时,数据将被刷新到数据库中 | | is_exactly_once | Boolean | 否 | false | 是否启用精确一次语义,将使用 Xa 事务。如果开启,您需要
    设置 `xa_data_source_class_name`。 | | generate_sink_sql | Boolean | 否 | false | 根据您要写入的数据库表生成 sql 语句 | | xa_data_source_class_name | String | 否 | - | 数据库驱动程序的 xa 数据源类名,例如,DuckDB 是 `org.duckdb.DuckDBXADataSource`,
    其他数据源请参考附录 | | max_commit_attempts | Int | 否 | 3 | 事务提交失败的重试次数 | | transaction_timeout_sec | Int | 否 | -1 | 事务打开后的超时时间,默认为 -1(永不超时)。请注意,设置超时可能会影响
    精确一次语义 | | auto_commit | Boolean | 否 | true | 默认启用自动事务提交 | | field_ide | String | 否 | - | 标识从源同步到接收器时字段是否需要转换。`ORIGINAL` 表示不需要转换;`UPPERCASE` 表示转换为大写;`LOWERCASE` 表示转换为小写。 | | properties | Map | 否 | - | 附加连接配置参数,当 properties 和 URL 具有相同参数时,优先级由
    驱动程序的具体实现确定。例如,在 DuckDB 中,properties 优先于 URL。 | | common-options | | 否 | - | Sink 插件通用参数,详情请参考 [Sink Common Options](../sink-common-options.md) | | schema_save_mode | Enum | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | 在同步任务开启之前,针对目标端已有的表结构选择不同的处理方案。 | | data_save_mode | Enum | 否 | APPEND_DATA | 在同步任务开启之前,针对目标端已有数据选择不同的处理方案。 | | custom_sql | String | 否 | - | 当 data_save_mode 选择 CUSTOM_PROCESSING 时,应填写 CUSTOM_SQL 参数。此参数通常填写可执行的 SQL。SQL 将在同步任务之前执行。 | | enable_upsert | Boolean | 否 | true | 通过 primary_keys 存在启用 upsert,如果任务只有 `insert`,将此参数设置为 `false` 可以加快数据导入速度 | ### 提示 > 如果未设置 partition_column,它将以单一并发运行,如果设置了 partition_column,它将根据任务的并发度并行执行。 ## 任务示例 ### 简单 ``` env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 row_num = 1000 schema = { fields { id = "int" name = "string" age = "int" email = "string" } } } } sink { Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" table = "sink_table" username = "duckdb" password = "" } } ``` ### CDC(变更数据捕获)事件 ``` env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MySQL-CDC { base-url = "jdbc:mysql://localhost:3306/test" username = "root" password = "123456" table-names = ["test.user"] } } sink { Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" table = "sink_table" username = "duckdb" password = "" generate_sink_sql = true # 您需要同时配置 database 和 table database = main table = "sink_table" primary_keys = ["id"] } } ``` ### 精确一次 ``` env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 row_num = 1000 schema = { fields { id = "int" name = "string" age = "int" email = "string" } } } } sink { Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" table = "sink_table" username = "" password = "" is_exactly_once = "true" xa_data_source_class_name = "org.duckdb.DuckDBXADataSource" } } ``` ## Changelog ================================================ FILE: docs/zh/connectors/sink/Easysearch.md 
================================================ import ChangeLog from '../changelog/connector-easysearch.md'; # INFINI Easysearch ## 支持以下引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 一个将数据发送到 `INFINI Easysearch` 的接收器插件. ## 使用依赖 > 依赖 [easysearch-client](https://central.sonatype.com/artifact/com.infinilabs/easysearch-client) > ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) :::tip 支持的引擎 * 支持 [INFINI Easysearch](https://www.infini.com/download/?product=easysearch) 发布的所有版本. ::: ## 数据类型映射 | Easysearch 数据类型 | SeaTunnel 数据类型 | |-----------------------------|----------------------| | STRING
    KEYWORD
    TEXT | STRING | | BOOLEAN | BOOLEAN | | BYTE | BYTE | | SHORT | SHORT | | INTEGER | INT | | LONG | LONG | | FLOAT
    HALF_FLOAT | FLOAT | | DOUBLE | DOUBLE | | Date | LOCAL_DATE_TIME_TYPE | ## 接收器选项 | 名称 | 类型 | 必需 | 默认值 | |------------------------|---------|----|---------------| | hosts | array | 是 | - | | index | string | 是 | - | | primary_keys | list | 否 | | | key_delimiter | string | 否 | `_` | | username | string | 否 | | | password | string | 否 | | | max_retry_count | int | 否 | 3 | | max_batch_size | int | 否 | 10 | | tls_verify_certificate | boolean | 否 | true | | tls_verify_hostname | boolean | 否 | true | | tls_keystore_path | string | 否 | - | | tls_keystore_password | string | 否 | - | | tls_truststore_path | string | 否 | - | | tls_truststore_password | string | 否 | - | | schema_save_mode | enum | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | | data_save_mode | enum | 否 | APPEND_DATA | | common-options | | 否 | - | ### hosts [array] `INFINI Easysearch` 集群http地址,格式为 `host:port` , 允许指定多个主机.例如 `["host1:9200", "host2:9200"]`. ### index [string] `INFINI Easysearch` `index` 名称.索引支持包含字段名变量,例如 `seatunnel_${age}`,该字段必须出现在seatunnel行. 如果没有,我们将把它当作一个正常的索引. ### primary_keys [list] 用于生成文档 `_id`的主键字段,这是cdc必需的选项. ### key_delimiter [string] 复合键的分隔符 (默认为"_" ), 例如, "$" 将导致文档 `_id` "KEY1$KEY2$KEY3". ### username [string] 安全用户名 ### password [string] 安全密码 ### max_retry_count [int] 一个批量请求的最大尝试大小 ### max_batch_size [int] 批量文档最大大小 ### tls_verify_certificate [boolean] 为HTTPS端点启用证书验证 ### tls_verify_hostname [boolean] 为HTTPS端点启用主机名验证 ### tls_keystore_path [string] PEM或JKS密钥存储的路径。运行SeaTunnel的操作系统用户必须能够读取此文件. ### tls_keystore_password [string] 指定密钥存储的密钥密码 ### tls_truststore_path [string] PEM或JKS信任存储的路径。运行SeaTunnel的操作系统用户必须能够读取此文件. 
### tls_truststore_password [string] 指定的信任存储的密钥密码 ### schema_save_mode [enum] 在启动同步任务之前,针对目标侧已有的表结构选择不同的处理方案: - `RECREATE_SCHEMA`:当表不存在时会创建,当表已存在时会删除并重建 - `CREATE_SCHEMA_WHEN_NOT_EXIST`:当表不存在时会创建,当表已存在时则跳过创建 - `ERROR_WHEN_SCHEMA_NOT_EXIST`:当表不存在时将抛出错误 - `IGNORE`:忽略对表的处理 ### data_save_mode [enum] 在启动同步任务之前,针对目标端已有的数据选择不同的处理方案: - `DROP_DATA`:保留数据库结构并删除数据 - `APPEND_DATA`:保留数据库结构,保留数据 - `ERROR_WHEN_DATA_EXISTS`:有数据时报错 ### common options 接收器插件常用参数,详见 [Sink Common Options](../common-options/sink-common-options.md) ## 示例 简单的例子 ```bash sink { Easysearch { hosts = ["localhost:9200"] index = "seatunnel-${age}" } } ``` CDC(变更数据捕获) 事件 ```bash sink { Easysearch { hosts = ["localhost:9200"] index = "seatunnel-${age}" # cdc required options primary_keys = ["key1", "key2", ...] } } ``` SSL (禁用证书验证) ```hocon sink { Easysearch { hosts = ["https://localhost:9200"] username = "admin" password = "admin" tls_verify_certificate = false } } ``` SSL (禁用主机名验证) ```hocon sink { Easysearch { hosts = ["https://localhost:9200"] username = "admin" password = "admin" tls_verify_hostname = false } } ``` SSL (启用证书验证) ```hocon sink { Easysearch { hosts = ["https://localhost:9200"] username = "admin" password = "admin" tls_keystore_path = "${your Easysearch home}/config/certs/http.p12" tls_keystore_password = "${your password}" } } ``` 配置表生成策略 ```hocon sink { Easysearch { hosts = ["https://localhost:9200"] username = "admin" password = "admin" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Elasticsearch.md ================================================ import ChangeLog from '../changelog/connector-elasticsearch.md'; # Elasticsearch ## 描述 输出数据到 `Elasticsearch` ## 主要特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) :::tip 引擎支持 * 支持 `ElasticSearch 版本 >= 2.x 并且 <= 8.x` ::: ## 选项 | 名称 | 类型 | 
是否必须 | 默认值 | |------------------------|---------|------|------------------------------| | hosts | array | 是 | - | | index | string | 是 | - | | schema_save_mode | string | 是 | CREATE_SCHEMA_WHEN_NOT_EXIST | | data_save_mode | string | 是 | APPEND_DATA | | index_type | string | 否 | | | primary_keys | list | 否 | | | key_delimiter | string | 否 | `_` | | username | string | 否 | | | password | string | 否 | | | max_retry_count | int | 否 | 3 | | max_batch_size | int | 否 | 10 | | tls_verify_certificate | boolean | 否 | true | | tls_verify_hostname | boolean | 否 | true | | tls_keystore_path | string | 否 | - | | tls_keystore_password | string | 否 | - | | tls_truststore_path | string | 否 | - | | tls_truststore_password | string | 否 | - | | common-options | | 否 | - | | vectorization_fields | array | 否 | - | | vector_dimensions | int | 否 | - | ### hosts [array] `Elasticsearch` 集群http地址,格式为 `host:port` ,允许指定多个主机。例如 `["host1:9200", "host2:9200"]` ### index [string] `Elasticsearch` 的 `index` 名称。索引支持包含字段名变量,例如 `seatunnel_${age}`(需要配置schema_save_mode="IGNORE"),并且该字段必须出现在 seatunnel Row 中。如果没有,我们将把它视为普通索引 ### index_type [string] `Elasticsearch` 索引类型,elasticsearch 6及以上版本建议不要指定 ### primary_keys [list] 主键字段用于生成文档 `_id` ,这是 CDC 必需的选项。 ### key_delimiter [string] 设定复合键的分隔符(默认为 `_`),例如,如果使用 `$` 作为分隔符,那么文档的 `_id` 将呈现为 `KEY1$KEY2$KEY3` 的格式 ### username [string] x-pack 用户名 ### password [string] x-pack 密码 ### max_retry_count [int] 批次批量请求最大尝试大小 ### vectorization_fields [array] 需要向量转换的字段名,Elasticsearch 7.3及以后的版本支持 ### vector_dimensions [int] 向量维度,Elasticsearch 7.3及以后的版本支持 ### max_batch_size [int] 批次批量文档最大大小 ### tls_verify_certificate [boolean] 为 HTTPS 端点启用证书验证 ### tls_verify_hostname [boolean] 为 HTTPS 端点启用主机名验证 ### tls_keystore_path [string] 指向 PEM 或 JKS 密钥存储的路径。运行 SeaTunnel 的操作系统用户必须能够读取此文件 ### tls_keystore_password [string] 指定的密钥存储的密钥密码 ### tls_truststore_path [string] 指向 PEM 或 JKS 信任存储的路径。运行 SeaTunnel 的操作系统用户必须能够读取此文件 ### tls_truststore_password [string] 指定的信任存储的密钥密码 ### common options 
Sink插件常用参数,请参考 [Sink常用选项](../common-options/sink-common-options.md) 了解详情 ### schema_save_mode 在启动同步任务之前,针对目标侧已有的表结构选择不同的处理方案
    选项介绍:
    `RECREATE_SCHEMA` :当表不存在时会创建,当表已存在时会删除并重建
    `CREATE_SCHEMA_WHEN_NOT_EXIST` :当表不存在时会创建,当表已存在时则跳过创建
    `ERROR_WHEN_SCHEMA_NOT_EXIST` :当表不存在时将抛出错误
    `IGNORE` :忽略对表的处理
    ### data_save_mode 在启动同步任务之前,针对目标侧已存在的数据选择不同的处理方案
    选项介绍:
    `DROP_DATA`:保留数据库结构,删除数据
    `APPEND_DATA`:保留数据库结构,保留数据
    `ERROR_WHEN_DATA_EXISTS`:当有数据时抛出错误
    ## 示例 简单示例 ```conf sink { Elasticsearch { hosts = ["localhost:9200"] index = "seatunnel-${age}" schema_save_mode="IGNORE" } } ``` 多表写入 ```conf sink { Elasticsearch { hosts = ["localhost:9200"] index = "${table_name}" schema_save_mode="IGNORE" } } ``` 向量转换(vector data) ```conf sink { Elasticsearch { hosts = ["localhost:9200"] index = "${table_name}" schema_save_mode="IGNORE" vectorization_fields = ["review_embedding"] vector_dimensions = 1024 } } ``` 变更数据捕获 (Change data capture) 事件 ```conf sink { Elasticsearch { hosts = ["localhost:9200"] index = "seatunnel-${age}" schema_save_mode="IGNORE" # CDC required options primary_keys = ["key1", "key2", ...] } } ``` 变更数据捕获 (Change data capture) 事件多表写入 ```conf sink { Elasticsearch { hosts = ["localhost:9200"] index = "${table_name}" schema_save_mode="IGNORE" primary_keys = ["${primary_key}"] } } ``` SSL 禁用证书验证 ```hocon sink { Elasticsearch { hosts = ["https://localhost:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false } } ``` SSL 禁用主机名验证 ```hocon sink { Elasticsearch { hosts = ["https://localhost:9200"] username = "elastic" password = "elasticsearch" tls_verify_hostname = false } } ``` SSL 启用证书验证 通过设置 `tls_keystore_path` 与 `tls_keystore_password` 指定证书路径及密码 ```hocon sink { Elasticsearch { hosts = ["https://localhost:9200"] username = "elastic" password = "elasticsearch" tls_keystore_path = "${your elasticsearch home}/config/certs/http.p12" tls_keystore_password = "${your password}" } } ``` 配置表生成策略 通过设置 `schema_save_mode` 配置为 `CREATE_SCHEMA_WHEN_NOT_EXIST` 来支持不存在表时创建表 ```hocon sink { Elasticsearch { hosts = ["https://localhost:9200"] username = "elastic" password = "elasticsearch" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ## 模式演变 CDC采集支持有限数量的模式更改。目前支持的模式更改包括: * 添加列。 ### 模式演变 ```hocon env { # You can set engine configuration here parallelism = 5 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000
read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { Elasticsearch { hosts = ["https://elasticsearch:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false tls_verify_hostname = false index = "schema_change_index" index_type = "_doc" "schema_save_mode"="CREATE_SCHEMA_WHEN_NOT_EXIST" "data_save_mode"="APPEND_DATA" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Email.md ================================================ import ChangeLog from '../changelog/connector-email.md'; # Email > Email 数据接收器 ## 描述 将接收的数据作为文件发送到电子邮件 ## 支持版本 测试版本:1.5.6(供参考) ## 主要特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 是否必须 | 默认值 | |--------------------------|---------|------|-----| | email_from_address | string | 是 | - | | email_to_address | string | 是 | - | | email_host | string | 是 | - | | email_transport_protocol | string | 是 | - | | email_smtp_auth | boolean | 是 | - | | email_smtp_port | int | 否 | 465 | | email_authorization_code | string | 否 | - | | email_message_headline | string | 是 | - | | email_message_content | string | 是 | - | | email_attachment_name | string | 否 | emailsink.csv | | email_field_delimiter | string | 否 | , | | common-options | | 否 | - | ### email_from_address [string] 发件人邮箱地址 ### email_to_address [string] 接收邮件的地址,支持多个邮箱地址,以逗号(,)分隔。 ### email_host [string] 连接的SMTP服务器地址 ### email_transport_protocol [string] 加载会话的协议 ### email_smtp_auth [boolean] 是否对客户进行认证 ### email_smtp_port [int] 选择用于身份验证的端口。 ### email_authorization_code [string] 授权码,您可以从邮箱设置中获取授权码 ### email_message_headline [string] 邮件的标题 ### email_message_content [string] 邮件消息的正文 ### email_attachment_name [string] 邮件附件的文件名。默认为 `emailsink.csv`。 ### email_field_delimiter [string] 附件文件中用于分隔字段的分隔符。默认为逗号 `,`。 ### 
common options Sink插件常用参数,请参考 [Sink常用选项](../common-options/sink-common-options.md) 了解详情. ## 示例 ```bash EmailSink { email_from_address = "xxxxxx@qq.com" email_to_address = "xxxxxx@163.com" email_host="smtp.qq.com" email_transport_protocol="smtp" email_smtp_auth="true" email_authorization_code="" email_message_headline="" email_message_content="" email_attachment_name="report.csv" # 可选,默认为 emailsink.csv email_field_delimiter="|" # 可选,默认为 , } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Enterprise-WeChat.md ================================================ import ChangeLog from '../changelog/connector-http-wechat.md'; # Enterprise WeChat > Enterprise WeChat 接收器连接器 ## 描述 一个使用 Enterprise WeChat 机器人发送消息的接收插件 > 例如,如果来自上游的数据是 [`"alarmStatus": "firing", "alarmTime": "2022-08-03 01:38:49","alarmContent": "The disk usage exceeds the threshold"`], 微信机器人的输出内容如下: > > ``` > alarmStatus: firing > alarmTime: 2022-08-03 01:38:49 > alarmContent: The disk usage exceeds the threshold > ``` > > **小贴士: WeChat 接收器仅支持 `string` 类型 webhook ,源数据将被视为webhook中的正文内容.** ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必需 | 默认值 | |-----------------------|--------|----|---------------| | url | String | 是 | - | | mentioned_list | array | 否 | - | | mentioned_mobile_list | array | 否 | - | | common-options | | 否 | - | ### url [string] 企业微信网络挂钩 url 格式为 https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=XXXXXX(string) ### mentioned_list [array] 一个用户标识列表,用于提醒组中的指定成员(@A成员),@all意味着提醒每个人。如果开发人员无法获得用户ID,他可以使用 `mentioned_mobile_list` ### mentioned_mobile_list [array] 手机号码列表,提醒群组成员对应的手机号码(@a成员),@all表示提醒大家 ### common options 接收器插件常用参数,详见 [Sink Common Options](../common-options/sink-common-options.md) ## 示例 简单的例子: ```hocon WeChat { url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=693axxx6-7aoc-4bc4-97a0-0ec2sifa5aaa" } ``` ```hocon WeChat { url =
"https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=693axxx6-7aoc-4bc4-97a0-0ec2sifa5aaa" mentioned_list=["wangqing","@all"] mentioned_mobile_list=["13800001111","@all"] } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Feishu.md ================================================ import ChangeLog from '../changelog/connector-http-feishu.md'; # 飞书 > 飞书 数据接收器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [变更数据捕获](../../introduction/concepts/connector-v2-features.md) ## 描述 用于通过数据调用飞书的web hooks。 > 例如,如果来自上游的数据是 [`年龄: 12, 姓名: tyrantlucifer`],则 body 内容如下:`{"年龄": 12, "姓名": "tyrantlucifer"}` **提示:飞书接收器仅支持 `post json`类型的web hook,并且源数据将被视为web hook的正文内容。** ## 数据类型映射 | SeaTunnel 数据类型 | 飞书数据类型 | |-----------------------------|------------| | ROW
    MAP | Json | | NULL | null | | BOOLEAN | boolean | | TINYINT | byte | | SMALLINT | short | | INT | int | | BIGINT | long | | FLOAT | float | | DOUBLE | double | | DECIMAL | BigDecimal | | BYTES | byte[] | | STRING | String | | TIME
    TIMESTAMP
    TIME | String | | ARRAY | JsonArray | ## 接收器选项 | 名称 | 类型 | 是否必需 | 默认值 | 描述 | |----------------|--------|------|-----|------------------------------------------------------------| | url | String | 是 | - | 飞书web hook URL | | headers | Map | 否 | - | HTTP 请求头 | | common-options | | 否 | - | 接收器插件常见参数,请参阅 [接收器通用选项](../common-options/sink-common-options.md) 以获取详细信息 | ## 任务示例 ### 简单示例 ```hocon Feishu { url = "https://www.feishu.cn/flow/api/trigger-webhook/108bb8f208d9b2378c8c7aedad715c19" } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Fluss.md ================================================ import ChangeLog from '../changelog/connector-fluss.md'; # Fluss > Fluss 数据接收器 ## 引擎支持 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [ ] [精准一次](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [支持多表写入](../../introduction/concepts/connector-v2-features.md) ## 描述 该接收器用于将数据写入到Fluss中。支持批和流两种模式。 ## 依赖 com.alibaba.fluss fluss-client 0.7.0 ## 接收器选项 | 名称 | 类型 | 是否必须 | 默认值 | Description | |-------------------|--------|------|-----|----------------------------------------------------------------------------------| | bootstrap.servers | string | yes | - | fluss 集群地址 | | database | string | no | - | 指定目标 Fluss 表所在的数据库的名称, 如果没有设置该值,则表名与上游库名相同 | | table | string | no | - | 指定目标 Fluss 表的名称, 如果没有设置该值,则表名与上游表名相同 | | client.config | Map | no | - | 设置其他客户端配置. 参考 https://fluss.apache.org/docs/engine-flink/options/#other-options | ### database [string] database选项参数可以填入一任意库名,这个名字最终会被用作目标表的库名,并且支持变量(`${database_name}`,`${schema_name}`)。 替换规则如下:`${schema_name}` 将替换传递给目标端的 SCHEMA 名称,`${database_name}` 将替换传递给目标端的库名。 例如: 1. test_${schema_name}_test 2. sink_sinkdb 3. ss_${database_name} ### table [string] table选项参数可以填入一任意表名,这个名字最终会被用作目标表的表名,并且支持变量(`${table_name}`,`${schema_name}`)。 替换规则如下:`${schema_name}` 将替换传递给目标端的 SCHEMA 名称,`${table_name}` 将替换传递给目标端的表名。 例如: 1. test_${schema_name}_test 2. sink_sinktable 3. 
ss_${table_name} ## 数据类型映射 | FLuss数据类型 | SeaTunnel数据类型 | |--------------|---------------| | BOOLEAN | BOOLEAN | | TINYINT | TINYINT | | SMALLINT | SMALLINT | | INT | INT | | BIGINT | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DOUBLE | DOUBLE | | BYTES | BYTES | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | TIMESTAMP_TZ | TIMESTAMP_TZ | | STRING | STRING | ## 任务示例 ### 简单示例 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { parallelism = 1 tables_configs = [ { row.num = 7 schema { table = "test.table1" fields { fbytes = bytes fboolean = boolean fint = int ftinyint = tinyint fsmallint = smallint fbigint = bigint ffloat = float fdouble = double fdecimal = "decimal(30, 8)" fstring = string fdate = date ftime = time ftimestamp = timestamp ftimestamp_ltz = timestamp_tz } } rows = [ { kind = INSERT fields = ["bWlJWmo=", true, 1940337748, 73, 17489, 7408919466156976747, 9.434991E37, 3.140411637757371E307, 4029933791018936061944.80602290, "aaaaa", "2025-01-03", "02:30:10", "2025-05-27T21:56:09", "2025-09-28T02:54:08+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 90650390, 37, 22504, 5851888708829345169, 2.6221706E36, 1.8915341983748786E307, 3093109630614622831876.71725344, "bbbbb", "2025-01-01", "21:22:44", "2025-05-08T05:26:18", "2025-08-04T16:49:45+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = DELETE fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 
2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_BEFORE fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_AFTER fields = ["bWlJWmo=", true, 388742243, 89, 15831, 159071788675312856, 7.310445E37, 1.2166972324288247E308, 7994947075691901110245.55960937, "ddddd", "2025-01-04", "15:28:07", "2025-07-18T08:59:49", "2025-09-12T23:46:25+08:00"] } ] } ] } } transform { } sink { Fluss { bootstrap.servers="fluss_coordinator_e2e:9123" database = "fluss_db_${database_name}" table = "fluss_tb_${table_name}" } } ``` ### 多表写入 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { parallelism = 1 tables_configs = [ { row.num = 7 schema { table = "test2.table1" fields { fbytes = bytes fboolean = boolean fint = int ftinyint = tinyint fsmallint = smallint fbigint = bigint ffloat = float fdouble = double fdecimal = "decimal(30, 8)" fstring = string fdate = date ftime = time ftimestamp = timestamp ftimestamp_ltz = timestamp_tz } } rows = [ { kind = INSERT fields = ["bWlJWmo=", true, 1940337748, 73, 17489, 7408919466156976747, 9.434991E37, 3.140411637757371E307, 4029933791018936061944.80602290, "aaaaa", "2025-01-03", "02:30:10", "2025-05-27T21:56:09", "2025-09-28T02:54:08+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 90650390, 37, 22504, 5851888708829345169, 2.6221706E36, 1.8915341983748786E307, 3093109630614622831876.71725344, "bbbbb", "2025-01-01", "21:22:44", "2025-05-08T05:26:18", "2025-08-04T16:49:45+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 
5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = DELETE fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_BEFORE fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_AFTER fields = ["bWlJWmo=", true, 388742243, 89, 15831, 159071788675312856, 7.310445E37, 1.2166972324288247E308, 7994947075691901110245.55960937, "ddddd", "2025-01-04", "15:28:07", "2025-07-18T08:59:49", "2025-09-12T23:46:25+08:00"] } ] }, { row.num = 7 schema { table = "test2.table2" fields { fbytes = bytes fboolean = boolean fint = int ftinyint = tinyint fsmallint = smallint fbigint = bigint ffloat = float fdouble = double fdecimal = "decimal(30, 8)" fstring = string fdate = date ftime = time ftimestamp = timestamp ftimestamp_ltz = timestamp_tz } } rows = [ { kind = INSERT fields = ["bWlJWmo=", true, 1940337748, 73, 17489, 7408919466156976747, 9.434991E37, 3.140411637757371E307, 4029933791018936061944.80602290, "aaaaa", "2025-01-03", "02:30:10", "2025-05-27T21:56:09", "2025-09-28T02:54:08+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 90650390, 37, 22504, 5851888708829345169, 2.6221706E36, 1.8915341983748786E307, 3093109630614622831876.71725344, "bbbbb", "2025-01-01", "21:22:44", "2025-05-08T05:26:18", "2025-08-04T16:49:45+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 2146418323, 
79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = DELETE fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_BEFORE fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_AFTER fields = ["bWlJWmo=", true, 388742243, 89, 15831, 159071788675312856, 7.310445E37, 1.2166972324288247E308, 7994947075691901110245.55960937, "ddddd", "2025-01-04", "15:28:07", "2025-07-18T08:59:49", "2025-09-12T23:46:25+08:00"] } ] }, { row.num = 7 schema { table = "test3.table3" fields { fbytes = bytes fboolean = boolean fint = int ftinyint = tinyint fsmallint = smallint fbigint = bigint ffloat = float fdouble = double fdecimal = "decimal(30, 8)" fstring = string fdate = date ftime = time ftimestamp = timestamp ftimestamp_ltz = timestamp_tz } } rows = [ { kind = INSERT fields = ["bWlJWmo=", true, 1940337748, 73, 17489, 7408919466156976747, 9.434991E37, 3.140411637757371E307, 4029933791018936061944.80602290, "aaaaa", "2025-01-03", "02:30:10", "2025-05-27T21:56:09", "2025-09-28T02:54:08+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 90650390, 37, 22504, 5851888708829345169, 2.6221706E36, 1.8915341983748786E307, 3093109630614622831876.71725344, "bbbbb", "2025-01-01", "21:22:44", "2025-05-08T05:26:18", 
"2025-08-04T16:49:45+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = DELETE fields = ["bWlJWmo=", true, 2146418323, 79, 19821, 6393905306944584839, 2.0462337E38, 1.4868114385836557E308, 5594947262031769994080.35717665, "ccccc", "2025-10-06", "22:10:40", "2025-03-25T01:49:14", "2025-07-03T11:52:06+08:00"] } { kind = INSERT fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_BEFORE fields = ["bWlJWmo=", true, 82794384, 27, 30339, 5826566947079347516, 2.2137477E37, 1.7737681870839753E308, 3984670873242882274814.90739768, "ddddd", "2025-09-13", "10:32:52", "2025-01-27T19:20:51", "2025-11-07T02:38:54+08:00"] } { kind = UPDATE_AFTER fields = ["bWlJWmo=", true, 388742243, 89, 15831, 159071788675312856, 7.310445E37, 1.2166972324288247E308, 7994947075691901110245.55960937, "ddddd", "2025-01-04", "15:28:07", "2025-07-18T08:59:49", "2025-09-12T23:46:25+08:00"] } ] } ] } } transform { } sink { Fluss { bootstrap.servers="fluss_coordinator_e2e:9123" database = "fluss_db_${database_name}" table = "fluss_tb_${table_name}" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/FtpFile.md ================================================ import ChangeLog from '../changelog/connector-file-ftp.md'; # FtpFile > Ftp文件数据接收器连接器 ## 描述 将数据输出到FTP。 :::提示 如果你使用Spark或Flink,为了使用这个连接器,你必须确保你的Spark或Flink集群已经集成了Hadoop。经测试的Hadoop版本是2.x版本。 如果你使用SeaTunnel引擎,在你下载并安装SeaTunnel引擎时,它会自动集成Hadoop的jar包。你可以查看${SEATUNNEL_HOME}/lib目录下的jar包来确认这一点。 ::: ## 主要特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 
使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 默认情况下,我们使用两阶段提交(2PC)来确保`精确一次` - [x] 文件格式 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary ## 选项 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |---------------------------------------|---------|------|--------------------------------------------|---------------------------------------------------------------------------| | host | string | 是 | - | | | port | int | 是 | - | | | user | string | 是 | - | | | password | string | 是 | - | | | path | string | 是 | - | | | tmp_path | string | 是 | /tmp/seatunnel | 结果文件将首先写入一个临时路径,然后使用 `mv` 命令将临时目录提交到目标目录。需要是一个FTP目录。 | | connection_mode | string | 否 | active_local | 目标FTP连接模式 | | remote_verification_enabled | boolean | 否 | true | 是否启用FTP数据通道的远程主机验证 | | custom_filename | boolean | 否 | false | 是否需要自定义文件名 | | file_name_expression | string | 否 | "${transactionId}" | 仅在 `custom_filename` 为 `true` 时使用 | | filename_time_format | string | 否 | "yyyy.MM.dd" | 仅在 `custom_filename` 为 `true` 时使用 | | file_format_type | string | 否 | "csv" | | | filename_extension | string | 否 | - | 用自定义的文件扩展名覆盖默认的文件扩展名。例如:`.xml`、`.json`、`dat`、`.customtype` | | field_delimiter | string | 否 | '\001' | 仅在 `file_format_type` 为 `text` 时使用 | | row_delimiter | string | 否 | "\n" | 仅在 `file_format_type` 为 `text`、`csv`、`json` 时使用 | | have_partition | boolean | 否 | false | 是否需要处理分区。 | | partition_by | array | 否 | - | 仅在 `have_partition` 为 `true` 时使用 | | partition_dir_expression | string | 否 | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | 仅在 `have_partition` 为 `true` 时使用 | | is_partition_field_write_in_file | boolean | 否 | false | 仅在 `have_partition` 为 `true` 时使用 | | sink_columns | array | 否 | | 当此参数为空时,所有字段都是要写入的列 | | is_enable_transaction | boolean | 否 | true | | | batch_size | int | 否 | 1000000 | | | compress_codec | string | 否 | none | | | common-options | object | 否 | - | | | max_rows_in_memory | int | 否 | - | 仅在 
`file_format_type` 为 `excel` 时使用。 | | sheet_name | string | 否 | Sheet${随机数} | 仅在 `file_format_type` 为 `excel` 时使用。 | | csv_string_quote_mode | enum | 否 | MINIMAL | 仅在 `file_format` 为 `csv` 时使用。 | | xml_root_tag | string | 否 | RECORDS | 仅在 `file_format` 为 `xml` 时使用。 | | xml_row_tag | string | 否 | RECORD | 仅在 `file_format` 为 `xml` 时使用。 | | xml_use_attr_format | boolean | 否 | - | 仅在 `file_format` 为 `xml` 时使用。 | | single_file_mode | boolean | 否 | false | 每个并行处理只会输出一个文件。当此参数开启时,`batch_size` 将不会生效。输出文件名不会有文件分块后缀。 | | create_empty_file_when_no_data | boolean | 否 | false | 当上游没有数据同步时,仍然会生成相应的数据文件。 | | parquet_avro_write_timestamp_as_int96 | boolean | 否 | false | 仅在 `file_format` 为 `parquet` 时使用。 | | parquet_avro_write_fixed_as_int96 | array | 否 | - | 仅在 `file_format` 为 `parquet` 时使用。 | | enable_header_write | boolean | 否 | false | 仅在 `file_format_type` 为 `text`、`csv` 时使用。
    `false`:不写入表头,`true`:写入表头。 | | encoding | string | 否 | "UTF-8" | 仅在 `file_format_type` 为 `json`、`text`、`csv`、`xml` 时使用。 | | schema_save_mode | string | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | 现有目录处理方法 | | data_save_mode | string | 否 | APPEND_DATA | 现有数据处理方法 | ### host [string] 目标FTP主机是必需的。 ### port [int] 目标FTP端口是必需的。 ### user [string] 目标FTP用户名是必需的。 ### password [string] 目标FTP密码是必需的。 ### path [string] 目标目录路径是必需的。 ### connection_mode [string] 目标 FTP 连接模式是必需的,默认值为主动模式,支持以下几种模式: `active_local`(本地主动模式) `passive_local`(本地被动模式) ### remote_verification_enabled [boolean] 是否启用FTP数据通道的远程主机验证。默认值为 `true`。 ### custom_filename [boolean] 是否自定义文件名 ### file_name_expression [string] 仅当 `custom_filename`为 `true`时使用。 `file_name_expression`描述了将在 `path`中创建的文件表达式。我们可以在 `file_name_expression` 中添加变量 `${now}`或 `${uuid}`,例如 `test_${uuid}_${now}` 。 `${now}` 表示当前时间,其格式可以通过指定选项 `filename_time_format`来定义。 请注意,如果 `is_enable_transaction`为 `true`,我们将自动在文件名的开头添加 `${transactionId}_`。 ### filename_time_format [string] 仅当 `custom_filename`为 `true`时才会用到。 当 `file_name_expression` 参数中的格式为 `xxxx-${now}` 时,`filename_time_format` 可以指定路径的时间格式,其默认值为 `yyyy.MM.dd` 。常用的时间格式列举如下: | **代表符号** | 描述 | | ------------ | ------------------ | | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | ### file_format_type [string] 我们支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` 请注意,最终的文件名将会以 `file_format_type` 的后缀结尾,文本文件的后缀是 `txt`。 ### field_delimiter [string] 一行数据中各列之间的分隔符。仅 `text`文件格式需要用到。 ### row_delimiter [string] 文件中各行数据之间的分隔符。仅在 `text`、`csv`、`json` 文件格式中需要用到。 ### have_partition [boolean] 你是否需要对分区进行处理。 ### partition_by [array] 仅在 `have_partition` 为 `true` 时才使用。 根据选定的字段对数据进行分区。 ### partition_dir_expression [string] 仅在 `have_partition` 为 `true` 时使用。 若指定了 `partition_by`,我们会根据分区信息生成相应的分区目录,最终文件将被放置在该分区目录中。 默认的 `partition_dir_expression` 为 `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`。其中,`k0` 是第一个分区字段,`v0` 是第一个分区字段的值。 ###
is_partition_field_write_in_file [boolean] 仅在 `have_partition` 为 `true` 时使用。 如果 `is_partition_field_write_in_file` 为 `true`,那么分区字段及其对应的值将被写入数据文件中。 例如,如果你想要写入一个 Hive 数据文件,该值(`is_partition_field_write_in_file`)应该设为 `false`。 ### sink_columns [array] 哪些列需要写入文件,默认值是从 `Transform` 或 `Source` 获取的所有列。 字段的顺序决定了实际写入文件时的顺序。 ### is_enable_transaction [boolean] 如果 `is_enable_transaction` 为 `true`,我们将确保在数据写入目标目录时不会丢失或重复。 请注意,如果 `is_enable_transaction` 为 `true`,我们将自动在文件名开头添加 `${transactionId}_`。 目前仅支持 `true`这一选项。 ### batch_size [int] 一个文件中的最大行数。对于 SeaTunnel 引擎,文件中的行数由 `batch_size` 和 `checkpoint.interval` 共同决定。如果 `checkpoint.interval` 的值足够大,sink writer 会向一个文件中写入行,直到文件中的行数超过 `batch_size`。如果 `checkpoint.interval` 较小,当新的检查点触发时,sink writer 会创建一个新文件。 ### compress_codec [string] 文件的压缩编解码器以及所支持的详细信息如下所示: - txt:`lzo` `none` - json:`lzo` `none` - csv:`lzo` `none` - orc:`lzo` `snappy` `lz4` `zlib` `none` - parquet:`lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` 提示:Excel 类型不支持任何压缩格式。 ### common options Sink 插件的通用参数,请参考[Sink通用选项](../common-options/sink-common-options.md)了解详细信息。 ### max_rows_in_memory [int] 当文件格式为Excel时,可在内存中缓存的数据项的最大数量。 ### sheet_name [string] 写入工作簿的工作表。 ### csv_string_quote_mode [string] 当文件格式为CSV时,CSV的字符串引号模式: - ALL(全部):所有字符串字段都将被加上引号。 - MINIMAL(最少):仅对包含特殊字符(如字段分隔符、引号字符或行分隔字符串中的任何字符)的字段加上引号。 - NONE(无):从不对字段加引号。当数据中出现分隔符时,打印程序会在其前面加上转义字符。如果未设置转义字符,格式验证将抛出异常。 ### xml_root_tag [string] 指定 XML 文件中根元素的标签名称。 ### xml_row_tag [string] 指定 XML 文件中数据行的标签名称。 ### xml_use_attr_format [boolean] 指定是否使用标签属性格式来处理数据。 ### parquet_avro_write_timestamp_as_int96 [boolean] 支持从时间戳写入 Parquet 格式的 INT96 类型数据,仅对 Parquet 文件有效。 ### parquet_avro_write_fixed_as_int96 [array] 支持从一个12字节的字段写入Parquet的INT96类型数据,仅对Parquet文件有效。 ### enable_header_write [boolean] 仅当文件格式类型为文本或CSV时使用。 false:不写入表头,true:写入表头。 ### encoding [string] 仅当文件格式类型为JSON、文本、CSV、XML时才使用。 要写入的文件的编码。此参数将由 `Charset.forName(encoding)` 方法进行解析。 ### schema_save_mode [string] 现有目录处理方法: -
RECREATE_SCHEMA(重新创建模式):目录不存在时创建;目录存在时,删除并重新创建。 - CREATE_SCHEMA_WHEN_NOT_EXIST(不存在时创建模式):目录不存在时创建;目录存在时跳过处理 - ERROR_WHEN_SCHEMA_NOT_EXIST(模式不存在时出错):目录不存在时报告错误。 - IGNORE(忽略):忽略对该表的处理。 ### data_save_mode [string] 现有数据处理方法: - DROP_DATA(删除数据):保留目录,删除数据文件。 - APPEND_DATA(追加数据):保留目录和数据文件。 - ERROR_WHEN_DATA_EXISTS(数据存在时报错):当存在数据文件时,报告错误。 ## 示例 对于文本文件格式的简易配置 ```bash FtpFile { host = "xxx.xxx.xxx.xxx" port = 21 user = "username" password = "password" path = "/data/ftp" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" sink_columns = ["name","age"] } ``` 对于带有 `have_partition`、`custom_filename` 和 `sink_columns` 的文本文件格式 ```bash FtpFile { host = "xxx.xxx.xxx.xxx" port = 21 user = "username" password = "password" path = "/data/ftp/seatunnel/job1" tmp_path = "/data/ftp/seatunnel/tmp" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" sink_columns = ["name","age"] filename_time_format = "yyyy.MM.dd" } ``` 当我们的数据源端是多个表,并且希望将不同的数据按照不同的表达式存储到不同的目录时,我们可以按照这种方式进行配置。 ```hocon FtpFile { host = "xxx.xxx.xxx.xxx" port = 21 user = "username" password = "password" path = "/data/ftp/seatunnel/job1/${table_name}" tmp_path = "/data/ftp/seatunnel/tmp" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" sink_columns = ["name","age"] filename_time_format = "yyyy.MM.dd" schema_save_mode=RECREATE_SCHEMA data_save_mode=DROP_DATA } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/GoogleFirestore.md ================================================ import ChangeLog from '../changelog/connector-google-firestore.md'; # 
GoogleFirestore > Google Firestore Sink 连接器 ## 描述 将数据写入 Google Firestore ## 主要特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必需 | 默认值 | |-------------|--------|------|--------| | project_id | string | 是 | - | | collection | string | 是 | - | | credentials | string | 否 | - | ### project_id [string] Google Firestore 数据库项目的唯一标识符。 ### collection [string] Google Firestore 的集合。 ### credentials [string] Google Cloud 服务账户的凭证,使用 base64 编码。如果未设置,需要检查 `GOOGLE_APPLICATION_CREDENTIALS` 环境变量是否存在。 ### 通用选项 Sink 插件通用参数,请参考 [Sink Common Options](../common-options/sink-common-options.md) 了解详情。 ## 示例 ```bash GoogleFirestore { project_id = "dummy-project-id", collection = "dummy-collection", credentials = "dummy-credentials" } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/GraphQL.md ================================================ import ChangeLog from '../changelog/connector-graphql.md'; # GraphQL > GraphQL sink 连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [ ] [[精确一次]](../../introduction/concepts/connector-v2-features.md) - [ ] [变更数据捕获](../../introduction/concepts/connector-v2-features.md) - [x] [支持多表写入](../../introduction/concepts/connector-v2-features.md) ## 描述 接收Source端传入的数据,利用数据触发 web hooks。 > 例如,来自上游的数据为 [`label: {"__name__": "test1"}, value: 1.2.3,time:2024-08-15T17:00:00`], 则body内容如下: `{"label":{"__name__": "test1"}, "value":"1.23","time":"2024-08-15T17:00:00"}` **Tips: GraphQL 数据接收器 仅支持 `post json` 类型的 web hook,source 数据将被视为 webhook 中的 body 内容。并且不支持传递过去太久的数据** ## 支持的数据源信息 想使用 GraphQL 连接器,需要安装以下必要的依赖。可以通过运行 install-plugin.sh 脚本或者从 Maven 中央仓库下载这些依赖 | 数据源 | 支持版本 | 依赖 | | ------ | --------- | ------------------------------------------------------------ | | Http | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-http) | ## 接收器选项 | Name | Type | Required | Default | Description | |-----------------------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------| | url | String | Yes | - | Http request url | | query | String | Yes | - | GraphQL query | | variables | String | No | - | GraphQL variables | | valueCover | Boolean | No | - | Whether the data overwrites the variable value | | headers | Map | No | - | Http headers | | retry | Int | No | - | The max retry times if request http return to `IOException` | | retry_backoff_multiplier_ms | Int | No | 100 | The retry-backoff times(millis) multiplier if request http failed | | retry_backoff_max_ms | Int | No | 10000 | The maximum retry-backoff times(millis) if request http failed | | connect_timeout_ms | Int | No | 12000 | Connection timeout setting, default 12s. | | socket_timeout_ms | Int | No | 60000 | Socket timeout setting, default 60s. 
| | common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](../common-options/sink-common-options.md) for details | ## 示例 简单示例: ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { tables_configs = [ { schema = { table = "graphql_sink_1" fields { id = int val_bool = boolean val_int8 = tinyint val_int16 = smallint val_int32 = int val_int64 = bigint val_float = float val_double = double val_decimal = "decimal(16, 1)" val_string = string val_unixtime_micros = timestamp } } rows = [ { kind = INSERT fields = [1, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] } ] }, { schema = { table = "graphql_sink_2" fields { id = int val_bool = boolean val_int8 = tinyint val_int16 = smallint val_int32 = int val_int64 = bigint val_float = float val_double = double val_decimal = "decimal(16, 1)" val_string = string val_unixtime_micros = timestamp } } rows = [ { kind = INSERT fields = [2, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] } ] } ] } } sink { GraphQL { url = "http://192.168.1.103:9081/v1/graphql" query = """ mutation MyMutation( $id: Int! $val_bool: Boolean! $val_int8: smallint! $val_int16: smallint! $val_int32: Int! $val_int64: bigint! $val_float: Float! $val_double: Float! $val_decimal: numeric! $val_string: String! $val_unixtime_micros: timestamp! 
) { insert_sink(objects: { id: $id, val_bool: $val_bool, val_int8: $val_int8, val_int16: $val_int16, val_int32: $val_int32, val_int64: $val_int64, val_float: $val_float, val_double: $val_double, val_decimal: $val_decimal, val_string: $val_string, val_unixtime_micros: $val_unixtime_micros }) { affected_rows returning { id val_bool val_decimal val_double val_float val_int16 val_int32 val_int64 val_int8 val_string val_unixtime_micros } } } """ variables = { "val_bool": True } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Greenplum.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Greenplum > Greenplum Sink 连接器 ## 描述 使用 [JDBC 连接器](Jdbc.md) 将数据写入 Greenplum。 ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) :::tip 不支持精确一次语义(Greenplum 数据库尚不支持 XA 事务)。 ::: ## 选项 ### driver [string] 可选的 JDBC 驱动程序: - `org.postgresql.Driver` - `com.pivotal.jdbc.GreenplumDriver` 警告:为了符合许可证要求,如果您使用 `GreenplumDriver`,则必须自己提供 Greenplum JDBC 驱动程序,例如将 greenplum-xxx.jar 复制到 $SEATUNNEL_HOME/lib(用于独立模式)。 ### url [string] JDBC 连接的 URL。如果使用 PostgreSQL 驱动程序,值为 `jdbc:postgresql://${yous_host}:${yous_port}/${yous_database}`,或者如果使用 Greenplum 驱动程序,值为 `jdbc:pivotal:greenplum://${yous_host}:${yous_port};DatabaseName=${yous_database}` ### 通用选项 Sink 插件通用参数,请参考 [Sink 通用选项](../common-options/sink-common-options.md) 详见。 ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Hbase.md ================================================ import ChangeLog from '../changelog/connector-hbase.md'; # Hbase > Hbase 数据连接器 ## 描述 将数据输出到hbase ## 主要特性 - [ ] [精准一次](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 是否必须 | 默认值 | |--------------------|---------|------|-----------------| | zookeeper_quorum | string | yes | - | | table | string | yes | - | | rowkey_column | list | yes | - | | family_name | config | yes | - | | rowkey_delimiter | string | 
no | "" | | version_column | string | no | - | | null_mode | string | no | skip | | wal_write | boolean | yes | false | | write_buffer_size | string | no | 8 * 1024 * 1024 | | encoding | string | no | utf8 | | hbase_extra_config | config | no | - | | common-options | | no | - | | ttl | long | no | - | ### zookeeper_quorum [string] hbase的zookeeper集群主机, 示例: "hadoop001:2181,hadoop002:2181,hadoop003:2181" ### table [string] 要写入的表名, 例如: "seatunnel" 如果表在自定义 namespace 下,请使用 `namespace:table` 形式(如 `ns1:seatunnel_test`);未填写 namespace 时,SeaTunnel 会写入到 HBase 默认命名空间 `default`。 ### rowkey_column [list] 行键的列名列表, 例如: ["id", "uuid"] ### family_name [config] 字段的列簇名称映射。例如,上游的行如下所示: | id | name | age | |----|---------------|-----| | 1 | tyrantlucifer | 27 | id作为行键和其他写入不同列簇的字段,可以分配 family_name { name = "info1" age = "info2" } 这表示 name 写入列簇 info1,age 写入列簇 info2 如果要将其他字段写入同一列簇,可以分配 family_name { all_columns = "info" } 这意味着所有字段都将写入该列簇 info ### rowkey_delimiter [string] 连接多行键的分隔符,默认 "" ### version_column [string] 版本列名称,您可以使用它来分配 hbase 记录的时间戳 ### null_mode [string] 写入 null 值的模式,支持 [ skip , empty], 默认 skip - skip: 当字段为 null ,连接器不会将此字段写入 hbase - empty: 当字段为null时,连接器将写入并为此字段生成空值 ### wal_write [boolean] wal log 写入标志,默认值 false ### write_buffer_size [int] hbase 客户端的写入缓冲区大小,默认 8 * 1024 * 1024 ### encoding [string] 字符串类字段的编码(STRING/DECIMAL/DATE/TIME/TIMESTAMP/ARRAY),支持 [utf8, gbk],默认 utf8 ### 数据类型 Hbase 存储字节,连接器支持: - TINYINT/SMALLINT/INT/BIGINT/FLOAT/DOUBLE/BOOLEAN/BYTES - STRING/DECIMAL/DATE/TIME/TIMESTAMP/ARRAY(使用 encoding 序列化为字符串后写入) ### hbase_extra_config [config] hbase扩展配置 ### ttl [long] hbase 写入数据 TTL 时间,默认以表设置的TTL为准,单位毫秒 ### 常见选项 Sink 插件常用参数,详见 Sink 常用选项 [Sink Common Options](../common-options/sink-common-options.md) ## 案例 ```hocon Hbase { zookeeper_quorum = "hadoop001:2181,hadoop002:2181,hadoop003:2181" table = "seatunnel_test" rowkey_column = ["name"] family_name { all_columns = seatunnel } } ``` ## Kerberos 示例 备注: - `connector-hbase` 不会解析 `krb5_path` / `kerberos_principal` / 
`kerberos_keytab_path`。 - 需要在运行环境中提前完成 Kerberos 登录并保证 `krb5.conf` 可被 JVM 访问(例如 `kinit -kt ...` 或 JVM `-Djava.security.krb5.conf=...`),同时将 HBase/Hadoop 的安全配置写入 `hbase_extra_config`。 ```hocon sink { Hbase { zookeeper_quorum = "zk1:2181,zk2:2181,zk3:2181" table = "target_table" rowkey_column = ["rowkey"] family_name { all_columns = "info" } # HBase安全配置 hbase_extra_config = { "hbase.security.authentication" = "kerberos" "hadoop.security.authentication" = "kerberos" "hbase.master.kerberos.principal" = "hbase/_HOST@REALM" "hbase.regionserver.kerberos.principal" = "hbase/_HOST@REALM" "hbase.rpc.protection" = "authentication" "hbase.zookeeper.useSasl" = "false" } } } ``` ### 写入多表 ```hocon env { # You can set engine configuration here execution.parallelism = 1 job.mode = "BATCH" } source { FakeSource { tables_configs = [ { schema = { table = "hbase_sink_1" fields { name = STRING c_string = STRING c_double = DOUBLE c_bigint = BIGINT c_float = FLOAT c_int = INT c_smallint = SMALLINT c_boolean = BOOLEAN time = BIGINT } } rows = [ { kind = INSERT fields = ["label_1", "sink_1", 4.3, 200, 2.5, 2, 5, true, 1627529632356] } ] }, { schema = { table = "hbase_sink_2" fields { name = STRING c_string = STRING c_double = DOUBLE c_bigint = BIGINT c_float = FLOAT c_int = INT c_smallint = SMALLINT c_boolean = BOOLEAN time = BIGINT } } rows = [ { kind = INSERT fields = ["label_2", "sink_2", 4.3, 200, 2.5, 2, 5, true, 1627529632357] } ] } ] } } sink { Hbase { zookeeper_quorum = "hadoop001:2181,hadoop002:2181,hadoop003:2181" table = "${table_name}" rowkey_column = ["name"] family_name { all_columns = info } } } ``` ## 写入指定列族 ```hocon Hbase { zookeeper_quorum = "hbase_e2e:2181" table = "assign_cf_table" rowkey_column = ["id"] family_name { c_double = "cf1" c_bigint = "cf2" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/HdfsFile.md ================================================ import ChangeLog from '../changelog/connector-file-hadoop.md'; # 
Hdfs文件 > Hdfs文件 数据接收器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [支持多表写入](../../introduction/concepts/connector-v2-features.md) 默认情况下,我们使用2PC提交来确保"精确一次" - [x] 文件格式类型 - [x] 文本 - [x] CSV - [x] Parquet - [x] ORC - [x] JSON - [x] Excel - [x] canal_json - [x] debezium_json - [x] maxwell_json - [x] 压缩编解码器 - [x] lzo ## 描述 将数据输出到Hdfs文件 ## 支持的数据源信息 | 数据源 | 支持的版本 | |--------|------------------| | Hdfs文件 | hadoop 2.x 和 3.x | ## 接收器选项 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |----------------------------------|---------|------|--------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | fs.defaultFS | string | 是 | - | Hadoop 集群地址。支持以下格式:
    - 标准 HDFS:`hdfs://hadoopcluster` 或 `hdfs://namenode:9000`
    - ViewFS(联邦 HDFS):`viewfs://mycluster`
    详见下方 ViewFS 配置示例。 | | path | string | 是 | - | 目标目录路径是必需的。 | | tmp_path | string | 是 | /tmp/seatunnel | 结果文件将首先写入临时路径,然后使用 `mv` 命令将临时目录提交到目标目录。需要一个Hdfs路径。 | | hdfs_site_path | string | 否 | - | `hdfs-site.xml` 的路径,用于加载 namenodes 的 ha 配置。 | | custom_filename | boolean | 否 | false | 是否需要自定义文件名 | | file_name_expression | string | 否 | "${transactionId}" | 仅在 `custom_filename` 为 `true` 时使用。`file_name_expression` 描述将创建到 `path` 中的文件表达式。我们可以在 `file_name_expression` 中添加变量 `${now}` 或 `${uuid}`,例如 `test_${uuid}_${now}`,`${now}` 表示当前时间,其格式可以通过指定选项 `filename_time_format` 来定义。请注意,如果 `is_enable_transaction` 为 `true`,我们将在文件头部自动添加 `${transactionId}_`。 | | filename_time_format | string | 否 | "yyyy.MM.dd" | 仅在 `custom_filename` 为 `true` 时使用。当 `file_name_expression` 参数中的格式为 `xxxx-${now}` 时,`filename_time_format` 可以指定路径的时间格式,默认值为 `yyyy.MM.dd`。常用的时间格式如下所示:[y:年,M:月,d:月中的一天,H:一天中的小时(0-23),m:小时中的分钟,s:分钟中的秒] | | file_format_type | string | 否 | "csv" | 我们支持以下文件类型:`text` `json` `csv` `orc` `parquet` `excel` `canal_json` `debezium_json` `maxwell_json`。请注意,最终文件名将以文件格式的后缀结束,文本文件的后缀是 `txt`。 | | filename_extension | string | 否 | - | 使用自定义的文件扩展名覆盖默认的文件扩展名。 例如:`.xml`, `.json`, `dat`, `.customtype` | | field_delimiter | string | 否 | '\001' | 仅在 file_format 为 text 时使用,数据行中列之间的分隔符。仅需要 `text` 文件格式。 | | row_delimiter | string | 否 | "\n" | 仅在 file_format 为 text 时使用,文件中行之间的分隔符。仅需要 `text`、`csv`、`json` 文件格式。 | | have_partition | boolean | 否 | false | 是否需要处理分区。 | | partition_by | array | 否 | - | 仅在 have_partition 为 true 时使用,根据选定的字段对数据进行分区。 | | partition_dir_expression | string | 否 | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | 仅在 have_partition 为 true 时使用,如果指定了 `partition_by`,我们将根据分区信息生成相应的分区目录,并将最终文件放置在分区目录中。默认 `partition_dir_expression` 为 `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`。`k0` 是第一个分区字段,`v0` 是第一个分区字段的值。 | | is_partition_field_write_in_file | boolean | 否 | false | 仅当 `have_partition` 为 `true` 时使用。如果 `is_partition_field_write_in_file` 为 `true`,则分区字段及其值将写入数据文件中。例如,如果要写入Hive数据文件,则其值应为 `false`。 | | 
sink_columns | array | 否 | | 当此参数为空时,所有字段都是接收器列。需要写入文件的列,默认值是从 `Transform` 或 `Source` 获取的所有列。字段的顺序确定了实际写入文件时的顺序。 | | is_enable_transaction | boolean | 否 | true | 如果 `is_enable_transaction` 为 true,则在将数据写入目标目录时,我们将确保数据不会丢失或重复。请注意,如果 `is_enable_transaction` 为 `true`,我们将在文件头部自动添加 `${transactionId}_`。目前仅支持 `true`。 | | batch_size | int | 否 | 1000000 | 文件中的最大行数。对于 SeaTunnel Engine,文件中的行数由 `batch_size` 和 `checkpoint.interval` 共同决定。如果 `checkpoint.interval` 的值足够大,则接收器写入器将在文件中写入行,直到文件中的行大于 `batch_size`。如果 `checkpoint.interval` 很小,则接收器写入器将在新检查点触发时创建一个新文件。 | | single_file_mode | boolean | 否 | false | 每个并行度只会输出一个文件,当此参数开启时,batch_size就不会生效。输出的文件名没有文件块后缀。 | | create_empty_file_when_no_data | boolean | 否 | false | 当上游没有数据同步时,依然生成对应的数据文件。 | | compress_codec | string | 否 | none | 文件的压缩编解码器及其支持的细节如下所示:[txt: `lzo` `none`,json: `lzo` `none`,csv: `lzo` `none`,orc: `lzo` `snappy` `lz4` `zlib` `none`,parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none`]。提示:excel类型不支持任何压缩格式。 | | krb5_path | string | 否 | /etc/krb5.conf | kerberos 的 krb5 路径 | | kerberos_principal | string | 否 | - | kerberos 的主体 | | kerberos_keytab_path | string | 否 | - | kerberos 的 keytab 路径 | | compress_codec | string | 否 | none | 压缩编解码器 | | common-options | object | 否 | - | 接收器插件通用参数,请参阅 [接收器通用选项](../common-options/sink-common-options.md) 了解详情 | | csv_string_quote_mode | enum | 否 | MINIMAL | 仅在文件格式为 CSV 时使用。 | | enable_header_write | boolean | 否 | false | 仅在 file_format_type 为 text,csv 时使用。
    false:不写入表头,true:写入表头。 | | max_rows_in_memory | int | 否 | - | 仅当 file_format 为 excel 时使用。当文件格式为 Excel 时,可以缓存在内存中的最大数据项数。 | | sheet_name | string | 否 | Sheet${Random number} | 仅当 file_format 为 excel 时使用。将工作簿的表写入指定的表名 | | remote_user | string | 否 | - | Hdfs的远端用户名。 | | schema_save_mode | string | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | 现有目录处理方式 | | data_save_mode | string | 否 | APPEND_DATA | 现有数据处理方式 | | merge_update_event | boolean | 否 | false | 仅当 file_format_type 为 canal_json、debezium_json、maxwell_json 时使用。 | ### 提示 > 如果您使用 spark/flink,为了使用此连接器,您必须确保您的 spark/flink 集群已经集成了 hadoop。测试过的 hadoop 版本是 > 2.x。如果您使用 SeaTunnel Engine,则在下载和安装 SeaTunnel Engine 时会自动集成 hadoop > jar。您可以检查 `${SEATUNNEL_HOME}/lib` 下的 jar 包来确认这一点。 ### schema_save_mode [string] 现有的目录处理方法。 - RECREATE_SCHEMA:当目录不存在时创建,当目录存在时删除并重新创建 - CREATE_SCHEMA_WHEN_NOT_EXIST:当目录不存在时创建,当目录存在时跳过 - ERROR_WHEN_SCHEMA_NOT_EXIST:当目录不存在时,将报告错误 - IGNORE:忽略对表的处理 ### data_save_mode [string] 现有的数据处理方法。 - DROP_DATA:保留目录并删除数据文件 - APPEND_DATA:保留目录,保留数据文件 - ERROR_WHEN_DATA_EXISTS:当有数据文件时,会报告错误 ### merge_update_event [boolean] 仅当file_format_type为canal_json、debezium_json、maxwell_json时使用. 
设置成true,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 会合并成 UPDATE; 设置成false,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 不会合并; ## 任务示例 ### 简单示例 > 此示例定义了一个 SeaTunnel 同步任务,通过 FakeSource 自动生成数据并将其发送到 Hdfs。 ``` # 定义运行时环境 env { parallelism = 1 job.mode = "BATCH" } source { # 这是一个示例源插件 **仅用于测试和演示功能源插件** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } # 如果您想获取有关如何配置 seatunnel 的更多信息和查看完整的源端插件列表, # 请访问 https://seatunnel.apache.org/docs/connectors/source } transform { # 如果您想获取有关如何配置 seatunnel 的更多信息和查看完整的转换插件列表, # 请访问 https://seatunnel.apache.org/docs/transforms } sink { HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" file_format_type = "orc" } # 如果您想获取有关如何配置 seatunnel 的更多信息和查看完整的接收器插件列表, # 请访问 https://seatunnel.apache.org/docs/connectors/sink } ``` ### orc 文件格式的简单配置 ``` HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" file_format_type = "orc" } ``` ### text 文件格式的配置,包括 `have_partition`、`custom_filename` 和 `sink_columns` ``` HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true } ``` ### parquet 文件格式的配置,包括 `have_partition`、`custom_filename` 和 `sink_columns` ``` HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" 
is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" file_format_type = "parquet" sink_columns = ["name","age"] is_enable_transaction = true } ``` ### enable_header_write [boolean] 仅在 file_format_type 为 text,csv 时使用。false:不写入表头,true:写入表头。 ### csv_string_quote_mode [string] 当文件格式为 CSV 时,CSV 的字符串引号模式。 - ALL:所有字符串字段都会加引号。 - MINIMAL:仅为包含特殊字符(如字段分隔符、引号字符或行分隔符字符串中的任何字符)的字段加引号。 - NONE:从不为字段加引号。当数据中包含分隔符时,输出会在前面加上转义字符。如果未设置转义字符,则格式验证会抛出异常。 ### kerberos 的简单配置 ``` HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" hdfs_site_path = "/path/to/your/hdfs_site_path" kerberos_principal = "your_principal@EXAMPLE.COM" kerberos_keytab_path = "/path/to/your/keytab/file.keytab" } ``` ### 压缩的简单配置 ``` HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" compress_codec = "lzo" } ``` ### ViewFS(联邦 HDFS)配置示例 ViewFS 允许您将多个 HDFS 集群或命名空间统一到一个逻辑命名空间中。这对于 HDFS 联邦(Federation)场景非常有用。 ``` HdfsFile { fs.defaultFS = "viewfs://mycluster" path = "/data/output" file_format_type = "parquet" hdfs_site_path = "/path/to/core-site.xml" data_save_mode = "DROP_DATA" } ``` 在 `core-site.xml` 中配置挂载表: ```xml fs.viewfs.mounttable.mycluster.link./data hdfs://namenode1:9000/data fs.viewfs.mounttable.mycluster.link./logs hdfs://namenode2:9000/logs fs.viewfs.mounttable.mycluster.link./tmp hdfs://namenode3:9000/tmp ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Hive.md ================================================ import ChangeLog from '../changelog/connector-hive.md'; # Hive > Hive Sink 连接器 ## 描述 将数据写入 Hive。 :::tip 提示 为了使用此连接器,您必须确保您的 Spark/Flink 集群已经集成了 Hive。测试过的 Hive 版本是 2.3.9 和 3.1.3。 如果您使用 SeaTunnel 引擎,您需要将 `seatunnel-hadoop3-3.1.4-uber.jar`、`hive-exec-3.1.3.jar` 和 `libfb303-0.9.3.jar` 放在 `$SEATUNNEL_HOME/lib/` 目录中。 ::: ## 关键特性 - [x] [支持多表写入](../../introduction/concepts/connector-v2-features.md) - [x] 
[精确一次](../../introduction/concepts/connector-v2-features.md) 默认情况下,我们使用 2PC 提交来确保“精确一次”。 - [x] 文件格式 - [x] 文本 - [x] CSV - [x] Parquet - [x] ORC - [x] JSON - [x] 压缩编解码器 - [x] LZO ## 选项 | 名称 | 类型 | 必需 | 默认值 | |---------------------------------------|---------|----|----------------| | table_name | string | 是 | - | | metastore_uri | string | 是 | - | | compress_codec | string | 否 | none | | hdfs_site_path | string | 否 | - | | hive_site_path | string | 否 | - | | hive.hadoop.conf | Map | 否 | - | | hive.hadoop.conf-path | string | 否 | - | | krb5_path | string | 否 | /etc/krb5.conf | | kerberos_principal | string | 否 | - | | kerberos_keytab_path | string | 否 | - | | abort_drop_partition_metadata | boolean | 否 | false | | parquet_avro_write_timestamp_as_int96 | boolean | 否 | false | | overwrite | boolean | 否 | false | | data_save_mode | enum | 否 | APPEND_DATA | | schema_save_mode | enum | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | | save_mode_create_template | string | 否 | - | | common-options | | 否 | - | ### table_name [string] 目标 Hive 表名,例如:`db1.table1`。如果源是多模式,您可以使用 `${database_name}.${table_name}` 来生成表名,它将用源生成的 CatalogTable 的值替换 `${database_name}` 和 `${table_name}`。 ### metastore_uri [string] Hive 元存储 URI。支持通过逗号分隔配置多个 URI 用于高可用/故障切换(会自动去除空格)。SeaTunnel 会将该值写入 Hive 的 `hive.metastore.uris`,并在运行时优先使用 Hive 的 `RetryingMetaStoreClient` 实现重试/切换。注意:该能力仅做客户端连接端点切换,元数据一致性需要由 metastore 部署保证。 ### hdfs_site_path [string] `hdfs-site.xml` 的路径,用于加载 Namenode 的高可用配置 ### hive_site_path [string] `hive-site.xml` 的路径 ### hive.hadoop.conf [map] Hadoop 配置中的属性(`core-site.xml`、`hdfs-site.xml`、`hive-site.xml`) ### hive.hadoop.conf-path [string] 指定加载 `core-site.xml`、`hdfs-site.xml`、`hive-site.xml` 文件的路径 ### krb5_path [string] `krb5.conf` 的路径,用于 Kerberos 认证 `hive-site.xml` 的路径,用于 Hive 元存储认证 ### kerberos_principal [string] Kerberos 的主体 ### kerberos_keytab_path [string] Kerberos 的 keytab 文件路径 ### abort_drop_partition_metadata [boolean] 在中止操作期间是否从 Hive Metastore 
中删除分区元数据的标志。注意:这只影响元存储中的元数据,分区中的数据将始终被删除(同步过程中生成的数据)。 ### parquet_avro_write_timestamp_as_int96 [boolean] 支持从时间戳写入 Parquet INT96,仅对 parquet 文件有效。 ### overwrite [boolean] 是否以覆盖写入(Overwrite)方式写入 Hive。 - 批模式(BATCH):在提交前删除目标路径中已有数据(非分区表删除表目录;分区表删除本次提交涉及的分区目录),再写入新数据。 - 流模式(STREAMING):在启用 checkpoint 的流式运行时,commit 会在每个 checkpoint 完成后触发一次。为避免每个 checkpoint 都重复删除导致数据丢失,SeaTunnel 会对每个目标目录(表目录/分区目录)最多删除一次(空提交会跳过删除)。恢复(recovery)场景下为避免误删已提交数据,删除行为为 best-effort,可能会被跳过,因此不保证严格的“全量覆盖”语义。 ### data_save_mode [enum] 在写入数据前,选择如何处理目标端已有数据: - APPEND_DATA(默认):保留既有数据并追加写入 - DROP_DATA:与 overwrite=true 等价。在提交前删除目标路径中已有数据(非分区表删除表目录;分区表删除相关分区目录),再写入新数据 - CUSTOM_PROCESSING / ERROR_WHEN_DATA_EXISTS:如无特殊需求,不建议在 Hive sink 下使用 注意:overwrite=true 与 data_save_mode=DROP_DATA 行为等价,二者择一配置即可,勿同时设置。 ### schema_save_mode [枚举] 在开始同步任务之前,针对目标端已存在的表结构选择不同的处理方案。 **默认值**: `CREATE_SCHEMA_WHEN_NOT_EXIST` 选项值: - `RECREATE_SCHEMA`: 表不存在时会创建,表存在时会删除并重建 - `CREATE_SCHEMA_WHEN_NOT_EXIST`: 表不存在时会创建,表存在时会跳过 - `ERROR_WHEN_SCHEMA_NOT_EXIST`: 表不存在时会报错 - `IGNORE`: 忽略对表的处理 ### save_mode_create_template [字符串] 我们使用模板来自动创建 Hive 表,它将根据上游数据类型和模式类型创建相应的建表语句,默认模板可以根据情况进行修改。可用的模板变量:${database}, ${table}, ${rowtype_fields}, ${rowtype_partition_fields}, ${table_location}。 **默认值**: 当未指定时,使用默认的 PARQUET 非分区表模板: ```sql CREATE TABLE IF NOT EXISTS `${database}`.`${table}` ( ${rowtype_fields} ) STORED AS PARQUET LOCATION '${table_location}' ``` ### 通用选项 Sink 插件的通用参数,请参阅 [Sink Common Options](../common-options/sink-common-options.md) 了解详细信息。 ## 示例 ```bash Hive { table_name = "default.seatunnel_orc" metastore_uri = "thrift://namenode001:9083" } ``` metastore_uri 故障切换示例(多 URI): ```bash Hive { table_name = "default.seatunnel_orc" metastore_uri = "thrift://metastore-1:9083,thrift://metastore-2:9083" } ``` ### 示例 1 我们有一个源表如下: ```bash create table test_hive_source( test_tinyint TINYINT, test_smallint SMALLINT, test_int INT, test_bigint BIGINT, test_boolean BOOLEAN, test_float FLOAT, test_double DOUBLE, test_string STRING, test_binary BINARY, 
test_timestamp TIMESTAMP, test_decimal DECIMAL(8,2), test_char CHAR(64), test_varchar VARCHAR(64), test_date DATE, test_array ARRAY, test_map MAP, test_struct STRUCT ) PARTITIONED BY (test_par1 STRING, test_par2 STRING); ``` 我们需要从源表读取数据并写入另一个表: ```bash create table test_hive_sink_text_simple( test_tinyint TINYINT, test_smallint SMALLINT, test_int INT, test_bigint BIGINT, test_boolean BOOLEAN, test_float FLOAT, test_double DOUBLE, test_string STRING, test_binary BINARY, test_timestamp TIMESTAMP, test_decimal DECIMAL(8,2), test_char CHAR(64), test_varchar VARCHAR(64), test_date DATE ) PARTITIONED BY (test_par1 STRING, test_par2 STRING); ``` 作业配置文件可以如下: ``` env { parallelism = 3 job.name="test_hive_source_to_hive" } source { Hive { table_name = "test_hive.test_hive_source" metastore_uri = "thrift://ctyun7:9083" } } sink { # 选择 stdout 输出插件将数据输出到控制台 Hive { table_name = "test_hive.test_hive_sink_text_simple" metastore_uri = "thrift://ctyun7:9083" hive.hadoop.conf = { bucket = "s3a://mybucket" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" } } ``` ### 示例 2:Kerberos ```bash sink { Hive { table_name = "default.test_hive_sink_on_hdfs_with_kerberos" metastore_uri = "thrift://metastore:9083" hive_site_path = "/tmp/hive-site.xml" kerberos_principal = "hive/metastore.seatunnel@EXAMPLE.COM" kerberos_keytab_path = "/tmp/hive.keytab" krb5_path = "/tmp/krb5.conf" } } ``` 描述: - `hive_site_path`:`hive-site.xml` 文件的路径。 - `kerberos_principal`:Kerberos 认证的主体。 - `kerberos_keytab_path`:Kerberos 认证的 keytab 文件路径。 - `krb5_path`:用于 Kerberos 认证的 `krb5.conf` 文件路径。 运行案例: ```bash env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { schema = { fields { pk_id = bigint name = string score = int } primaryKey { name = "pk_id" columnNames = [pk_id] } } rows = [ { kind = INSERT fields = [1, "A", 100] }, { kind = INSERT fields = [2, "B", 100] }, { kind = INSERT fields = [3, "C", 100] } ] } } sink { Hive { table_name = 
"default.test_hive_sink_on_hdfs_with_kerberos" metastore_uri = "thrift://metastore:9083" hive_site_path = "/tmp/hive-site.xml" kerberos_principal = "hive/metastore.seatunnel@EXAMPLE.COM" kerberos_keytab_path = "/tmp/hive.keytab" krb5_path = "/tmp/krb5.conf" } } ``` ## Hive on s3 ### 步骤 1 为 EMR 的 Hive 创建 lib 目录。 ```shell mkdir -p ${SEATUNNEL_HOME}/plugins/Hive/lib ``` ### 步骤 2 从 Maven 中心获取 jar 文件到 lib。 ```shell cd ${SEATUNNEL_HOME}/plugins/Hive/lib wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.6.5/hadoop-aws-2.6.5.jar wget https://repo1.maven.org/maven2/org/apache/hive/hive-exec/2.3.9/hive-exec-2.3.9.jar ``` ### 步骤 3 从您的 EMR 环境中复制 jar 文件到 lib 目录。 ```shell cp /usr/share/aws/emr/emrfs/lib/emrfs-hadoop-assembly-2.60.0.jar ${SEATUNNEL_HOME}/plugins/Hive/lib cp /usr/share/aws/emr/hadoop-state-pusher/lib/hadoop-common-3.3.6-amzn-1.jar ${SEATUNNEL_HOME}/plugins/Hive/lib cp /usr/share/aws/emr/hadoop-state-pusher/lib/javax.inject-1.jar ${SEATUNNEL_HOME}/plugins/Hive/lib cp /usr/share/aws/emr/hadoop-state-pusher/lib/aopalliance-1.0.jar ${SEATUNNEL_HOME}/plugins/Hive/lib ``` ### 步骤 4 运行案例。 ```shell env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { schema = { fields { pk_id = bigint name = string score = int } primaryKey { name = "pk_id" columnNames = [pk_id] } } rows = [ { kind = INSERT fields = [1, "A", 100] }, { kind = INSERT fields = [2, "B", 100] }, { kind = INSERT fields = [3, "C", 100] } ] } } sink { Hive { table_name = "test_hive.test_hive_sink_on_s3" metastore_uri = "thrift://ip-192-168-0-202.cn-north-1.compute.internal:9083" hive.hadoop.conf-path = "/home/ec2-user/hadoop-conf" hive.hadoop.conf = { bucket="s3://ws-package" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" } } } ``` ## Hive on oss ### 步骤 1 为 EMR 的 Hive 创建 lib 目录。 ```shell mkdir -p ${SEATUNNEL_HOME}/plugins/Hive/lib ``` ### 步骤 2 从 Maven 中心获取 jar 文件到 lib。 ```shell cd ${SEATUNNEL_HOME}/plugins/Hive/lib wget 
https://repo1.maven.org/maven2/org/apache/hive/hive-exec/2.3.9/hive-exec-2.3.9.jar ``` ### 步骤 3 从您的 EMR 环境中复制 jar 文件到 lib 目录并删除冲突的 jar。 ```shell cp -r /opt/apps/JINDOSDK/jindosdk-current/lib/jindo-*.jar ${SEATUNNEL_HOME}/plugins/Hive/lib rm -f ${SEATUNNEL_HOME}/lib/hadoop-aliyun-*.jar ``` ### 步骤 4 运行案例。 ```shell env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { schema = { fields { pk_id = bigint name = string score = int } primaryKey { name = "pk_id" columnNames = [pk_id] } } rows = [ { kind = INSERT fields = [1, "A", 100] }, { kind = INSERT fields = [2, "B", 100] }, { kind = INSERT fields = [3, "C", 100] } ] } } sink { Hive { table_name = "test_hive.test_hive_sink_on_oss" metastore_uri = "thrift://master-1-1.c-1009b01725b501f2.cn-wulanchabu.emr.aliyuncs.com:9083" hive.hadoop.conf-path = "/tmp/hadoop" hive.hadoop.conf = { bucket="oss://emr-osshdfs.cn-wulanchabu.oss-dls.aliyuncs.com" } } } ``` ### 示例 2 我们有多个源表如下: ```bash create table test_1( ) PARTITIONED BY (xx); create table test_2( ) PARTITIONED BY (xx); ... 
``` 我们需要从这些源表读取数据并写入其他表: 作业配置文件可以如下: ``` env { # 您可以在此处设置 Flink 配置 parallelism = 3 job.name="test_hive_source_to_hive" } source { Hive { tables_configs = [ { table_name = "test_hive.test_1" metastore_uri = "thrift://ctyun6:9083" }, { table_name = "test_hive.test_2" metastore_uri = "thrift://ctyun7:9083" } ] } } sink { # 选择 stdout 输出插件将数据输出到控制台 Hive { table_name = "${database_name}.${table_name}" metastore_uri = "thrift://ctyun7:9083" } } ``` ## 自动建表示例 ### 示例 1:基础自动建表 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { schema = { fields { id = bigint name = string department = string salary = decimal(10,2) hire_date = date } } rows = [ { kind = INSERT fields = [1, "张三", "工程部", 75000.50, "2022-01-15"] } ] } } sink { Hive { table_name = "warehouse.employees" metastore_uri = "thrift://metastore:9083" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" save_mode_create_template = """ CREATE TABLE IF NOT EXISTS `${database}`.`${table}` ( ${rowtype_fields} ) PARTITIONED BY ( department string COMMENT '部门分区' ) STORED AS PARQUET LOCATION '${table_location}' """ } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Http.md ================================================ import ChangeLog from '../changelog/connector-http.md'; # Http > Http 数据接收器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) ## 描述 接收Source端传入的数据,利用数据触发 web hooks。 > 例如,来自上游的数据为[`age: 12, name: tyrantlucifer`],则body内容如下:`{"age": 12, "name": "tyrantlucifer"}` **Tips: Http 接收器仅支持 `post json` 类型的 web hook,source 数据将被视为 webhook 中的 body 内容。** ## 支持的数据源信息 想使用 Http 连接器,需要安装以下必要的依赖。可以通过运行 install-plugin.sh 脚本或者从 Maven 中央仓库下载这些依赖 | 数据源 | 支持版本 | 依赖 | |------|------|------------------------------------------------------------------------------| | Http | 通用 | [下载](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-http) | ## 接收器选项 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |-----------------------------|--------|------|-------|------------------------------------------------------------| | url | String | 是 | - | Http 请求链接 | | headers | Map | 否 | - | Http 标头 | | retry | Int | 否 | - | 如果请求http返回`IOException`的最大重试次数 | | retry_backoff_multiplier_ms | Int | 否 | 100 | http请求失败,重试回退次数(毫秒)乘数 | | retry_backoff_max_ms | Int | 否 | 10000 | http请求失败,最大重试回退时间(毫秒) | | connect_timeout_ms | Int | 否 | 12000 | 连接超时设置,默认12s | | socket_timeout_ms | Int | 否 | 60000 | 套接字超时设置,默认为60s | | array_mode | Boolean| 否 | false | 为true时将数据作为JSON数组发送,为false时作为单个JSON对象发送(默认) | | batch_size | Int | 否 | 1 | 在一个HTTP请求中发送的记录批量大小。仅在array_mode为true时有效 | | request_interval_ms | Int | 否 | 0 | 两次HTTP请求之间的间隔毫秒数,以避免请求过于频繁 | | common-options | | 否 | - | Sink插件常用参数,请参考 [Sink常用选项 ](../common-options/sink-common-options.md) 了解详情 | ## 示例 简单示例: ```hocon Http { url = "http://localhost/test/webhook" headers { token = "9e32e859ef044462a257e1fc76730066" } } ``` ### 带批处理的示例 ```hocon Http { url = "http://localhost/test/webhook" headers { token = "9e32e859ef044462a257e1fc76730066" Content-Type = "application/json" } array_mode = true batch_size = 50 request_interval_ms = 500 } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Hudi.md 
================================================ import ChangeLog from '../changelog/connector-hudi.md'; # Hudi > Hudi 接收器连接器 ## 描述 用于将数据写入 Hudi。 ## 主要特点 - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## 选项 基础配置: | 名称 | 类型 | 是否必需 | 默认值 | |----------------------------|--------|------ |------------------------------| | table_dfs_path | string | 是 | - | | conf_files_path | string | 否 | - | | table_list | string | 否 | - | | schema_save_mode | enum | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | | common-options | config | 否 | - | 表清单配置: | 名称 | 类型 | 是否必需 | 默认值 | |----------------------------|--------|----------|---------------| | table_name | string | yes | - | | database | string | no | default | | table_type | enum | no | COPY_ON_WRITE | | op_type | enum | no | insert | | record_key_fields | string | no | - | | partition_fields | string | no | - | | precombine_field | string | no | - | | batch_interval_ms | Int | no | 1000 | | batch_size | Int | no | 1000 | | insert_shuffle_parallelism | Int | no | 2 | | upsert_shuffle_parallelism | Int | no | 2 | | min_commits_to_keep | Int | no | 20 | | max_commits_to_keep | Int | no | 30 | | index_type | enum | no | BLOOM | | index_class_name | string | no | - | | record_byte_size | Int | no | 1024 | | cdc_enabled | boolean| no | false | 注意: 当此配置对应于单个表时,您可以将table_list中的配置项展平到外层。 ### table_name [string] `table_name` Hudi 表的名称。 ### database [string] `database` Hudi 表的database. ### table_dfs_path [string] `table_dfs_path` Hudi 表的 DFS 根路径,例如 "hdfs://nameservice/data/hudi/"。 ### table_type [enum] `table_type` Hudi 表的类型。 ### record_key_fields [string] `record_key_fields` Hudi 表的记录键字段, 当op_type是`UPSERT`类型时, 必须配置该项. ### partition_fields [string] `partition_fields` Hudi 表的分区字段. ### precombine_field [string] `precombine_field` Hudi 表的预合并字段,它用于在写入前进行预合并. 
### index_type [string] `index_type` Hudi 表的索引类型. 当前只支持`BLOOM`, `SIMPLE`, `GLOBAL SIMPLE`三种类型. ### index_class_name [string] `index_class_name` Hudi 表自定义索引名称,例如: `org.apache.seatunnel.connectors.seatunnel.hudi.index.CustomHudiIndex`. ### record_byte_size [Int] `record_byte_size` Hudi 表单行记录的大小, 该值可用于预估每个hudi数据文件中记录的大致数量。调整此参数与`batch_size`可以有效减少hudi数据文件写放大次数. ### conf_files_path [string] `conf_files_path` 环境配置文件路径列表(本地路径),用于初始化 HDFS 客户端以读取 Hudi 表文件。示例:"/home/test/hdfs-site.xml;/home/test/core-site.xml;/home/test/yarn-site.xml"。 ### op_type [enum] `op_type` Hudi 表的操作类型。值可以是 `insert`、`upsert` 或 `bulk_insert`。 ### batch_interval_ms [Int] `batch_interval_ms` 批量写入 Hudi 表的时间间隔。 ### batch_size [Int] `batch_size` 批量写入 Hudi 表的记录数大小. ### insert_shuffle_parallelism [Int] `insert_shuffle_parallelism` 插入数据到 Hudi 表的并行度。 ### upsert_shuffle_parallelism [Int] `upsert_shuffle_parallelism` 更新插入数据到 Hudi 表的并行度。 ### min_commits_to_keep [Int] `min_commits_to_keep` Hudi 表保留的最少提交数。 ### max_commits_to_keep [Int] `max_commits_to_keep` Hudi 表保留的最多提交数。 ### cdc_enabled [boolean] `cdc_enabled` 是否持久化Hudi表的CDC变更日志。启用后,在必要时持久化更改数据,表可以作为CDC模式进行查询. ### schema_save_mode [Enum] 在启动同步任务之前,针对目标侧已有的表结构选择不同的处理方案
    选项介绍:
    `RECREATE_SCHEMA`:当表不存在时会创建,当表已存在时会删除并重建
    `CREATE_SCHEMA_WHEN_NOT_EXIST`:当表不存在时会创建,当表已存在时则跳过创建
    `ERROR_WHEN_SCHEMA_NOT_EXIST`:当表不存在时将抛出错误
    `IGNORE`:忽略对表的处理
    ### 通用选项 Sink 插件的通用参数,请参考 [Sink 通用选项](../common-options/sink-common-options.md) 了解详细信息。 ## 示例 ### 单表 ```hocon sink { Hudi { table_dfs_path = "hdfs://nameservice/data/" database = "st" table_name = "test_table" table_type = "COPY_ON_WRITE" conf_files_path = "/home/test/hdfs-site.xml;/home/test/core-site.xml;/home/test/yarn-site.xml" batch_size = 10000 use.kerberos = true kerberos.principal = "test_user@xxx" kerberos.principal.file = "/home/test/test_user.keytab" } } ``` ### 多表 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { Hudi { table_dfs_path = "hdfs://nameservice/data/" conf_files_path = "/home/test/hdfs-site.xml;/home/test/core-site.xml;/home/test/yarn-site.xml" table_list = [ { database = "st1" table_name = "role" table_type = "COPY_ON_WRITE" op_type="INSERT" batch_size = 10000 }, { database = "st1" table_name = "user" table_type = "COPY_ON_WRITE" op_type="UPSERT" # when op_type is 'UPSERT', record_key_fields must be configured record_key_fields = "user_id" batch_size = 10000 }, { database = "st1" table_name = "Bucket" table_type = "MERGE_ON_READ" } ] ... 
} } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/HugeGraph.md ================================================ import ChangeLog from '../changelog/connector-hugegraph.md'; # HugeGraph Sink Connector `Sink: HugeGraph` ## 描述 HugeGraph sink连接器允许您将数据从SeaTunnel写入Apache HugeGraph,这是一个快速且可扩展的图数据库。 该连接器支持将数据作为顶点或边写入,提供了从关系数据模型到图结构的灵活映射。它专为高性能数据加载而设计。 ## 特性 - **批量写入**: 数据分批写入,以实现高吞吐量。 - **灵活映射**: 支持将源字段灵活映射到顶点/边属性。 - **顶点和边写入**: 可以将数据作为顶点或边写入。 - **自动创建Schema**: 如果不存在,可以自动创建图Schema元素(属性键、顶点标签、边标签)。 ## 配置选项 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | | ------------------- | ------- | -------- | ------ | ---------------------------------------------------------------------- | | `host` | String | 是 | - | HugeGraph服务器的主机。 | | `port` | Integer | 是 | - | HugeGraph服务器的端口。 | | `graph_name` | String | 是 | - | 要写入的图的名称。 | | `graph_space` | String | 是 | - | 要操作的图的图空间。 | | `username` | String | 否 | - | 用于HugeGraph身份验证的用户名。 | | `password` | String | 否 | - | 用于HugeGraph身份验证的密码。 | | `batch_size` | Integer | 否 | 500 | 在单批次写入HugeGraph之前缓冲的记录数。 | | `batch_interval_ms` | Integer | 否 | 5000 | 刷新批次前等待的最大时间(毫秒)。 | | `max_retries` | Integer | 否 | 3 | 重试失败写入操作的最大次数。 | | `retry_backoff_ms` | Integer | 否 | 5000 | 重试之间的退避时间(毫秒)。 | ## Sink选项 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | | ------------------ | ------ | -------- | ------ | -------------------------------------------------------------------- | | `schema_config` | Object | 是 | - | 将输入数据映射到HugeGraph的Schema(顶点或边)的配置。 | | `selected_fields` | List | 否 | - | 要从输入数据中选择的字段列表。如果未指定,将使用所有字段。 | | `ignored_fields` | List | 否 | - | 要从输入数据中忽略的字段列表。与`selected_fields`互斥。 | ### Schema配置 (`schema_config`) `schema_config`列表中的每个对象都定义了从源数据到HugeGraph中特定顶点或边标签的映射。 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | | ------------------ | ------------------- | -------- | ------- |------------------------------------------------------------| | `type` | String | 是 | - | 要映射到的图元素的类型。必须是`VERTEX`或`EDGE`。 | | `label` | String | 是 | - | HugeGraph中顶点或边的标签。 | | `properties` | 
`List` | 否 | - | 顶点或边的源字段名称列表。 | | `ttl` | Long | 否 | - | 顶点或边的生存时间(秒)。 | | `ttlStartTime` | String | 否 | - | TTL的开始时间。 | | `enableLabelIndex` | Boolean | 否 | `false` | 是否为此标签启用标签索引。 | | `userdata` | `Map` | 否 | - | 与标签关联的用户定义数据。 | | `idStrategy` | String | 对于顶点 | - | 顶点的ID生成策略。支持的值:`PRIMARY_KEY`、`CUSTOMIZE_UUID`、`AUTOMATIC`。 | | `idFields` | `List` | 对于顶点 | - | 用于生成顶点ID的源字段名称列表。 | | `sourceConfig` | Object | 对于边 | - | 定义边的源顶点映射的对象。请参阅下面的`Source/Target Config`。 | | `targetConfig` | Object | 对于边 | - | 定义边的目标顶点映射的对象。请参阅下面的`Source/Target Config`。 | | `frequency` | String | 对于边 | - | 边的频率,例如`SINGLE`、`MULTIPLE`。 | | `mapping` | Object | 否 | - | 定义高级字段和值映射的对象。请参阅下面的`Mapping Config`。 | ### Source/Target配置 (`sourceConfig` 和 `targetConfig`) 此对象在`EDGE` Schema中使用,用于定义如何识别源顶点和目标顶点。 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | | ---------- | ------------ | -------- | ------ | -------------------------------------------------------------------------------------------------------------------------------------------- | | `label` | String | 是 | - | 源或目标顶点的标签。 | | `idFields` | `List` | 是 | - | 用于构造源/目标顶点ID的输入行中的源字段名称列表。这些值将被连接起来形成顶点ID。 | ### Mapping配置 (`mapping`) 此对象提供对字段和值如何映射到属性的高级控制。 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | | ----------------- | ------------------ | -------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `fieldMapping` | `Map` | 否 | - | 一个映射,其中键是源字段名,值是HugeGraph中的目标属性名。如果未指定,则使用源字段名作为目标属性名。 | | `valueMapping` | `Map` | 否 | - | 用于转换特定字段值的映射。键是源的原始值,值是要写入的新值。 | | `nullableKeys` | `List` | 否 | - | 可以具有null值的属性键列表。 | | `nullValues` | `List` | 否 | - | 应被视为`null`的字符串值列表。任何包含这些值的字段都不会被写入。 | | `dateFormat` | String | 否 | `yyyy-MM-dd` | 用于解析日期字符串的日期格式。 | | `timeZone` | String | 否 | `GMT+8` | 用于日期解析的时区。 | | `sortKeys` | `List` | 对于边 | - | 用于对具有相同源和目标顶点的边进行排序的属性键列表。 | ## 使用示例 ### 1. 
写入顶点 此示例展示了如何从`FakeSource`读取数据并将`person`顶点写入HugeGraph。顶点ID基于`name`字段。 ```hocon env { job.mode = "BATCH" } source { FakeSource { plugin_input = "fake_source" schema = { fields = { name = "string" age = "int" } } } } sink { HugeGraph { host = "localhost" port = 8080 graph_name = "hugegraph" graph_space = "default" selected_fields = ["name", "age"] schema_config = { type = "VERTEX" label = "person" idStrategy = "PRIMARY_KEY" idFields = ["name"] properties = ["name", "age"] } } } ``` ### 2. 写入边 此示例将一个关系表同步为HugeGraph中的`knows`边。源表包含相互认识的两个人的姓名以及他们相识的年份。 ```hocon env { job.mode = "BATCH" } source { FakeSource { plugin_input = "fake_source" schema = { fields = { person1_name = "string" person2_name = "string" since = "int" } } } } sink { HugeGraph { host = "localhost" port = 8080 graph_name = "hugegraph" graph_space = "default" schema_config = { type = "EDGE" label = "knows" sourceConfig = { label = "person" idFields = ["person1_name"] } targetConfig = { label = "person" idFields = ["person2_name"] } properties = ["since"] mapping = { fieldMapping = { person1_name = "name" person2_name = "name" } } } } } ``` ## Changelog ================================================ FILE: docs/zh/connectors/sink/Iceberg.md ================================================ import ChangeLog from '../changelog/connector-iceberg.md'; # Apache Iceberg > Apache Iceberg sink连接器 ## Iceberg 版本支持 - 1.6.1 ## 引擎支持 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 Apache Iceberg 目标连接器支持cdc模式、自动建表及表结构变更. ## 主要特性 - [x] [支持多表写入](../../introduction/concepts/connector-v2-features.md) ## 支持的数据源信息 | 数据源 | 依赖项 | Maven依赖 | |---------|-----------|---------------------------------------------------------------------| | Iceberg | hive-exec | [下载](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) | | Iceberg | libfb303 | [下载](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) | ## 数据库依赖 > 为了确保与不同版本的 Hadoop 和 Hive 兼容,项目 pom 文件中的 hive-exec 依赖范围被设置为 provided。因此,如果您使用 Flink 引擎,可能需要将以下 Jar 包添加到 /lib 目录中;如果您使用的是 Spark 引擎并且已经集成了 Hadoop,则无需添加以下 Jar 包。 ``` hive-exec-xxx.jar libfb303-xxx.jar ``` > 某些版本的 hive-exec 包中不包含 libfb303-xxx.jar,因此您还需要手动导入该 Jar 包。 ## 数据类型映射 | SeaTunnel 数据类型 | Iceberg 数据类型 | |----------------|------------------| | BOOLEAN | BOOLEAN | | INT | INTEGER | | BIGINT | LONG | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | STRING | STRING | | BYTES | FIXED
    BINARY | | DECIMAL | DECIMAL | | ROW | STRUCT | | ARRAY | LIST | | MAP | MAP | ## Sink 选项 | 名称 | 类型 | 是否必须 | 默认 | 描述 | |----------------------------------------|---------|------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | catalog_name | string | yes | default | 用户指定的目录名称,默认为`default` | | namespace | string | yes | default | backend catalog(元数据存储的后端目录)中 Iceberg 数据库的名称,默认为 `default` | | table | string | yes | - | backend catalog(元数据存储的后端目录)中 Iceberg 表的名称 | | iceberg.catalog.config | map | yes | - | 用于指定初始化 Iceberg Catalog 的属性,这些属性可以参考此文件:[CatalogProperties.java](https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/CatalogProperties.java) | | hadoop.config | map | no | - | 传递给 Hadoop 配置的属性 | | iceberg.hadoop-conf-path | string | no | - | 指定`core-site.xml`、`hdfs-site.xml`、`hive-site.xml` 文件的加载路径 | | case_sensitive | boolean | no | false | 列名匹配时是否区分大小写 | | iceberg.table.write-props | map | no | - | 传递给 Iceberg 写入器初始化的属性,这些属性具有最高优先级,例如 `write.format.default`、`write.target-file-size-bytes` 等设置。具体参数可以参考:[TableProperties.java](https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/TableProperties.java)。 | | iceberg.table.auto-create-props | map | no | - | Iceberg 自动建表时指定的配置 | | iceberg.table.schema-evolution-enabled | boolean | no | false | 设置为 true 时,Iceberg 表可以在同步过程中支持 schema 变更 | | iceberg.table.primary-keys | string | no | - | 用于标识表中一行数据的主键列列表,默认情况下以逗号分隔 | | iceberg.table.partition-keys | string | no | - | 创建表时使用的分区字段列表,默认情况下以逗号分隔。多表场景可使用占位符 `${partition_keys}` | | iceberg.table.upsert-mode-enabled | boolean | no | false | 设置为 `true` 以启用 upsert 模式,默认值为 `false` | | schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | schema 变更方式, 请参考下面的 `schema_save_mode` | | data_save_mode | Enum | no | APPEND_DATA | 
数据写入方式, 请参考下面的 `data_save_mode` | | custom_sql | string | no | - | 自定义 `delete` 数据的 SQL 语句,用于数据写入方式。例如: `delete from ... where ...` | | iceberg.table.commit-branch | string | no | - | 提交的默认分支 | ## 任务示例 ### 简单示例 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MySQL-CDC { plugin_output = "customers_mysql_cdc_iceberg" server-id = 5652 username = "st_user" password = "seatunnel" table-names = ["mysql_cdc.mysql_cdc_e2e_source_table"] url = "jdbc:mysql://mysql_cdc_e2e:3306/mysql_cdc" } } transform { } sink { Iceberg { catalog_name="seatunnel_test" iceberg.catalog.config={ "type"="hadoop" "warehouse"="file:///tmp/seatunnel/iceberg/hadoop-sink/" } namespace="seatunnel_namespace" table="iceberg_sink_table" iceberg.table.write-props={ write.format.default="parquet" write.target-file-size-bytes=536870912 } iceberg.table.primary-keys="id" iceberg.table.partition-keys="f_datetime" iceberg.table.upsert-mode-enabled=true iceberg.table.schema-evolution-enabled=true case_sensitive=true } } ``` ### Hive Catalog ```hocon sink { Iceberg { catalog_name="seatunnel_test" iceberg.catalog.config={ type = "hive" uri = "thrift://localhost:9083" warehouse = "hdfs://your_cluster//tmp/seatunnel/iceberg/" } namespace="seatunnel_namespace" table="iceberg_sink_table" iceberg.table.write-props={ write.format.default="parquet" write.target-file-size-bytes=536870912 } iceberg.table.primary-keys="id" iceberg.table.partition-keys="f_datetime" iceberg.table.upsert-mode-enabled=true iceberg.table.schema-evolution-enabled=true case_sensitive=true } } ``` ### Hadoop catalog ```hocon sink { Iceberg { catalog_name="seatunnel_test" iceberg.catalog.config={ type = "hadoop" warehouse = "hdfs://your_cluster/tmp/seatunnel/iceberg/" } namespace="seatunnel_namespace" table="iceberg_sink_table" iceberg.table.write-props={ write.format.default="parquet" write.target-file-size-bytes=536870912 } iceberg.table.primary-keys="id" iceberg.table.partition-keys="f_datetime" 
iceberg.table.upsert-mode-enabled=true iceberg.table.schema-evolution-enabled=true case_sensitive=true } } ``` ### AWS S3 Tables REST Catalog Amazon S3 表类数据存储服务提供针对分析工作负载进行优化的 S3 存储,其功能旨在持续提高查询性能并降低表的存储成本。S3 表类数据存储服务专为存储表数据而设计,例如每日购买交易、流传感器数据或广告展示次数。表数据以列和行表示数据,就像在数据库表中一样。 您可以将 Iceberg REST 客户端连接到 Amazon S3 表类数据存储服务 Iceberg REST 端点,然后进行 REST API 调用来创建、更新或查询 S3 表存储桶中的表。该端点实现了 Apache Iceberg REST Catalog Open API specification 中指定的一组标准化 Iceberg REST API。该端点的工作原理是将 Iceberg REST API 操作转换为相应的 S3 表类数据存储服务操作。 S3 表类数据存储服务中的数据存储在新的存储桶类型中:表存储桶,它将表存储为子资源。表存储桶支持以 Apache Iceberg 格式存储表。使用标准 SQL 语句,您可以通过支持 Iceberg 的查询引擎来查询表,例如 Amazon Athena、Amazon Redshift 和 Apache Spark。 ```hocon sink { Iceberg { catalog_name = "s3_tables_catalog" namespace = "s3_tables_catalog" table = "user_data" iceberg.catalog.config = { type: "rest" warehouse: "arn:aws:s3tables:::bucket/" uri: "https://s3tables..amazonaws.com/iceberg" rest.sigv4-enabled: "true" rest.signing-name: "s3tables" rest.signing-region: "" } } } ``` ### Multiple table(多表写入) #### 示例1 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { Iceberg { ... namespace = "${database_name}_test" table = "${table_name}_test" } } ``` #### 示例2 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = oracle.jdbc.driver.OracleDriver url = "jdbc:oracle:thin:@localhost:1521/XE" user = testUser password = testPassword table_list = [ { table_path = "TESTSCHEMA.TABLE_1" }, { table_path = "TESTSCHEMA.TABLE_2" } ] } } transform { } sink { Iceberg { ... 
namespace = "${schema_name}_test" table = "${table_name}_test" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/InfluxDB.md ================================================ import ChangeLog from '../changelog/connector-influxdb.md'; # InfluxDB > InfluxDB Sink 连接器 ## 描述 将数据写入 InfluxDB。 ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [支持多表写入](../../introduction/concepts/connector-v2-features.md) ## 选项 | 参数名 | 类型 | 必须 | 默认值 | |-----------------------------|--------|------|------------------------------| | url | string | 是 | - | | database | string | 是 | | | measurement | string | 是 | | | username | string | 否 | - | | password | string | 否 | - | | key_time | string | 否 | processing time | | key_tags | array | 否 | exclude `field` & `key_time` | | batch_size | int | 否 | 1024 | | max_retries | int | 否 | - | | retry_backoff_multiplier_ms | int | 否 | - | | connect_timeout_ms | long | 否 | 15000 | | common-options | config | 否 | - | ### url 连接到 influxDB 的 url,例如 ``` http://influxdb-host:8086 ``` ### database [string] `influxDB` 数据库的名称 ### measurement [string] `influxDB` measurement 的名称 ### username [string] `influxDB` 用户名 ### password [string] `influxDB` 用户密码 ### key_time [string] 在 SeaTunnelRow 中指定 `influxDB` measurement 时间戳的字段名。如果未指定,则使用处理时间作为时间戳 ### key_tags [array] 在 SeaTunnelRow 中指定 `influxDB` measurement 标签的字段名。 如果未指定,则包含所有字段作为 `influxDB` measurement 字段 ### batch_size [int] 对于批量写入,当缓冲区数量达到 `batch_size` 数量或时间达到 `checkpoint.interval` 时,数据将被刷新到 influxDB ### max_retries [int] 刷新失败的重试次数 ### retry_backoff_multiplier_ms [int] 用作生成下一个退避延迟的乘数 ### max_retry_backoff_ms [int] 在尝试重新请求 `influxDB` 之前等待的时间量 ### connect_timeout_ms [long] 连接到 InfluxDB 的超时时间,以毫秒为单位 ### 通用选项 Sink 插件通用参数,请参考 [Sink 通用选项](../common-options/sink-common-options.md) 详见 ## 示例 ```hocon sink { InfluxDB { url = "http://influxdb-host:8086" database = "test" measurement = "sink" key_time = "time" key_tags = ["label"] batch_size = 1 } } ``` 
### 多表 #### 示例1 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { InfluxDB { url = "http://influxdb-host:8086" database = "test" measurement = "${table_name}_test" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/IoTDB.md ================================================ import ChangeLog from '../changelog/connector-iotdb.md'; # IoTDB > IoTDB数据接收器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 用于将数据写入 IoTDB。 ## 主要特性 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) > IoTDB 通过幂等写支持`精确一次`功能。如果两条数据使用相同的`key`和`timestamp`,新数据将覆盖旧数据。 ## 支持的数据源信息 | 数据源 | Supported 版本 | 地址 | |-------|------------------------------|----------------| | IoTDB | `0.13.0 <= version <= 1.3.X` | localhost:6667 | ## 数据类型映射 | IoTDB 数据类型 | SeaTunnel 数据类型 | |------------|----------------| | BOOLEAN | BOOLEAN | | INT32 | TINYINT | | INT32 | SMALLINT | | INT32 | INT | | INT64 | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | TEXT | STRING | ## Sink 选项 | 名称 | 类型 | 是否必传 | 默认值 | 描述 | |-----------------------------|---------|------|--------------------------------|------------------------------------------------------------------------------| | node_urls | Array | 是 | - | IoTDB 集群地址,格式为 `["host1:port"]` 或 `["host1:port","host2:port"]` | | username | String | 是 | - | IoTDB 用户的用户名 | | password | String | 是 | - | IoTDB 用户的密码 | | key_device | String | 是 | - | 在SeaTunnelRow中指定 IoTDB 设备ID的字段名 | | key_timestamp | String | 否 | processing time | 在SeaTunnelRow中指定 IoTDB 时间戳的字段名。如果未指定,则使用处理时间作为时间戳 | | key_measurement_fields | Array | 否 | exclude `device` & `timestamp` | 在SeaTunnelRow中指定 IoTDB 测量列表的字段名称。如果未指定,则包括所有字段,但排除 `device` & `timestamp` | | storage_group | Array | 否 | - | 指定设备存储组(路径前缀)
    例如: deviceId = \${storage_group} + "." + \${key_device} | | batch_size | Integer | 否 | 1024 | 对于批写入,当缓冲区的数量达到`batch_size`的数量或时间达到`batch_interval_ms`时,数据将被刷新到IoTDB中 | | max_retries | Integer | 否 | - | 刷新失败时的重试次数 | | retry_backoff_multiplier_ms | Integer | 否 | - | 用作生成下一个退避延迟的乘数 | | max_retry_backoff_ms | Integer | 否 | - | 尝试重试对 IoTDB 的请求之前等待的时间量 | | default_thrift_buffer_size | Integer | 否 | - | IoTDB 客户端中 Thrift 的初始化缓冲区大小 | | max_thrift_frame_size | Integer | 否 | - | IoTDB 客户端中 Thrift 的最大帧大小 | | zone_id | string | 否 | - | IoTDB 客户端使用的时区(java.time.ZoneId) | | enable_rpc_compression | Boolean | 否 | - | 在 IoTDB 客户端中启用rpc压缩 | | connection_timeout_in_ms | Integer | 否 | - | 连接到 IoTDB 时等待的最长时间(毫秒) | | common-options | | 否 | - | Sink 插件常用参数,详见 [Sink 常用选项](../common-options/sink-common-options.md) | ## 示例 ```hocon env { parallelism = 2 job.mode = "BATCH" } source { FakeSource { row.num = 16 bigint.template = [1664035200001] schema = { fields { device_name = "string" temperature = "float" moisture = "int" event_ts = "bigint" c_string = "string" c_boolean = "boolean" c_tinyint = "tinyint" c_smallint = "smallint" c_int = "int" c_bigint = "bigint" c_float = "float" c_double = "double" } } } } ``` 上游SeaTunnelRow数据格式如下: | device_name | temperature | moisture | event_ts | c_string | c_boolean | c_tinyint | c_smallint | c_int | c_bigint | c_float | c_double | |--------------------------|-------------|----------|---------------|----------|-----------|-----------|------------|-------|------------|---------|----------| | root.test_group.device_a | 36.1 | 100 | 1664035200001 | abc1 | true | 1 | 1 | 1 | 2147483648 | 1.0 | 1.0 | | root.test_group.device_b | 36.2 | 101 | 1664035200001 | abc2 | false | 2 | 2 | 2 | 2147483649 | 2.0 | 2.0 | | root.test_group.device_c | 36.3 | 102 | 1664035200001 | abc3 | false | 3 | 3 | 3 | 2147483649 | 3.0 | 3.0 | ### 案例1 只填写所需的配置: - 使用当前处理时间作为时间戳 - 测点包括排除了`key_device`后的其余字段 ```hocon sink { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" 
key_device = "device_name" # specify the `deviceId` use device_name field } } ``` IoTDB 数据格式的输出如下: ```shell IoTDB> SELECT * FROM root.test_group.* align by device; +------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ | Time| Device| temperature| moisture| event_ts| c_string| c_boolean| c_tinyint| c_smallint| c_int| c_bigint| c_float| c_double| +------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ |2023-09-01T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1664035200001| abc1| true| 1| 1| 1| 2147483648| 1.0| 1.0| |2023-09-01T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 1664035200001| abc2| false| 2| 2| 2| 2147483649| 2.0| 2.0| |2023-09-01T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 1664035200001| abc2| false| 3| 3| 3| 2147483649| 3.0| 3.0| +------------------------+------------------------+--------------+-----------+--------------+---------+---------+-----------+-----------+------+-----------+--------+---------+ ``` ### 案例2 使用源事件的时间: - 使用指定字段作为时间戳 - 测点包括排除了`key_device`和`key_timestamp`后的其余字段 ```hocon sink { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" key_device = "device_name" # specify the `deviceId` use device_name field key_timestamp = "event_ts" # specify the `timestamp` use event_ts field } } ``` IoTDB 数据格式的输出如下: ```shell IoTDB> SELECT * FROM root.test_group.* align by device; +------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ | Time| Device| temperature| moisture| event_ts| c_string| c_boolean| c_tinyint| c_smallint| c_int| c_bigint| c_float| c_double| 
+------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ |2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1664035200001| abc1| true| 1| 1| 1| 2147483648| 1.0| 1.0| |2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 1664035200001| abc2| false| 2| 2| 2| 2147483649| 2.0| 2.0| |2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 1664035200001| abc2| false| 3| 3| 3| 2147483649| 3.0| 3.0| +------------------------+------------------------+--------------+-----------+--------------+---------+---------+-----------+-----------+------+-----------+--------+---------+ ``` ### 案例3 使用源事件的时间和限制测量字段: - 使用指定字段作为时间戳 - 测点仅包括`key_measurement_fields`指定的字段 ```hocon sink { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" key_device = "device_name" key_timestamp = "event_ts" key_measurement_fields = ["temperature", "moisture"] } } ``` IoTDB 数据格式的输出如下: ```shell IoTDB> SELECT * FROM root.test_group.* align by device; +------------------------+------------------------+--------------+-----------+ | Time| Device| temperature| moisture| +------------------------+------------------------+--------------+-----------+ |2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| |2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| |2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| +------------------------+------------------------+--------------+-----------+ ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/IoTDBv2.md ================================================ import ChangeLog from '../changelog/connector-iotdb.md'; # IoTDB > IoTDB 数据接收器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 用于将数据写入 IoTDB。 ## 主要特性 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) > IoTDB 通过幂等写支持`精确一次`功能。如果两条数据使用相同的`key`和`timestamp`,新数据将覆盖旧数据。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 地址 | |-------|------------------|----------------| | IoTDB | `2.0 <= version` | localhost:6667 | ## 数据类型映射 | SeaTunnel 数据类型 | IoTDB 数据类型 | |----------------|------------| | BOOLEAN | BOOLEAN | | TINYINT | INT32 | | SMALLINT | INT32 | | INT | INT32 | | BIGINT | INT64 | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | STRING | STRING | | TIMESTAMP | TIMESTAMP | | DATE | DATE | ## Sink 选项 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |-----------------------------|---------|------|--------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | node_urls | Array | 是 | - | IoTDB 集群地址,格式为 `["host1:port"]` 或 `["host1:port","host2:port"]` | | username | String | 是 | - | IoTDB 用户名 | | password | String | 是 | - | IoTDB 用户密码 | | sql_dialect | String | 否 | tree | IoTDB 模型,tree:树模型;table:表模型 | | storage_group | String | 是 | - | IoTDB 树模型:指定设备存储组(路径前缀)
    例如: deviceId = \${storage_group} + "." + \${key_device}
    IoTDB 表模型:指定数据库 | | key_device | String | 是 | - | IoTDB 树模型:在 SeaTunnelRow 中指定 IoTDB 设备 ID 的字段名;
    IoTDB 表模型:在 SeaTunnelRow 中指定 IoTDB 表名的字段名 | | key_timestamp | String | 否 | 数据处理时间 | IoTDB 树模型:在 SeaTunnelRow 中指定 IoTDB 时间戳的字段名(如未指定,则使用处理时间作为时间戳);
    IoTDB 表模型:在 SeaTunnelRow 中指定 IoTDB 时间列的字段名(如未指定,则使用处理时间作为时间戳) | | key_measurement_fields | Array | 否 | 见描述 | IoTDB 树模型:在 SeaTunnelRow 中指定 IoTDB 测量列表的字段名(如未指定,则包括排除`key_device`&`key_timestamp`后的其余字段);
    IoTDB 表模型:在 SeaTunnelRow 中指定 IoTDB 测点列(FIELD)的字段名(如未指定,则包括排除`key_device`&`key_timestamp`&`key_tag_fields`&`key_attribute_fields`后的其余字段) | | key_tag_fields | Array | 否 | - | IoTDB 树模型:不生效;
    IoTDB 表模型:在 SeaTunnelRow 中指定 IoTDB 标签列(TAG)的字段名 | | key_attribute_fields | Array | 否 | - | IoTDB 树模型:不生效;
    IoTDB 表模型:在 SeaTunnelRow 中指定 IoTDB 属性列(ATTRIBUTE)的字段名 | | batch_size | Integer | 否 | 1024 | 对于批写入,当缓冲区的数量达到`batch_size`的数量或时间达到`batch_interval_ms`时,数据将被刷新到 IoTDB 中 | | max_retries | Integer | 否 | - | 刷新的重试次数 | | retry_backoff_multiplier_ms | Integer | 否 | - | 用作生成下一个退避延迟的乘数 | | max_retry_backoff_ms | Integer | 否 | - | 尝试重试对 IoTDB 的请求之前等待的时间量 | | default_thrift_buffer_size | Integer | 否 | - | IoTDB 客户端中 Thrift 的初始化缓冲区大小 | | max_thrift_frame_size | Integer | 否 | - | IoTDB 客户端中 Thrift 的最大帧大小 | | zone_id | String | 否 | - | IoTDB 客户端使用的时区(java.time.ZoneId) | | enable_rpc_compression | Boolean | 否 | - | 在 IoTDB 客户端中启用 rpc 压缩,只在树模型中生效 | | connection_timeout_in_ms | Integer | 否 | - | 连接到 IoTDB 时等待的最长时间(毫秒) | | common-options | | 否 | - | Sink 插件常用参数,详见 [Sink 常用选项](../common-options/sink-common-options.md) | ## 示例 ### 示例 1: 写入 IoTDB 树模型数据 ```hocon env { parallelism = 2 job.mode = "BATCH" } source { FakeSource { row.num = 16 bigint.template = [1664035200001] schema = { fields { device_name = "string" temperature = "float" moisture = "int" event_ts = "bigint" c_string = "string" c_boolean = "boolean" c_tinyint = "tinyint" c_smallint = "smallint" c_int = "int" c_bigint = "bigint" c_float = "float" c_double = "double" } } } } ``` 上游 SeaTunnelRow 数据格式如下: | device_name | temperature | moisture | event_ts | c_string | c_boolean | c_tinyint | c_smallint | c_int | c_bigint | c_float | c_double | |--------------------------|-------------|----------|---------------|----------|-----------|-----------|------------|-------|------------|---------|----------| | root.test_group.device_a | 36.1 | 100 | 1664035200001 | abc1 | true | 1 | 1 | 1 | 2147483648 | 1.0 | 1.0 | | root.test_group.device_b | 36.2 | 101 | 1664035200001 | abc2 | false | 2 | 2 | 2 | 2147483649 | 2.0 | 2.0 | | root.test_group.device_c | 36.3 | 102 | 1664035200001 | abc3 | false | 3 | 3 | 3 | 2147483649 | 3.0 | 3.0 | #### 案例 1 只填写所需的配置: - 使用当前处理时间作为时间戳 - 测点包括排除了`key_device`后的其余字段 ```hocon sink { IoTDB { node_urls = ["localhost:6667"] 
username = "root" password = "root" key_device = "device_name" # specify the `deviceId` use device_name field } } ``` IoTDB 数据格式的输出如下: ```shell IoTDB> SELECT * FROM root.test_group.* align by device; +------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ | Time| Device| temperature| moisture| event_ts| c_string| c_boolean| c_tinyint| c_smallint| c_int| c_bigint| c_float| c_double| +------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ |2023-09-01T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1664035200001| abc1| true| 1| 1| 1| 2147483648| 1.0| 1.0| |2023-09-01T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 1664035200001| abc2| false| 2| 2| 2| 2147483649| 2.0| 2.0| |2023-09-01T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 1664035200001| abc2| false| 3| 3| 3| 2147483649| 3.0| 3.0| +------------------------+------------------------+--------------+-----------+--------------+---------+---------+-----------+-----------+------+-----------+--------+---------+ ``` #### 案例 2 使用源事件的时间: - 使用指定字段作为时间戳 - 测点包括排除了`key_device`和`key_timestamp`后的其余字段 ```hocon sink { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" key_device = "device_name" # specify the `deviceId` use device_name field key_timestamp = "event_ts" # specify the `timestamp` use event_ts field } } ``` IoTDB 数据格式的输出如下: ```shell IoTDB> SELECT * FROM root.test_group.* align by device; +------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ | Time| Device| temperature| moisture| event_ts| c_string| c_boolean| c_tinyint| c_smallint| c_int| c_bigint| c_float| c_double| 
+------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+ |2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1664035200001| abc1| true| 1| 1| 1| 2147483648| 1.0| 1.0| |2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 1664035200001| abc2| false| 2| 2| 2| 2147483649| 2.0| 2.0| |2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 1664035200001| abc2| false| 3| 3| 3| 2147483649| 3.0| 3.0| +------------------------+------------------------+--------------+-----------+--------------+---------+---------+-----------+-----------+------+-----------+--------+---------+ ``` #### 案例 3 使用源事件的时间和限制测量字段: - 使用指定字段作为时间戳 - 测点仅包括`key_measurement_fields`指定的字段 ```hocon sink { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" key_device = "device_name" key_timestamp = "event_ts" key_measurement_fields = ["temperature", "moisture"] } } ``` IoTDB 数据格式的输出如下: ```shell IoTDB> SELECT * FROM root.test_group.* align by device; +------------------------+------------------------+--------------+-----------+ | Time| Device| temperature| moisture| +------------------------+------------------------+--------------+-----------+ |2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| |2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| |2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| +------------------------+------------------------+--------------+-----------+ ``` ### 示例 2: 写入 IoTDB 表模型数据 ```hocon env { parallelism = 2 job.mode = "BATCH" } source { FakeSource { ... 
schema = { fields { ts = timestamp model_id = string region = string tag = string status = boolean arrival_date = date temperature = double } } } } ``` 上游 SeaTunnelRow 数据格式如下: | ts | model_id | region | tag | status | arrival_date | temperature | |-------------------------|----------|--------|------|--------|--------------|-------------| | 2025-07-30T17:52:34.851 | id1 | 0700HK | tag1 | true | 2024-11-12 | 4.34 | | 2025-07-29T17:51:34.851 | id2 | 0700HK | tag2 | false | 2024-12-01 | 5.54 | | 2025-07-28T17:50:34.851 | id3 | 0700HK | tag3 | false | 2024-12-22 | 7.34 | #### 案例 1 只填写所需的配置: - 使用当前处理时间作为时间列 - 测量列(FIELD)包括排除了`key_device`后的其余字段 ```hocon sink { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" sql_dialect = "table" storage_group = "test_database" key_device = "region" } } ``` IoTDB 数据格式的输出如下: ```shell IoTDB> SELECT * FROM "test_database"."0700HK"; +-----------------------------+-----------------------+--------+----+------+------------+-----------+ | time| ts|model_id| tag|status|arrival_date|temperature| +-----------------------------+-----------------------+--------+----+------+------------+-----------+ |2025-08-14T17:52:34.851+08:00|2025-07-30T17:52:34.851| id1|tag1| true| 2024-11-12| 4.34| |2025-08-14T17:51:34.851+08:00|2025-07-29T17:51:34.851| id2|tag2| false| 2024-12-01| 5.54| |2025-08-14T17:50:34.851+08:00|2025-07-28T17:50:34.851| id3|tag3| false| 2024-12-22| 7.34| +-----------------------------+-----------------------+--------+----+------+------------+-----------+ ``` ```shell IoTDB> DESC "test_database"."0700HK"; +------------+---------+--------+ | ColumnName| DataType|Category| +------------+---------+--------+ | time|TIMESTAMP| TIME| | ts|TIMESTAMP| FIELD| | model_id| STRING| FIELD| | tag| STRING| FIELD| | status| BOOLEAN| FIELD| |arrival_date| DATE| FIELD| | temperature| DOUBLE| FIELD| +------------+---------+--------+ ``` #### 案例 2 使用源事件的时间和限制标签列及属性列: - 使用指定字段作为时间列 - 使用指定字段作为标签列(TAG)及属性列(ATTRIBUTE) - 
测量列(FIELD)包括排除了`key_device`、`key_timestamp`、`key_tag_fields`和`key_attribute_fields`后的其余字段 ```hocon sink { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" sql_dialect = "table" storage_group = "test_database" key_device = "region" key_timestamp = "ts" key_tag_fields = ["tag"] key_attribute_fields = ["model_id"] } } ``` IoTDB 数据格式的输出如下: ```shell IoTDB> SELECT * FROM "test_database"."0700HK"; +-----------------------------+----+--------+------+------------+-----------+ | time| tag|model_id|status|arrival_date|temperature| +-----------------------------+----+--------+------+------------+-----------+ |2025-07-30T17:52:34.851+08:00|tag1| id1| true| 2024-11-12| 4.34| |2025-07-29T17:51:34.851+08:00|tag2| id2| false| 2024-12-01| 5.54| |2025-07-28T17:50:34.851+08:00|tag3| id3| false| 2024-12-22| 7.34| +-----------------------------+----+--------+------+------------+-----------+ ``` ```shell IoTDB> DESC "test_database"."0700HK"; +------------+---------+---------+ | ColumnName| DataType| Category| +------------+---------+---------+ | time|TIMESTAMP| TIME| | tag| STRING| TAG| | model_id| STRING|ATTRIBUTE| | status| BOOLEAN| FIELD| |arrival_date| DATE| FIELD| | temperature| DOUBLE| FIELD| +------------+---------+---------+ ``` #### 案例 3 使用源事件的时间和限制测量列: - 使用指定字段作为时间列 - 使用指定字段作为测点列(FIELD) ```hocon sink { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" sql_dialect = "table" storage_group = "test_database" key_device = "region" key_timestamp = "ts" key_measurement_fields = ["status", "temperature"] } } ``` IoTDB 数据格式的输出如下: ```shell IoTDB> SELECT * FROM "test_database"."0700HK"; +-----------------------------+------+-----------+ | time|status|temperature| +-----------------------------+------+-----------+ |2025-07-30T17:52:34.851+08:00| true| 4.34| |2025-07-29T17:51:34.851+08:00| false| 5.54| |2025-07-28T17:50:34.851+08:00| false| 7.34| +-----------------------------+------+-----------+ ``` ```shell IoTDB> DESC 
"test_database"."0700HK"; +-----------+---------+--------+ | ColumnName| DataType|Category| +-----------+---------+--------+ | time|TIMESTAMP| TIME| | status| BOOLEAN| FIELD| |temperature| DOUBLE| FIELD| +-----------+---------+-------+ ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Jdbc.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # JDBC > JDBC 数据接收器 ## 描述 通过jdbc写入数据。支持批处理模式和流处理模式,支持并发写入,支持精确一次语义(使用XA事务保证) ## 使用依赖 ### 用于Spark/Flink引擎 > 1. 需要确保jdbc驱动jar包已经放在目录`${SEATUNNEL_HOME}/plugins/`下。 ### 适用于 SeaTunnel Zeta 引擎 > 1. 需要确保jdbc驱动jar包已经放到`${SEATUNNEL_HOME}/lib/`目录下。 ## 主要特性 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 使用 `Xa transactions` 来确保 `exactly-once`。所以仅对于支持 `Xa transactions` 的数据库支持 `exactly-once` 。你可以设置 `is_exactly_once=true` 来启用它。 - [x] [cdc](../../introduction/concepts/connector-v2-features.md) ## Options | 名称 | 类型 | 是否必须 | 默认值 | |-------------------------------------------|---------|------|------------------------------| | url | String | 是 | - | | driver | String | 是 | - | | user | String | 否 | - | | password | String | 否 | - | | query | String | 否 | - | | compatible_mode | String | 否 | - | | dialect | String | 否 | - | | database | String | 否 | - | | table | String | 否 | - | | primary_keys | Array | 否 | - | | connection_check_timeout_sec | Int | 否 | 30 | | max_retries | Int | 否 | 0 | | batch_size | Int | 否 | 1000 | | is_exactly_once | Boolean | 否 | false | | generate_sink_sql | Boolean | 否 | false | | xa_data_source_class_name | String | 否 | - | | max_commit_attempts | Int | 否 | 3 | | transaction_timeout_sec | Int | 否 | -1 | | auto_commit | Boolean | 否 | true | | field_ide | String | 否 | - | | properties | Map | 否 | - | | common-options | | 否 | - | | schema_save_mode | Enum | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | | data_save_mode | Enum | 否 | APPEND_DATA | | custom_sql | String | 否 | - | | enable_upsert | Boolean | 否 | true | | 
use_copy_statement | Boolean | 否 | false | | access_key_id | String | 否 | | | secret_access_key | String | 否 | | | region | String | 否 | | ### driver [string] 用于连接远程数据源的 jdbc 类名,如果使用MySQL,则值为`com.mysql.cj.jdbc.Driver` ### user [string] 用户名 ### password [string] 密码 ### url [string] JDBC 连接的 URL。参考案例:`jdbc:postgresql://localhost/test` ### query [string] 使用 sql 语句将上游输入数据写入到数据库。如 `INSERT ...` ### compatible_mode [string] 数据库的兼容模式,当数据库支持多种兼容模式时需要。 例如,使用 OceanBase 数据库时,需要将其设置为 'mysql' 或 'oracle' 。使用StarRocks时,需要将其设置为`starrocks`。 Postgres 9.5及以下版本,请设置为 `postgresLow` 来支持 CDC ### dialect [string] 指定的方言,如果不存在,仍然按照url获取,优先级高于url。例如,当使用 starrocks 时,你需要将其值设置为 starrocks,同理,当使用mysql时,你需要将其值设置为mysql。 如果 SeaTunnel 不支持某种方言,它将使用默认方言 `GenericDialect`。请确保您提供的驱动程序支持您想要连接的数据库。 #### 示例可选 | | 方言名称 | | |-----------|------------|----------| | Greenplum | DB2 | Dameng | | Gbase8a | HIVE | KingBase | | MySQL | StarRocks | Oracle | | Phoenix | Postgres | Redshift | | SapHana | Snowflake | Sqlite | | SqlServer | Tablestore | Teradata | | Vertica | OceanBase | XUGU | | IRIS | Inceptor | Highgo | | DSQL | | | ### database [string] 使用此 `database` 和 `table-name` 自动生成 SQL,并接收上游输入的数据写入数据库。 此选项与 `query` 选项是互斥的,此选项具有更高的优先级。 ### table [string] 使用 `database` 和此 `table-name` 自动生成 SQL,并接收上游输入的数据写入数据库。 此选项与 `query` 选项是互斥的,此选项具有更高的优先级。 table参数可以填入一个任意的表名,这个名字最终会被用作创建表的表名,并且支持变量(`${table_name}`,`${schema_name}`)。 替换规则如下:`${schema_name}` 将替换传递给目标端的 SCHEMA 名称,`${table_name}` 将替换传递给目标端的表名。 mysql 接收器示例: 1. test_${schema_name}_${table_name}_test 2. sink_sinktable 3. ss_${table_name} pgsql (Oracle Sqlserver ...) 接收器示例: 1. ${schema_name}.${table_name}_test 2. dbo.tt_${table_name}_sink 3. 
public.sink_table Tip: 如果目标数据库有 SCHEMA 的概念,则表参数必须写成 `xxx.xxx` ### primary_keys [array] 该选项用于辅助生成 insert、delete、update 等 sql 语句。设置了该选项,将会根据该选项生成对应的 sql 语句 ### connection_check_timeout_sec [int] 用于验证数据库连接的有效性时等待数据库操作完成所需的时间,单位是秒 ### max_retries [int] 重试提交失败的最大次数(executeBatch) ### batch_size [int] 对于批量写入,当缓冲的记录数达到 `batch_size` 数量或者时间达到 `checkpoint.interval` 时,数据将被刷新到数据库中 ### is_exactly_once [boolean] 是否启用通过XA事务实现的精确一次语义。开启,你还需要设置 `xa_data_source_class_name` ### generate_sink_sql [boolean] 根据要写入的数据库表结构生成 sql 语句 ### xa_data_source_class_name [string] 指数据库驱动的 XA 数据源的类名。以 MySQL 为例,其类名为 com.mysql.cj.jdbc.MysqlXADataSource。了解其他数据库的数据源类名,可以参考文档的附录部分 ### max_commit_attempts [int] 事务提交失败的最大重试次数 ### transaction_timeout_sec [int] 在事务开启后的超时时间,默认值为-1(即永不超时)。请注意,设置超时时间可能会影响到精确一次(exactly-once)的语义 ### auto_commit [boolean] 默认启用自动事务提交 ### field_ide [String] 字段 `field_ide` 用于在从 source 同步到 sink 时,确定字段是否需要转换为大写或小写。'ORIGINAL' 表示不需要转换,'UPPERCASE' 表示转换为大写,'LOWERCASE' 表示转换为小写 ### properties 附加连接配置参数,当属性和URL具有相同参数时,优先级由驱动程序的具体实现确定。例如,在 MySQL 中,属性配置优先于 URL。 ### common options Sink插件常用参数,请参考 [Sink常用选项](../common-options/sink-common-options.md) 了解详情 ### schema_save_mode [Enum] 在启动同步任务之前,针对目标侧已有的表结构选择不同的处理方案
    选项介绍:
    `RECREATE_SCHEMA`:当表不存在时会创建,当表已存在时会删除并重建
    `CREATE_SCHEMA_WHEN_NOT_EXIST`:当表不存在时会创建,当表已存在时则跳过创建
    `ERROR_WHEN_SCHEMA_NOT_EXIST`:当表不存在时将抛出错误
    `IGNORE` :忽略对表的处理
    ### data_save_mode [Enum] 在启动同步任务之前,针对目标侧已存在的数据选择不同的处理方案
    选项介绍:
    `DROP_DATA`:保留数据库结构,删除数据
    `APPEND_DATA`:保留数据库结构,保留数据
    `CUSTOM_PROCESSING`:允许用户自定义数据处理方式
    `ERROR_WHEN_DATA_EXISTS`:当有数据时抛出错误
    ### custom_sql [String] 当`data_save_mode`选择`CUSTOM_PROCESSING`时,需要填写`custom_sql`参数。该参数通常填写一条可以执行的SQL。SQL将在同步任务之前执行 ### enable_upsert [boolean] 启用通过主键更新插入,如果任务中不存在主键重复的数据,设置该参数为 false 可以加快数据导入速度 ### use_copy_statement [boolean] 使用 `COPY ${table} FROM STDIN` 语句导入数据。仅支持具有 `getCopyAPI()` 方法连接的驱动程序。例如:Postgresql 驱动程序 `org.postgresql.Driver` 注意:不支持 `MAP`、`ARRAY`、`ROW`类型 ### access_key_id [String] AWS IAM 认证中所需要的 access_key_id。 该参数仅适用于 dialect="dsql" ### secret_access_key [String] AWS IAM 认证中所需要的 secret_access_key。 该参数仅适用于 dialect="dsql" ### region [String] Amazon Aurora DSQL 所在的区域。 该参数仅适用于 dialect="dsql" ## tips 在 is_exactly_once = "true" 的情况下,使用 XA 事务。这需要数据库支持,有些数据库需要一些设置:
    1 postgres 需要设置 `max_prepared_transactions > 1` 例如 `ALTER SYSTEM set max_prepared_transactions to 10`
    2 mysql 版本需要 >= `8.0.29`,并且非 root 用户需要被授予 `XA_RECOVER_ADMIN` 权限(该权限为动态权限,只能在全局级别授予)。例如:`GRANT XA_RECOVER_ADMIN ON *.* TO 'user1'@'%'`
    3 mysql可以尝试在url中添加 `rewriteBatchedStatements=true` 参数以获得更好的性能
    ## 附录 附录参数仅提供参考 | 数据源 | driver | url | xa_data_source_class_name | maven | |------------|----------------------------------------------|--------------------------------------------------------------------|----------------------------------------------------|----------------------------------------------------------------------------------------------------| | MySQL | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | com.mysql.cj.jdbc.MysqlXADataSource | https://mvnrepository.com/artifact/mysql/mysql-connector-java | | PostgreSQL | org.postgresql.Driver | jdbc:postgresql://localhost:5432/postgres | org.postgresql.xa.PGXADataSource | https://mvnrepository.com/artifact/org.postgresql/postgresql | | DM | dm.jdbc.driver.DmDriver | jdbc:dm://localhost:5236 | dm.jdbc.driver.DmdbXADataSource | https://mvnrepository.com/artifact/com.dameng/DmJdbcDriver18 | | Phoenix | org.apache.phoenix.queryserver.client.Driver | jdbc:phoenix:thin:url=http://localhost:8765;serialization=PROTOBUF | / | https://mvnrepository.com/artifact/com.aliyun.phoenix/ali-phoenix-shaded-thin-client | | SQL Server | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433 | com.microsoft.sqlserver.jdbc.SQLServerXADataSource | https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc | | Oracle | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@localhost:1521/xepdb1 | oracle.jdbc.xa.OracleXADataSource | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | | sqlite | org.sqlite.JDBC | jdbc:sqlite:test.db | / | https://mvnrepository.com/artifact/org.xerial/sqlite-jdbc | | GBase8a | com.gbase.jdbc.Driver | jdbc:gbase://e2e_gbase8aDb:5258/test | / | https://cdn.gbase.cn/products/30/p5CiVwXBKQYIUGN8ecHvk/gbase-connector-java-9.5.0.7-build1-bin.jar | | StarRocks | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | / | https://mvnrepository.com/artifact/mysql/mysql-connector-java | | db2 | com.ibm.db2.jcc.DB2Driver | 
jdbc:db2://localhost:50000/testdb | com.ibm.db2.jcc.DB2XADataSource | https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc/db2jcc4 | | saphana | com.sap.db.jdbc.Driver | jdbc:sap://localhost:39015 | / | https://mvnrepository.com/artifact/com.sap.cloud.db.jdbc/ngdbc | | Doris | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | / | https://mvnrepository.com/artifact/mysql/mysql-connector-java | | teradata | com.teradata.jdbc.TeraDriver | jdbc:teradata://localhost/DBS_PORT=1025,DATABASE=test | / | https://mvnrepository.com/artifact/com.teradata.jdbc/terajdbc | | Redshift | com.amazon.redshift.jdbc42.Driver | jdbc:redshift://localhost:5439/testdb | com.amazon.redshift.xa.RedshiftXADataSource | https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42 | | Snowflake | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | / | https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc | | Vertica | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433 | / | https://repo1.maven.org/maven2/com/vertica/jdbc/vertica-jdbc/12.0.3-0/vertica-jdbc-12.0.3-0.jar | | Kingbase | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | / | https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar | | OceanBase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | / | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.12/oceanbase-client-2.4.12.jar | | opengauss | org.opengauss.Driver | jdbc:opengauss://localhost:5432/postgres | / | https://repo1.maven.org/maven2/org/opengauss/opengauss-jdbc/5.1.0-og/opengauss-jdbc-5.1.0-og.jar | | Highgo | com.highgo.jdbc.Driver | jdbc:highgo://localhost:5866/highgo | / | https://repo1.maven.org/maven2/com/highgo/HgdbJdbc/6.2.3/HgdbJdbc-6.2.3.jar | | Dsql | org.postgresql.Driver | jdbc:postgresql://Amazon Aurora DSQL Cluster Endpoint:5432/postgres | org.postgresql.xa.PGXADataSource | 
https://mvnrepository.com/artifact/org.postgresql/postgresql | ## 示例 简单示例 ``` jdbc { url = "jdbc:mysql://localhost:3306/test" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" } ``` 精确一次 (Exactly-once) 通过设置 `is_exactly_once` 开启精确一次语义 ``` jdbc { url = "jdbc:mysql://localhost:3306/test" driver = "com.mysql.cj.jdbc.Driver" max_retries = 0 user = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" is_exactly_once = "true" xa_data_source_class_name = "com.mysql.cj.jdbc.MysqlXADataSource" } ``` 变更数据捕获 (Change data capture) 事件 jdbc 接收 CDC 示例 ``` sink { jdbc { url = "jdbc:mysql://localhost:3306" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "123456" database = "sink_database" table = "sink_table" primary_keys = ["key1", "key2", ...] } } ``` 配置表生成策略 通过设置 `schema_save_mode` 配置为 `CREATE_SCHEMA_WHEN_NOT_EXIST` 来支持不存在表时创建表 ``` sink { jdbc { url = "jdbc:mysql://localhost:3306" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "123456" database = "sink_database" table = "sink_table" primary_keys = ["key1", "key2", ...] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` 支持Postgres 9.5及以下版本的 CDC 示例 Postgres 9.5及以下版本,通过设置 `compatible_mode` 配置为 `postgresLow` 来支持 Postgres CDC 操作 ``` sink { jdbc { url = "jdbc:postgresql://localhost:5432" driver = "org.postgresql.Driver" user = "root" password = "123456" compatible_mode="postgresLow" database = "sink_database" table = "sink_table" generate_sink_sql = true primary_keys = ["key1", "key2", ...] 
} } ``` #### Dsql 示例 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = oracle.jdbc.driver.OracleDriver url = "jdbc:oracle:thin:@localhost:1521/XE" user = testUser password = testPassword table_list = [ { table_path = "TESTSCHEMA.TABLE_1" }, { table_path = "TESTSCHEMA.TABLE_2" } ] } } transform { } sink { Jdbc { dialect="Dsql" driver = "org.postgresql.Driver" url="jdbc:postgresql://ixxxxxxxxxxxxx.dsql.us-east-1.on.aws:5432/postgres" username = "admin" access_key_id = "ACCESSKEYIDEXAMPLE" secret_access_key = "SECRETACCESSKEYEXAMPLE" region = "us-east-1" database = "postgres" generate_sink_sql = true primary_keys = ["id"] max_retries = 3 batch_size = 1000 } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Kafka.md ================================================ import ChangeLog from '../changelog/connector-kafka.md'; # Kafka > Kafka 数据接收器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) > 默认情况下,我们将使用 2pc 来保证消息只发送一次到kafka ## 描述 将 Rows 内容发送到 Kafka topic ## 支持的数据源信息 为了使用 Kafka 连接器,需要以下依赖项 可以通过 install-plugin.sh 或从 Maven 中央存储库下载 | 数据源 | 支持版本 | Maven | |-------|------|-------------------------------------------------------------------------------| | Kafka | 通用 | [下载](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-kafka) | ## 接收器选项 | 名称 | 类型 | 是否需要 | 默认值 | 描述 | |----------------------|--------|------|------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | topic | String | 是 | - | 当表用作接收器时,topic 名称是要写入数据的 topic | | bootstrap.servers | String | 是 | - | Kafka brokers 使用逗号分隔 | | kafka.config | Map | 否 | - | 除了上述 Kafka Producer 客户端必须指定的参数外,用户还可以为 Producer 客户端指定多个非强制参数,涵盖 [Kafka官方文档中指定的所有生产者参数](https://kafka.apache.org/documentation.html#producerconfigs) | | semantics | String | 否 | NON | 可以选择的语义是 EXACTLY_ONCE/AT_LEAST_ONCE/NON,默认 NON。 | | partition_key_fields | Array | 否 | - | 配置字段用作 kafka 消息的key | | kafka_headers_fields | Array | 否 | - | 配置字段用作 kafka 消息的headers。字段值将被转换为字符串并用作 header 值 | | partition | Int | 否 | - | 可以指定分区,所有消息都会发送到此分区 | | assign_partitions | Array | 否 | - | 可以根据消息的内容决定发送哪个分区,该参数的作用是分发信息 | | transaction_prefix | String | 否 | - | 如果语义指定为EXACTLY_ONCE,生产者将把所有消息写入一个 Kafka 事务中,kafka 通过不同的 transactionId 来区分不同的事务。该参数是kafka transactionId的前缀,确保不同的作业使用不同的前缀 | | format | String | 否 | json | 数据格式。默认格式是json。可选文本格式,canal-json、debezium-json 、 avro 、 protobuf 和native。如果使用 json 或文本格式。默认字段分隔符是`,`。如果自定义分隔符,请添加`field_delimiter`选项。如果使用canal格式,请参考[canal-json](../formats/canal-json.md)。如果使用debezium格式,请参阅 [debezium-json](../formats/debezium-json.md) 了解详细信息 | | field_delimiter | String | 否 | , | 
自定义数据格式的字段分隔符 | | common-options | | 否 | - | Sink插件常用参数,请参考 [Sink常用选项 ](../common-options/sink-common-options.md) 了解详情 | |protobuf_message_name|String|否|-| format配置为protobuf时生效,取Message名称 | |protobuf_schema|String|否|-| format配置为protobuf时生效取Schema名称 | ## 参数解释 ### Topic 格式 目前支持两种格式: 1. 填写topic名称 2. 使用上游数据中的字段值作为 topic ,格式是 `${your field name}`, 其中 topic 是上游数据的其中一列的值 例如,上游数据如下: | name | age | data | |------|-----|---------------| | Jack | 16 | data-example1 | | Mary | 23 | data-example2 | 如果 `${name}` 设置为 topic。因此,第一行发送到 Jack topic,第二行发送到 Mary topic。 ### 语义 在 EXACTLY_ONCE 中,生产者将在 Kafka 事务中写入所有消息,这些消息将在检查点上提交给 Kafka,该模式下能保证数据精确写入kafka一次,即使任务失败重试也不会出现数据重复和丢失 在 AT_LEAST_ONCE 中,生产者将等待 Kafka 缓冲区中所有未完成的消息在检查点上被 Kafka 生产者确认,该模式下能保证数据至少写入kafka一次,即使任务失败 NON 不提供任何保证:如果 Kafka 代理出现问题,消息可能会丢失,并且消息可能会重复,该模式下,任务失败重试可能会产生数据丢失或重复。 ### 分区关键字段 例如,如果你想使用上游数据中的字段值作为键,可以将这些字段名指定给此属性 上游数据如下所示: | name | age | data | |------|-----|---------------| | Jack | 16 | data-example1 | | Mary | 23 | data-example2 | 如果将 name 设置为 key,那么 name 列的哈希值将决定消息发送到哪个分区。 如果没有设置分区键字段,则将发送空消息键。 消息 key 的格式为 json,如果设置 name 为 key,例如 `{"name":"Jack"}`。 所选的字段必须是上游数据中已存在的字段。 ### Kafka Headers 字段 例如,如果你想使用上游数据中的字段值作为 kafka 消息的 headers,可以将这些字段名指定给此属性。 上游数据如下所示: | name | age | data | source | traceId | |------|-----|---------------|--------|-----------| | Jack | 16 | data-example1 | web | trace-123 | | Mary | 23 | data-example2 | mobile | trace-456 | 如果将 source 和 traceId 设置为 kafka headers 字段,那么这些字段值将作为 headers 添加到 kafka 消息中。 例如,第一行将具有 headers:`source=web` 和 `traceId=trace-123`。 字段值将被转换为字符串并用作 header 值。 所选的字段必须是上游数据中已存在的字段。 注意: 配置为 Kafka headers 的字段将不会包含在消息的 value(payload)中,而只会存在于 Kafka 消息的 headers 中。 ### 分区分配 假设总有五个分区,配置中的 assign_partitions 字段设置为: assign_partitions = ["shoe", "clothing"] 在这种情况下,包含 "shoe" 的消息将被发送到第零个分区,因为 "shoe" 在 assign_partitions 中被标记为零, 而包含 "clothing" 的消息将被发送到第一个分区。 对于其他的消息,我们将使用哈希算法将它们均匀地分配到剩余的分区中。 这个功能是通过 MessageContentPartitioner 类实现的,该类实现了 org.apache.kafka.clients.producer.Partitioner 
接口。如果我们需要自定义分区,我们需要实现这个接口。 ## 任务示例 ### 简单 > 此示例展示了如何定义一个 SeaTunnel 同步任务,该任务能够通过 FakeSource 自动产生数据并将其发送到 Kafka Sink。在这个例子中,FakeSource 会生成总共 16 行数据(`row.num=16`),每一行都包含两个字段,即 `name`(字符串类型)和 `age`(整型)。最终,这些数据将被发送到名为 test_topic 的 topic 中,因此该 topic 也将包含 16 行数据。 > 如果你还未安装和部署 SeaTunnel,你需要参照 [安装SeaTunnel](../../getting-started/locally/deployment.md) 的指南来进行安装和部署。完成安装和部署后,你可以按照 [快速开始使用 SeaTunnel 引擎](../../getting-started/locally/quick-start-seatunnel-engine.md) 的指南来运行任务。 ```hocon # Defining the runtime environment env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } sink { kafka { topic = "test_topic" bootstrap.servers = "localhost:9092" format = json kafka.request.timeout.ms = 60000 semantics = EXACTLY_ONCE kafka.config = { acks = "all" request.timeout.ms = 60000 buffer.memory = 33554432 } } } ``` ### AWS MSK SASL/SCRAM 将以下 `${username}` 和 `${password}` 替换为 AWS MSK 中的配置值。 ```hocon sink { kafka { topic = "seatunnel" bootstrap.servers = "localhost:9092" format = json kafka.request.timeout.ms = 60000 semantics = EXACTLY_ONCE kafka.config = { security.protocol=SASL_SSL sasl.mechanism=SCRAM-SHA-512 sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required \nusername=${username}\npassword=${password};" } } } ``` ### AWS MSK IAM 从 https://github.com/aws/aws-msk-iam-auth/releases 下载 `aws-msk-iam-auth-1.1.5.jar` 并将其放入 `$SEATUNNEL_HOME/plugin/kafka/lib` 中目录。 请确保 IAM 策略具有 `kafka-cluster:Connect` 如下配置: ```hocon "Effect": "Allow", "Action": [ "kafka-cluster:Connect", "kafka-cluster:AlterCluster", "kafka-cluster:DescribeCluster" ], ``` 接收器配置 ```hocon sink { kafka { topic = "seatunnel" bootstrap.servers = "localhost:9092" format = json kafka.request.timeout.ms = 60000 semantics = EXACTLY_ONCE kafka.config = { security.protocol=SASL_SSL sasl.mechanism=AWS_MSK_IAM sasl.jaas.config="software.amazon.msk.auth.iam.IAMLoginModule required;" 
sasl.client.callback.handler.class="software.amazon.msk.auth.iam.IAMClientCallbackHandler" } } } ``` ### Kerberos 认证示例 请在启动 SeaTunnel 之前设置 JVM 参数 `java.security.krb5.conf` 或更新 `/etc/krb5.conf` 中的默认 `krb5.conf`。 源配置示例: ```hocon source { Kafka { topic = "seatunnel" bootstrap.servers = "localhost:9092" format = json kafka.request.timeout.ms = 60000 semantics = EXACTLY_ONCE kafka.config = { security.protocol = SASL_PLAINTEXT sasl.kerberos.service.name = kafka sasl.mechanism = GSSAPI sasl.jaas.config = "com.sun.security.auth.module.Krb5LoginModule required \n useKeyTab=true \n storeKey=true \n keyTab=\"/path/to/xxx.keytab\" \n principal=\"user@xxx.com\";" } } } ``` ### Protobuf配置 `format` 设置为 `protobuf`,配置`protobuf`数据结构,`protobuf_message_name`和`protobuf_schema`参数 使用样例: ```hocon sink { kafka { topic = "test_protobuf_topic_fake_source" bootstrap.servers = "kafkaCluster:9092" format = protobuf kafka.request.timeout.ms = 60000 kafka.config = { acks = "all" request.timeout.ms = 60000 buffer.memory = 33554432 } protobuf_message_name = Person protobuf_schema = """ syntax = "proto3"; package org.apache.seatunnel.format.protobuf; option java_outer_classname = "ProtobufE2E"; message Person { int32 c_int32 = 1; int64 c_int64 = 2; float c_float = 3; double c_double = 4; bool c_bool = 5; string c_string = 6; bytes c_bytes = 7; message Address { string street = 1; string city = 2; string state = 3; string zip = 4; } Address address = 8; map attributes = 9; repeated string phone_numbers = 10; } """ } } ``` ### format 如果需要写入Kafka原生的信息,可以参考下面的配置。 配置示例: ```hocon sink { kafka { topic = "test_topic_native_sink" bootstrap.servers = "kafkaCluster:9092" format = "NATIVE" } } ``` 输入参数要求如下: ```json { "headers": { "header1": "header1", "header2": "header2" }, "key": "dGVzdF9ieXRlc19kYXRh", "partition": 3, "timestamp": 1672531200000, "timestampType": "CREATE_TIME", "value": "dGVzdF9ieXRlc19kYXRh" } ``` Note:key/value 需要 byte[]类型. 
## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Kingbase.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Kingbase > JDBC Kingbase Sink 连接器 ## 支持连接器版本 - 8.6 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) ## 描述 > 使用 `Xa transactions` 来确保 `精确一次`。因此仅支持支持 `Xa transactions` 的数据库的 `精确一次`。您可以设置 `is_exactly_once=true` 来启用它。Kingbase 目前不支持 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动 | URL | Maven | |--------|-----------|----------------------|------------------------------------------|------------------------------------------------------------------------------------------------| | Kingbase | 8.6 | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | [Download](https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar) | ## 数据库依赖 > 请下载对应 'Maven' 的支持列表,并将其复制到 '$SEATUNNEL_HOME/plugins/jdbc/lib/' > 工作目录
    > 例如:cp kingbase8-8.6.0.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## 数据类型映射 | Kingbase 数据类型 | SeaTunnel 数据类型 | |----------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------| | BOOL | BOOLEAN | | INT2 | SHORT | | SMALLSERIAL
    SERIAL
    INT4 | INT | | INT8
    BIGSERIAL | BIGINT | | FLOAT4 | FLOAT | | FLOAT8 | DOUBLE | | NUMERIC | DECIMAL((获取指定列的指定列大小),
    (获取指定列小数点右边的位数)) | | BPCHAR
    CHARACTER
    VARCHAR
    TEXT | STRING | | TIMESTAMP | LOCALDATETIME | | TIME | LOCALTIME | | DATE | LOCALDATE | | 其他数据类型 | 暂不支持 | ## Sink 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |-------------------------------------------|---------|------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC 连接的 URL。参考示例:jdbc:kingbase8://localhost:54321/db_test | | driver | String | 是 | - | 用于连接到远程数据源的 jdbc 类名,
    如果使用 Kingbase,则值为 `com.kingbase8.Driver`。 | | username | String | 否 | - | 连接实例用户名 | | password | String | 否 | - | 连接实例密码 | | query | String | 否 | - | 使用此 sql 将上游输入数据写入数据库。例如 `INSERT ...`,`query` 具有更高的优先级 | | database | String | 否 | - | 使用此 `database` 和 `table-name` 自动生成 sql 并接收上游输入数据写入数据库。
    此选项与 `query` 互斥,具有更高的优先级。 | | table | String | 否 | - | 使用数据库和此 table-name 自动生成 sql 并接收上游输入数据写入数据库。
    此选项与 `query` 互斥,具有更高的优先级。 | | primary_keys | Array | 否 | - | 此选项用于在自动生成 sql 时支持 `insert`、`delete` 和 `update` 等操作。 | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(秒)。 | | max_retries | Int | 否 | 0 | 提交失败的重试次数 (executeBatch) | | batch_size | Int | 否 | 1000 | 对于批量写入,当缓冲记录数达到 `batch_size` 数量或时间达到 `checkpoint.interval` 时
    ,数据将被刷新到数据库 | | is_exactly_once | Boolean | 否 | false | 是否启用精确一次语义,这将使用 Xa 事务。如果启用,您需要
    设置 `xa_data_source_class_name`。Kingbase 目前不支持 | | generate_sink_sql | Boolean | 否 | false | 根据您要写入的数据库表生成 sql 语句 | | xa_data_source_class_name | String | 否 | - | 数据库驱动程序的 xa 数据源类名,Kingbase 目前不支持 | | max_commit_attempts | Int | 否 | 3 | 事务提交失败的重试次数 | | transaction_timeout_sec | Int | 否 | -1 | 事务打开后的超时时间,默认为 -1(永不超时)。请注意,设置超时可能会影响
    精确一次语义 | | auto_commit | Boolean | 否 | true | 默认启用自动事务提交 | | common-options | | 否 | - | Sink 插件通用参数,请参考 [Sink 通用选项](../common-options/sink-common-options.md) 详见 | | enable_upsert | Boolean | 否 | true | 如果存在 primary_keys,启用 upsert。如果任务没有重复数据,将此参数设置为 `false` 可以加快数据导入 | ### 提示 > 如果未设置 partition_column,它将以单并发运行,如果设置了 partition_column,它将根据任务的并发性并行执行。 ## 任务示例 ### 简单 > 此示例定义了一个 SeaTunnel 同步任务,通过 FakeSource 自动生成数据并将其发送到 JDBC Sink。FakeSource 生成总共 16 行数据 (row.num=16),每行有 12 个字段。最终目标表 test_table 也将有 16 行数据。 > 在运行此作业之前,您需要在 Kingbase 中创建数据库 test 和表 test_table。如果您还没有安装和部署 SeaTunnel,您需要按照 [安装 SeaTunnel](../../getting-started/locally/deployment.md) 中的说明进行安装和部署。然后按照 [使用 SeaTunnel 引擎快速开始](../../getting-started/locally/quick-start-seatunnel-engine.md) 中的说明运行此作业。 ``` # 定义运行时环境 env { parallelism = 1 job.mode = "BATCH" } source { # 这是一个示例源插件 **仅用于测试和演示源插件功能** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_date = date c_time = time c_timestamp = timestamp } } } # 如果您想了解更多关于如何配置 seatunnel 和查看源插件的完整列表, # 请访问 https://seatunnel.apache.org/docs/connectors/source } transform { # 如果您想了解更多关于如何配置 seatunnel 和查看转换插件的完整列表, # 请访问 https://seatunnel.apache.org/docs/transforms } sink { jdbc { url = "jdbc:kingbase8://127.0.0.1:54321/dbname" driver = "com.kingbase8.Driver" username = "root" password = "123456" query = "insert into test_table(c_string,c_boolean,c_tinyint,c_smallint,c_int,c_bigint,c_float,c_double,c_decimal,c_date,c_time,c_timestamp) values(?,?,?,?,?,?,?,?,?,?,?,?)" } # 如果您想了解更多关于如何配置 seatunnel 和查看 sink 插件的完整列表, # 请访问 https://seatunnel.apache.org/docs/connectors/sink } ``` ### 生成 Sink SQL > 此示例不需要编写复杂的 sql 语句,您可以配置数据库名称表名称来自动为您生成添加语句 ``` sink { jdbc { url = "jdbc:kingbase8://127.0.0.1:54321/dbname" driver = "com.kingbase8.Driver" username = "root" password = "123456" # 
根据数据库表名自动生成 sql 语句 generate_sink_sql = true database = test table = test_table } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Kudu.md ================================================ import ChangeLog from '../changelog/connector-kudu.md'; # Kudu > Kudu数据接收器 ## 支持Kudu版本 - 1.11.1/1.12.0/1.13.0/1.14.0/1.15.0 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## 数据类型映射 | SeaTunnel 数据类型 | Kudu 数据类型 | |---------------------|--------------------------| | BOOLEAN | BOOL | | INT | INT8
    INT16
    INT32 | | BIGINT | INT64 | | DECIMAL | DECIMAL | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | STRING | STRING | | TIMESTAMP | UNIXTIME_MICROS | | BYTES | BINARY | ## Sink 选项 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |-------------------------------------------|--------|----------|------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------| | kudu_masters | String | 是 | - | Kudu主地址。用“,”分隔,例如“192.168.88.110:7051”。 | | table_name | String | 是 | - | Kudu表的名字。 | | client_worker_count | Int | 否 | 2 * Runtime.getRuntime().availableProcessors() | Kudu工人数。默认值是当前cpu核数的两倍。 | | client_default_operation_timeout_ms | Long | 否 | 30000 | Kudu正常运行超时。 | | client_default_admin_operation_timeout_ms | Long | 否 | 30000 | Kudu管理员操作超时。 | | enable_kerberos | Bool | 否 | false | 启用Kerberos主体。 | | kerberos_principal | String | 否 | - | Kerberos主体。请注意,所有zeta节点都需要此文件。 | | kerberos_keytab | String | 否 | - | Kerberos密钥表。请注意,所有zeta节点都需要此文件。 | | kerberos_krb5conf | String | 否 | - | Kerberos krb5 conf.请注意,所有zeta节点都需要此文件。 | | save_mode | String | 否 | - | 存储模式,支持 `overwrite` 和 `append`. 
| | session_flush_mode | String | 否 | AUTO_FLUSH_SYNC | Kudu刷新模式。默认AUTO_FLUSH_SYNC。 | | batch_size | Int | 否 | 1024 | 超过此记录数的刷新最大大小(包括所有追加、追加和删除记录)将刷新数据。默认值为100 | | buffer_flush_interval | Int | 否 | 10000 | 刷新间隔期间,异步线程将刷新数据。 | | ignore_not_found | Bool | 否 | false | 如果为true,则忽略所有未找到的行。 | | ignore_not_duplicate | Bool | 否 | false | 如果为true,则忽略所有dulicate行。 | | common-options | | 否 | - |源插件常用参数,详见[Source common Options](../sink common-Options.md)。 | ## 任务示例 ### 简单示例 > 以下示例引用了FakeSource kudu写入表kudu_sink_table ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { plugin_output = "kudu" schema = { fields { id = int val_bool = boolean val_int8 = tinyint val_int16 = smallint val_int32 = int val_int64 = bigint val_float = float val_double = double val_decimal = "decimal(16, 1)" val_string = string val_unixtime_micros = timestamp } } rows = [ { kind = INSERT fields = [1, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] }, { kind = INSERT fields = [2, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] }, { kind = INSERT fields = [3, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] }, { kind = UPDATE_BEFORE fields = [1, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] }, { kind = UPDATE_AFTER fields = [1, true, 2, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] }, { kind = DELETE fields = [2, true, 1, 2, 3, 4, 4.3,5.3,6.3, "NEW", "2020-02-02T02:02:02"] } ] } } sink { kudu{ plugin_input = "kudu" kudu_masters = "kudu-master-cdc:7051" table_name = "kudu_sink_table" enable_kerberos = true kerberos_principal = "xx@xx.COM" kerberos_keytab = "xx.keytab" } } ``` ### 多表 #### 示例1 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { kudu{ kudu_masters = "kudu-master-cdc:7051" table_name = 
"${database_name}_${table_name}_test" } } ``` #### 示例2 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = oracle.jdbc.driver.OracleDriver url = "jdbc:oracle:thin:@localhost:1521/XE" user = testUser password = testPassword table_list = [ { table_path = "TESTSCHEMA.TABLE_1" }, { table_path = "TESTSCHEMA.TABLE_2" } ] } } transform { } sink { kudu{ kudu_masters = "kudu-master-cdc:7051" table_name = "${schema_name}_${table_name}_test" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Lance.md ================================================ import ChangeLog from '../changelog/connector-lance.md'; # Lance > Lance sink 连接器 ## 支持的引擎 > Spark(不支持 Spark 3.4 以下版本,参考 https://lance.org/integrations/spark/install/#scala)
    > Flink(暂不支持,参考 https://github.com/lance-format/lance-flink)
    > SeaTunnel Zeta
    ## 描述 Lance 格式的 Sink 连接器。支持创建和写入数据集、Lance 命名空间管理 schema 和版本。 ## 主要特性 - [] [精确一次语义](../../introduction/concepts/connector-v2-features.md) ## 依赖 com.lancedb lance-core 0.33.0 com.lancedb lance-namespace-core 0.0.14 ## Sink 配置项 | Name | Type | Required | Default | Description | |-----------------|--------|----------|---------|---------------------------------------------------------| | dataset_path | string | yes | /tmp | Lance sink 连接的数据集路径 . | | namespace_type | string | yes | dir | Lance 数据集的命名空间类型,目前仅支持 DirectoryNamespace,类型默认为 "dir" | | table | string | yes | test | Lance 数据集的名称,如果未设置,数据集名称默认为 test | | namespace_id | string | no | - | Lance 命名空间的 ID。请参考 https://lance.org/format/namespace/ | ## 数据类型映射 Lance 的数据类型依赖于 Arrow 数据类型系统 | Seatunnel数据类型 | Lance 数据类型 | |---------------|--------------| | BOOLEAN | bool/boolean | | TINYINT | int8 | | SMALLINT | int16 | | INT | int32 | | BIGINT | int64 | | FLOAT | float16 | | DOUBLE | float32 | | BYTES | binary | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | STRING | string/utf8 | ## 任务示例 ### 简单示例 ```hocon env { parallelism = 1 job.mode = "BATCH" # 可以在这里设置 Spark 配置 spark.app.name = "SeaTunnel" spark.executor.instances = 2 spark.executor.cores = 1 spark.executor.memory = "1g" spark.master = local } source { FakeSource { row.num = 100 schema = { fields { c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_bytes = bytes c_date = date c_timestamp = timestamp } } plugin_output = "fake" } } transform { } sink { Lance { dataset_path = "/tmp/seatunnel_mnt/lanceTest/lance_sink_table" namespace_type = "dir" namespace_id = "root" table = "lance_sink_table" } } ``` ## 更新日志 ================================================ FILE: docs/zh/connectors/sink/LocalFile.md ================================================ import ChangeLog from '../changelog/connector-file-local.md'; # LocalFile > 本地文件接收器 ## 描述 
将数据输出到本地文件。 :::tip 提示 如果你使用的是 spark/flink,为了使用此连接器,你必须确保你的 spark/flink 集群已集成 hadoop。已测试的 hadoop 版本是 2.x。 如果你使用 SeaTunnel Engine,它会在下载和安装 SeaTunnel Engine 时自动集成 hadoop jar。你可以在 ${SEATUNNEL_HOME}/lib 下检查 jar 包以确认这一点。 ::: ## 主要特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 默认情况下,我们使用 2PC 提交以确保`精确一次`。 - [x] 文件格式类型 - [x] 文本 - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] 二进制 - [x] canal_json - [x] debezium_json - [x] maxwell_json ## 选项 | 名称 | 类型 | 是否必需 | 默认值 | 描述 | |---------------------------------------|---------|------|--------------------------------------------|-----------------------------------------------------------------| | path | string | 是 | - | 目标目录路径 | | tmp_path | string | 否 | /tmp/seatunnel | 结果文件将首先写入临时路径,然后使用 `mv` 将临时目录提交到目标目录。 | | custom_filename | boolean | 否 | false | 是否需要自定义文件名 | | file_name_expression | string | 否 | "${transactionId}" | 仅在 custom_filename 为 true 时使用 | | filename_time_format | string | 否 | "yyyy.MM.dd" | 仅在 custom_filename 为 true 时使用 | | file_format_type | string | 否 | "csv" | 文件格式类型 | | filename_extension | string | 否 | - | 使用自定义的文件扩展名覆盖默认的文件扩展名。 例如:`.xml`, `.json`, `dat`, `.customtype` | | field_delimiter | string | 否 | '\001' | 仅在 file_format_type 为 text 时使用 | | row_delimiter | string | 否 | "\n" | 仅在 file_format_type 为 `text`、`csv`、`json` 时使用 | | have_partition | boolean | 否 | false | 是否需要处理分区 | | partition_by | array | 否 | - | 仅在 have_partition 为 true 时使用 | | partition_dir_expression | string | 否 | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | 仅在 have_partition 为 true 时使用 | | is_partition_field_write_in_file | boolean | 否 | false | 仅在 have_partition 为 true 时使用 | | sink_columns | array | 否 | | 当此参数为空时,所有字段都是 sink 列 | | is_enable_transaction | boolean | 否 | true | 是否启用事务 | | batch_size | int | 否 | 1000000 | 批量大小 | | single_file_mode | boolean | 否 | false 
| 每个并行度只会输出一个文件,当此参数开启时,batch_size就不会生效。输出的文件名没有文件块后缀。 | | create_empty_file_when_no_data | boolean | 否 | false | 当上游没有数据同步时,依然生成对应的数据文件。 | | compress_codec | string | 否 | none | 压缩编码 | | common-options | object | 否 | - | 常见选项 | | max_rows_in_memory | int | 否 | - | 仅在 file_format_type 为 excel 时使用 | | sheet_name | string | 否 | Sheet${随机数} | 仅在 file_format_type 为 excel 时使用 | | csv_string_quote_mode | enum | 否 | MINIMAL | 仅在文件格式为 CSV 时使用。 | | xml_root_tag | string | 否 | RECORDS | 仅在 file_format 为 xml 时使用 | | xml_row_tag | string | 否 | RECORD | 仅在 file_format 为 xml 时使用 | | xml_use_attr_format | boolean | 否 | - | 仅在 file_format 为 xml 时使用 | | parquet_avro_write_timestamp_as_int96 | boolean | 否 | false | 仅在 file_format 为 parquet 时使用 | | parquet_avro_write_fixed_as_int96 | array | 否 | - | 仅在 file_format 为 parquet 时使用 | | enable_header_write | boolean | 否 | false | 仅在 file_format_type 为 text,csv 时使用。
    false:不写入表头,true:写入表头。 | | encoding | string | 否 | "UTF-8" | 仅在 file_format_type 为 json,text,csv,xml 时使用 | | schema_save_mode | string | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | 现有目录处理方式 | | data_save_mode | string | 否 | APPEND_DATA | 现有数据处理方式 | | merge_update_event | boolean | 否 | false | 仅当file_format_type为canal_json、debezium_json、maxwell_json. | ### path [string] 目标目录路径是必需的,你可以通过使用 `${database_name}`、`${table_name}` 和 `${schema_name}` 将上游的 CatalogTable 注入到路径中。 ### custom_filename [boolean] 是否自定义文件名 ### file_name_expression [string] 仅在 `custom_filename` 为 `true` 时使用 `file_name_expression` 描述将创建到 `path` 中的文件表达式。我们可以在 `file_name_expression` 中添加变量 `${now}` 或 `${uuid}`,例如 `test_${uuid}_${now}`,`${now}` 表示当前时间,其格式可以通过指定 `filename_time_format` 选项来定义。 请注意,如果 `is_enable_transaction` 为 `true`,我们将自动在文件名的头部添加 `${transactionId}_`。 ### filename_time_format [string] 仅在 `custom_filename` 为 `true` 时使用 当 `file_name_expression` 参数中的格式为 `xxxx-${now}` 时,`filename_time_format` 可以指定路径的时间格式,默认值为 `yyyy.MM.dd`。常用的时间格式如下所示: | 符号 | 描述 | |----|-----------| | y | 年 | | M | 月 | | d | 日 | | H | 小时 (0-23) | | m | 分钟 | | s | 秒 | ### file_format_type [string] 我们支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `canal_json` `debezium_json` `maxwell_json` 请注意,最终的文件名将以 file_format_type 的后缀结尾,文本文件的后缀是 `txt`。 ### field_delimiter [string] 数据行中列之间的分隔符。仅在 `text` 文件格式下需要。 ### row_delimiter [string] 文件中行之间的分隔符。仅在 `text`、`csv`、`json` 文件格式下需要。 ### have_partition [boolean] 是否需要处理分区。 ### partition_by [array] 仅在 `have_partition` 为 `true` 时使用。 基于选定字段进行数据分区。 ### partition_dir_expression [string] 仅在 `have_partition` 为 `true` 时使用。 如果指定了 `partition_by`,我们将基于分区信息生成相应的分区目录,最终文件将放置在分区目录中。 默认的 `partition_dir_expression` 是 `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`。`k0` 是第一个分区字段,`v0` 是第一个分区字段的值。 ### is_partition_field_write_in_file [boolean] 仅在 `have_partition` 为 `true` 时使用。 如果 `is_partition_field_write_in_file` 为 `true`,分区字段及其值将写入数据文件。 例如,如果你想写入一个 Hive 数据文件,其值应该为 `false`。 ### sink_columns [array] 
需要写入文件的列,默认值为从 `Transform` 或 `Source` 获取的所有列。字段的顺序决定了实际写入文件的顺序。 ### is_enable_transaction [boolean] 如果 `is_enable_transaction` 为 true,我们将确保数据在写入目标目录时不会丢失或重复。 请注意,如果 `is_enable_transaction` 为 true,我们将自动在文件名前添加 `${transactionId}_`。 目前仅支持 `true`。 ### batch_size [int] 文件中的最大行数。对于 SeaTunnel Engine,文件中的行数由 `batch_size` 和 `checkpoint.interval` 共同决定。如果 `checkpoint.interval` 的值足够大,sink writer 将在文件中的行数超过 `batch_size` 时写入文件。如果 `checkpoint.interval` 很小,当触发新检查点时,sink writer 将创建一个新文件。 ### compress_codec [string] 文件的压缩编码,支持的压缩编码如下所示: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc: `lzo` `snappy` `lz4` `zlib` `none` - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` 提示:excel 类型不支持任何压缩格式 ### 常见选项 Sink 插件的常见参数,请参阅 [Sink 常见选项](../common-options/sink-common-options.md) 获取详细信息。 ### max_rows_in_memory [int] 当文件格式为 Excel 时,内存中可以缓存的数据项最大数量。 ### sheet_name [string] 工作簿的表名。 ### csv_string_quote_mode [string] 当文件格式为 CSV 时,CSV 的字符串引号模式。 - ALL:所有字符串字段都会加引号。 - MINIMAL:仅为包含特殊字符(如字段分隔符、引号字符或行分隔符字符串中的任何字符)的字段加引号。 - NONE:从不为字段加引号。当数据中包含分隔符时,输出会在前面加上转义字符。如果未设置转义字符,则格式验证会抛出异常。 ### xml _root_tag [string] 指定 XML 文件中根元素的标签名。 ### xml_row_tag [string] 指定 XML 文件中数据行的标签名。 ### xml_use_attr_format [boolean] 指定是否使用标签属性格式处理数据。 ### parquet_avro_write_timestamp_as_int96 [boolean] 支持从时间戳写入 Parquet INT96,仅对 parquet 文件有效。 ### parquet_avro_write_fixed_as_int96 [array] 支持从 12 字节字段写入 Parquet INT96,仅对 parquet 文件有效。 ### enable_header_write [boolean] 仅在 file_format_type 为 text,csv 时使用。false:不写入表头,true:写入表头。 ### encoding [string] 仅在 file_format_type 为 json,text,csv,xml 时使用。文件写入的编码。该参数将通过 `Charset.forName(encoding)` 解析。 ### schema_save_mode [string] 现有的目录处理方法。 - RECREATE_SCHEMA:当目录不存在时创建,当目录存在时删除并重新创建 - CREATE_SCHEMA_WHEN_NOT_EXIST:当目录不存在时创建,当目录存在时跳过 - ERROR_WHEN_SCHEMA_NOT_EXIST:当目录不存在时,将报告错误 - IGNORE:忽略对表的处理 ### data_save_mode [string] 现有的数据处理方法。 - DROP_DATA:保留目录并删除数据文件 - APPEND_DATA:保留目录,保留数据文件 - ERROR_WHEN_DATA_EXISTS:当有数据文件时,会报告错误 ### merge_update_event [boolean] 
仅当file_format_type为canal_json、debezium_json、maxwell_json时使用. 设置成true,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 会合并成 UPDATE; 设置成false,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 不会合并; ## 示例 对于 orc 文件格式的简单配置 ```bash LocalFile { path = "/tmp/hive/warehouse/test2" file_format_type = "orc" } ``` 对于带有 `encoding` 的 json、text、csv 或 xml 文件格式 ```hocon LocalFile { path = "/tmp/hive/warehouse/test2" file_format_type = "text" encoding = "gbk" } ``` 对于带有 `sink_columns` 的 parquet 文件格式 ```bash LocalFile { path = "/tmp/hive/warehouse/test2" file_format_type = "parquet" sink_columns = ["name","age"] } ``` 对于带有 `have_partition`、`custom_filename` 和 `sink_columns` 的 text 文件格式 ```bash LocalFile { path = "/tmp/hive/warehouse/test2" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true } ``` 对于带有 `sheet_name` 和 `max_rows_in_memory` 的 excel 文件格式 ```bash LocalFile { path="/tmp/seatunnel/excel" sheet_name = "Sheet1" max_rows_in_memory = 1024 partition_dir_expression="${k0}=${v0}" is_partition_field_write_in_file=true file_name_expression="${transactionId}_${now}" file_format_type="excel" filename_time_format="yyyy.MM.dd" is_enable_transaction=true } ``` 对于从上游提取源元数据,可以在路径中使用 `${database_name}`、`${table_name}` 和 `${schema_name}`。 ```bash LocalFile { path = "/tmp/hive/warehouse/${table_name}" file_format_type = "parquet" sink_columns = ["name","age"] } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Maxcompute.md ================================================ import ChangeLog from '../changelog/connector-maxcompute.md'; # Maxcompute > Maxcompute Sink 连接器 ## 描述 用于从 Maxcompute 读取数据。 ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) ## 选项 
| 参数名 | 类型 | 必须 | 默认值 | |----------------|---------|------|--------| | accessId | string | 是 | - | | accesskey | string | 是 | - | | endpoint | string | 是 | - | | project | string | 是 | - | | table_name | string | 是 | - | | partition_spec | string | 否 | - | | overwrite | boolean | 否 | false | | insert_strategy| string | no | upload | | common-options | string | 否 | | ### accessId [string] `accessId` 您的 Maxcompute accessId,可从阿里云访问。 ### accesskey [string] `accesskey` 您的 Maxcompute accessKey,可从阿里云访问。 ### endpoint [string] `endpoint` 您的 Maxcompute endpoint,以 http 开头。 ### project [string] `project` 您在阿里云中创建的 Maxcompute 项目。 ### table_name [string] `table_name` 目标 Maxcompute 表名,例如:fake。 ### partition_spec [string] `partition_spec` Maxcompute 分区表的规范,例如:ds='20220101'。 ### overwrite [boolean] `overwrite` 是否覆盖表或分区,默认值:false。 ### save_mode_create_template 我们使用模板来自动创建 MaxCompute 表, 它将根据上游数据和模式类型的类型创建相应的表创建语句, 默认模板可以根据情况进行修改。目前仅在多表模式下工作。 默认模板: ```sql CREATE TABLE IF NOT EXISTS `${table}` ( ${rowtype_fields} ) COMMENT '${comment}'; ``` 如果在模板中填入自定义字段,例如添加 `id` 字段 ```sql CREATE TABLE IF NOT EXISTS `${table}` ( id, ${rowtype_fields} ) COMMENT '${comment}'; ``` 连接器将自动从上游获取相应的类型来完成填充, 并从 `rowtype_fields` 中删除 id 字段。此方法可用于自定义修改字段类型和属性。 您可以使用以下占位符 - database:用于获取上游模式中的数据库 - table_name:用于获取上游模式中的表名 - rowtype_fields:用于获取上游模式中的所有字段,我们将自动映射到 MaxCompute 的字段描述 - rowtype_primary_key:用于获取上游模式中的主键(可能是列表) - rowtype_unique_key:用于获取上游模式中的唯一键(可能是列表) - comment:用于获取上游模式中的表注释 ### schema_save_mode [Enum] 在同步任务打开之前,为目标端现有的表结构选择不同的处理方案。 选项介绍: `RECREATE_SCHEMA` :表不存在时将创建,表已保存时删除并重建。如果设置了 `partition_spec`,分区将被删除并重建。 `CREATE_SCHEMA_WHEN_NOT_EXIST` :表不存在时将创建,表已保存时跳过。如果设置了 `partition_spec`,分区将被创建。 `ERROR_WHEN_SCHEMA_NOT_EXIST` :表不存在时将报错 `IGNORE` :忽略表的处理 ### data_save_mode [Enum] 在同步任务打开之前,为目标端现有的数据选择不同的处理方案。 选项介绍: `DROP_DATA`:保留数据库结构并删除数据 `APPEND_DATA`:保留数据库结构,保留数据 `CUSTOM_PROCESSING`:用户定义的处理 `ERROR_WHEN_DATA_EXISTS`:当存在数据时,报错 ### custom_sql [String] 当 data_save_mode 选择 CUSTOM_PROCESSING 时,您应该填入 CUSTOM_SQL 
参数。此参数通常填入可以执行的 SQL。SQL 将在同步任务之前执行。 ### datetime_format [String] 用户定义的格式字符串,用于将 LocalDateTime 字段转换为字符串。 当您想指定与 DateTimeUtils.Formatter 中的预定义值之一匹配的自定义日期时间格式时,请使用此选项(例如 yyyy-MM-dd HH:mm:ss、yyyyMMddHHmmss 等)。 示例值: - `yyyy-MM-dd HH:mm:ss` - `yyyy-MM-dd HH:mm:ss.SSSSSS` - `yyyy.MM.dd HH:mm:ss` - `yyyy/MM/dd HH:mm:ss` - `yyyy/M/d HH:mm` - `yyyy-M-d HH:mm` - `yyyy/M/d HH:mm:ss` - `yyyy-M-d HH:mm:ss` - `yyyyMMddHHmmss` 默认值:`yyyy-MM-dd HH:mm:ss` ### tunnel_endpoint [String] 指定 MaxCompute Tunnel 服务的自定义端点 URL。 默认情况下,端点是从配置的区域自动推断的。 此选项允许您覆盖默认行为并使用自定义 Tunnel 端点。 如果未指定,连接器将使用基于区域的默认 Tunnel 端点。 通常,您**不需要**设置 tunnel_endpoint。仅在自定义网络、调试或本地开发时才需要。 示例值: - `https://dt.cn-hangzhou.maxcompute.aliyun.com` - `https://dt.ap-southeast-1.maxcompute.aliyun.com` - `http://maxcompute:8080` 默认值:未设置(从区域自动推断) ### insert_strategy [string] 如果将 `insert_strategy` 设置为 `upload`,插入操作将使用 upload 会话。 如果设置为 `upsert`,插入操作将使用 upsert 会话。Upsert 会话 需要主键。 注意: 在同时存在更新或删除操作的情况下,使用 upload 会话进行插入操作,可能会导致插入的记录 比预期更晚出现在表中。 当表中存在主键时,建议将 `insert_strategy` 设置为 `upsert`,以确保一致的 upsert 行为。 ### 通用选项 Sink 插件通用参数,请参考 [Sink 通用选项](../common-options/sink-common-options.md) 详见。 ## 示例 ```hocon sink { Maxcompute { accessId="" accesskey="" endpoint="" project="" table_name="" #partition_spec="" #overwrite = false } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Milvus.md ================================================ import ChangeLog from '../changelog/connector-milvus.md'; # Milvus > Milvus数据接收器 ## 描述 Milvus sink连接器将数据写入Milvus或Zilliz Cloud,它具有以下功能: - 支持按分区读写数据 - 支持从元数据列写入动态模式数据 - json数据将转换为json字符串进行写入 - 自动重试以绕过 ratelimit 限制 和 grpc 限制 ## 主要特性 - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) ##数据类型映射 | Milvus数据类型 | SeaTunnel 数据类型 | |---------------------|---------------------| | INT8 | TINYINT | | INT16 | SMALLINT | | 
INT32 | INT | | INT64 | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | BOOL | BOOLEAN | | JSON | STRING | | ARRAY | ARRAY | | VARCHAR | STRING | | FLOAT_VECTOR | FLOAT_VECTOR | | BINARY_VECTOR | BINARY_VECTOR | | FLOAT16_VECTOR | FLOAT16_VECTOR | | BFLOAT16_VECTOR | BFLOAT16_VECTOR | | SPARSE_FLOAT_VECTOR | SPARSE_FLOAT_VECTOR | ## Sink 选项 | 名字 | 类型 | 是否必传 | 默认值 | 描述 | |------------------------|---------------------|------|------------------------------|---------------------------------------------------------------------| | url | String | 是 | - | 连接到Milvus或Zilliz Cloud的URL。 | | token | String | 是 | - | 用户:密码 | | database | String | 否 | - | 将数据写入哪个数据库,默认为源数据库。 | | schema_save_mode | enum | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | 当表不存在时自动创建表。 | | enable_auto_id | boolean | 否 | false | 主键列启用autoId。 | | enable_upsert | boolean | 否 | false | 是否启用upsert。 | | enable_dynamic_field | boolean | 否 | true | 是否启用带动态字段的创建表。 | | batch_size | int | 否 | 1000 | 写入批大小。当缓冲记录数达到 `batch_size` 或时间达到 `checkpoint.interval` 时,将触发一次写入刷新 | | partition_key | String | 否 | | Milvus分区键字段 | | create_index | boolean | No | false | 自动为集合创建向量索引以提高查询性能 | | load_collection | boolean | No | false | 将集合加载到 Milvus 内存中以便立即进行查询 | | collection_description | Map | No | {} | 集合描述映射,其中键是集合名称,值是描述 | ## 任务示例 ### 基础配置 ```bash sink { Milvus { url = "http://127.0.0.1:19530" token = "username:password" batch_size = 1000 } } ``` ### 带 Index 和 Loading 的高级配置 ```bash sink { Milvus { url = "http://127.0.0.1:19530" token = "username:password" batch_size = 1000 create_index = true load_collection = true collection_description = { "user_vectors" = "User embedding vectors for recommendation" "product_vectors" = "Product feature vectors for search" } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/MongoDB.md ================================================ import ChangeLog from '../changelog/connector-mongodb.md'; # MongoDB > MongoDB 数据接收(Sink)连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [exactly-once 精准一次写入](../../introduction/concepts/connector-v2-features.md) - [x] [CDC(变更数据捕获)](../../introduction/concepts/connector-v2-features.md) - [x] [支持多表写入](../../introduction/concepts/connector-v2-features.md) **提示** > 1. 如果希望使用 CDC 写入功能,建议启用 `upsert-enable` 配置项。 ## 介绍 MongoDB 连接器提供从 MongoDB 读取数据以及向 MongoDB 写入数据的能力。 本文档将介绍如何配置 MongoDB 连接器,以便执行向 MongoDB 写入数据的任务。 ## 支持的数据源信息 要使用 MongoDB 连接器,需要以下依赖。 可通过 `install-plugin.sh` 下载,或从 Maven 中央仓库获取。 | 数据源 | 支持版本 | 依赖 | |---------|------------|---------| | MongoDB | 通用版本 | [下载](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-mongodb) | ## 数据类型映射 以下表格展示了 MongoDB BSON 类型与 SeaTunnel 数据类型之间的映射关系。 | SeaTunnel 数据类型 | MongoDB BSON 类型 | |--------------------|-------------------| | STRING | ObjectId | | STRING | String | | BOOLEAN | Boolean | | BINARY | Binary | | INTEGER | Int32 | | TINYINT | Int32 | | SMALLINT | Int32 | | BIGINT | Int64 | | DOUBLE | Double | | FLOAT | Double | | DECIMAL | Decimal128 | | Date | Date | | Timestamp | Timestamp / Date | | ROW | Object | | ARRAY | Array | **提示** > 1. 使用 SeaTunnel 将 `Date` 和 `Timestamp` 类型写入 MongoDB 时,MongoDB 中都会生成 `Date` 类型字段,但精度不同:SeaTunnel 的 `Date` 类型精度为秒,`Timestamp` 类型精度为毫秒。
    > 2. 当使用 `DECIMAL` 类型时,最大精度不能超过 34 位,也就是说应使用 `decimal(34, 18)`。 ## Sink 参数说明 | 参数名称 | 类型 | 是否必填 | 默认值 | 说明 | |-----------------------|----------|----------|--------|------| | uri | String | 是 | - | MongoDB 标准连接 URI,例如:`mongodb://user:password@hosts:27017/database?readPreference=secondary&slaveOk=true`。 | | database | String | 是 | - | 要读取或写入的 MongoDB 数据库名称。配置多表同步时,可使用占位符 `${database_name}`,例如:`database = "${database_name}_test_database"`。 | | collection | String | 是 | - | 要读取或写入的 MongoDB 集合名称。配置多表同步时,可使用 `${table_name}`、`${schema_name}` 等占位符,例如:`collection = "${database_name}_${schema_name}_${table_name}_check"`。 | | buffer-flush.max-rows | String | 否 | 1000 | 每次批量写入请求的最大缓存行数。 | | buffer-flush.interval | String | 否 | 30000 | 批量写入的最大时间间隔(毫秒)。 | | retry.max | String | 否 | 3 | 写入失败时的最大重试次数。 | | retry.interval | Duration | 否 | 1000 | 写入失败后的重试间隔时间(毫秒)。 | | upsert-enable | Boolean | 否 | false | 是否启用 upsert 模式进行写入。 | | primary-key | List | 否 | - | 用于 upsert 或更新操作的主键,格式为 `["id","name",...]`。 | | transaction | Boolean | 否 | false | 是否在 MongoSink 中使用事务(需要 MongoDB 4.2+)。 | | common-options | - | 否 | - | 通用 Sink 插件参数,详见 [Sink Common Options](../common-options/sink-common-options.md)。 | | data_save_mode | String | 否 | APPEND_DATA | 数据写入模式:
    - `DROP_DATA`: 插入数据前清空集合;
    - `APPEND_DATA`: 追加数据;
    - `ERROR_WHEN_DATA_EXISTS`: 如果集合已有数据则报错。 | ### 提示 > 1. MongoDB Sink 连接器的数据刷新逻辑由以下三个参数共同控制:`buffer-flush.max-rows`、`buffer-flush.interval` 和 `checkpoint.interval`。 > 任一条件满足时,都会触发数据刷写。
    > 2. 兼容历史参数 `upsert-key`。若已设置 `upsert-key`,请勿同时设置 `primary-key`。 ## 如何创建 MongoDB 数据同步任务 下面示例展示了一个将随机生成的数据写入 MongoDB 的数据同步任务: ```bash # 设置作业的基本配置 env { parallelism = 1 job.mode = "BATCH" checkpoint.interval = 1000 } source { FakeSource { row.num = 2 bigint.min = 0 bigint.max = 10000000 split.num = 1 split.read-interval = 300 schema { fields { c_bigint = bigint } } } } sink { MongoDB { uri = mongodb://user:password@127.0.0.1:27017 database = "test" collection = "test" } } ``` ## 参数详解 ### MongoDB 数据库连接 URI 示例 无认证的单节点连接: ```bash mongodb://127.0.0.0:27017/mydb ``` 副本集连接: ```bash mongodb://127.0.0.0:27017/mydb?replicaSet=xxx ``` 带认证的副本集连接: ```bash mongodb://admin:password@127.0.0.0:27017/mydb?replicaSet=xxx&authSource=admin ``` 多节点副本集连接: ```bash mongodb://127.0.0.1:27017,127.0.0.2:27017,127.0.0.3:27017/mydb?replicaSet=xxx ``` 分片集群连接: ```bash mongodb://127.0.0.0:27017/mydb ``` 多个 mongos 节点连接: ```bash mongodb://192.168.0.1:27017,192.168.0.2:27017,192.168.0.3:27017/mydb ``` 注意:URI 中的用户名与密码在拼接前必须进行 URL 编码。 ### Buffer Flush 示例 ```bash sink { MongoDB { uri = "mongodb://user:password@127.0.0.1:27017" database = "test_db" collection = "users" buffer-flush.max-rows = 2000 buffer-flush.interval = 1000 } } ``` ### 为什么不推荐频繁使用事务? 
虽然 MongoDB 自 4.2 版本起已完全支持多文档事务,但这并不意味着所有场景都应使用。 事务意味着加锁、节点协调、额外开销和性能损耗。 设计系统时应遵循的原则是:**能不用事务就不要用事务**。 合理的系统设计可以在大多数情况下避免对事务的依赖。 ### 幂等写入(Idempotent Writes) 通过定义明确的主键并启用 `upsert` 模式,可以实现精准一次写入(exactly-once)语义。 当配置中定义了 `primary-key` 且启用了 `upsert-enable`,MongoDB Sink 将使用 Upsert 语义而非普通 INSERT 语句。 SeaTunnel 会将定义的主键作为 MongoDB 的复合主键,在 Upsert 模式下进行写入,以确保幂等性。 若作业在运行过程中失败,SeaTunnel 会从上一个成功的 checkpoint 恢复并重新处理数据,这可能导致重复数据。 强烈建议启用 Upsert 模式,以避免主键冲突或重复插入。 ```bash sink { MongoDB { uri = "mongodb://user:password@127.0.0.1:27017" database = "test_db" collection = "users" upsert-enable = true primary-key = ["name","status"] } } ``` ## 更新日志 ================================================ FILE: docs/zh/connectors/sink/Mysql.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # MySQL > JDBC Mysql Sink 连接器 ## 支持的Mysql版本 - 5.5/5.6/5.7/8.0/8.1/8.2/8.3/8.4 ## 引擎支持 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 通过jdbc写入数据。支持批处理模式和流模式,支持并发写入,支持exactly-once精确一次 语义(使用XA事务保证)。 ## 需要的依赖项 ### 对于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 ### 对于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 ## 主要功能 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [x] [支持多表写入](../../introduction/concepts/connector-v2-features.md) >使用“Xa事务”来确保“精确一次”。因此,数据库只支持“精确一次”,即 >支持“Xa事务”。您可以设置`is_exactly_once=true `来启用它。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动器 | 网址 | Maven下载链接 | |-----|---------------------------------------------------------|--------------------------|---------------------------------------|---------------------------------------------------------------------| | Mysql | 不同的依赖版本具有不同的驱动程序类。 | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306:3306/test | [下载](https://mvnrepository.com/artifact/mysql/mysql-connector-java) | ## 数据类型映射 | Mysql 数据类型 | SeaTunnel 数据类型 | |-----------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| | BIT(1)
    INT UNSIGNED | BOOLEAN | | TINYINT
    TINYINT UNSIGNED
    SMALLINT
    SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | | INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | BIGINT | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(x,y)(获取指定列的列大小<38) | DECIMAL(x,y) | | DECIMAL(x,y)(获取指定列的列大小>38) | DECIMAL(38,18) | | DECIMAL UNSIGNED | DECIMAL((获取指定列的列大小)+1,
    (获取指定列的小数点右侧的位数))) | | FLOAT
    FLOAT UNSIGNED | FLOAT | | DOUBLE
    DOUBLE UNSIGNED | DOUBLE | | CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    JSON | STRING | | DATE | DATE | | TIME | TIME | | DATETIME
    TIMESTAMP | TIMESTAMP | | TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    BINARY
    VARBINARY
    BIT(n) | BYTES | | GEOMETRY
    UNKNOWN | Not supported yet | ## Sink 参数 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |------------------------------|---------|----------|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC 连接的 URL。参见示例:
    `jdbc:mysql://localhost:3306/test`。 | | driver | String | 是 | - | 用于连接远程数据源的 JDBC 类名,
    如果使用 MySQL,值为 `com.mysql.cj.jdbc.Driver`。 | | username | String | 否 | - | 连接实例用户名。 | | password | String | 否 | - | 连接实例密码。 | | query | String | 否 | - | 使用此sql将上游输入数据写入数据库。例如: `INSERT ...`,`query` 具有更高的优先级 | | database | String | 否 | - | 使用此 `database` 和 `table-name` 自动生成sql并接收上游输入数据写入数据库。
    此选项与`query` 互斥,具有更高的优先级 | | table | String | 否 | - | 使用数据库和此表名自动生成sql并接收上游输入数据写入数据库。
    此选项与`query` 互斥,具有更高的优先级 | | primary_keys | Array | 否 | - | 此选项用于支持以下操作,例如 `insert`, `delete`, 和 `update` 当自动生成sql. | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(秒)。 | | max_retries | Int | 否 | 0 | 提交失败的重试次数(executeBatch) | | batch_size | Int | 否 | 1000 | 对于批量写入,当缓冲记录的数量达到“batch_size”的数量或时间达到“checkpoint.interval”
    时,数据将被刷新到数据库中 | | is_exactly_once | Boolean | 否 | false | 是否启用精确一次语义,这将使用Xa事务。如果启用,则需要
    设置`xa_data_source_class_name`。 | | generate_sink_sql | Boolean | 否 | false | 根据要写入的数据库表生成sql语句 | | xa_data_source_class_name | String | 否 | - | 数据库Driver的xa数据源类名,例如mysql是`com.mysql.cj.jdbc.MysqlXADataSource`,
    请参阅附录了解其他数据源 | | max_commit_attempts | Int | 否 | 3 | 事务提交失败的重试次数 | | transaction_timeout_sec | Int | 否 | -1 | 事务打开后的超时,默认值为-1(永不超时)。请注意,设置超时可能会影响<br/>精确一次语义 | | auto_commit | Boolean | 否 | true | 默认情况下启用自动事务提交 | | field_ide | String | 否 | - | 确定从源同步到 Sink 时是否需要转换字段`ORIGINAL表示不需要转换`大写`表示转换为大写`LOWERCASE表示转换为小写。 | | properties | Map | 否 | - | 其他连接配置参数,当属性和URL具有相同的参数时,优先级由驱动程序的特定实现决定。例如,在MySQL中,属性优先于URL。 | | common-options | | 否 | - | Sink插件常用参数,请参考 [Sink Common Options](../common-options/sink-common-options.md) 详见 | | schema_save_mode | Enum | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | 在启动同步任务之前,对目标侧的现有表面结构选择不同的处理方案。 | | data_save_mode | Enum | 否 | APPEND_DATA | 在启动同步任务之前,对目标端的现有数据选择不同的处理方案。 | | custom_sql | String | 否 | - | 当data_save_mode选择CUSTOM_PROCESSING时,您应该填写CUSTOM_SQL参数。此参数通常填充可以执行的SQL。SQL将在同步任务之前执行。 | | enable_upsert | Boolean | 否 | true | 通过primary_keys存在启用upstart,如果任务只有“插入”,将此参数设置为“false”可以加快数据导入 | ### 提示 >如果未设置partition_column,它将以单并发运行,如果设置了partition_column,它将根据任务的并发性并行执行。 ## 任务示例 ### 简单的例子 >此示例定义了一个SeaTunnel同步任务,该任务通过FakeSource自动生成数据并将其发送到JDBC Sink。FakeSource总共生成16行数据(row.num=16),每行有两个字段,name(字符串类型)和age(int类型)。最终的目标表是test_table,表中也将有16行数据。在运行此作业之前,您需要在mysql中创建数据库测试表test_table。如果您尚未安装和部署SeaTunnel,则需要按照[安装SeaTunnel](../../getting-started/locally/deployment.md)中的说明安装和部署SeaTunnel。然后按照[快速启动SeaTunnel引擎](../../getting-started/locally/quick-start-seatunnel-engine.md)中的说明运行此作业。 ``` # 定义运行时环境 env { parallelism = 1 job.mode = "BATCH" } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } #如果你想了解更多关于如何配置seatunnel的信息,并查看完整的源插件列表, #请前往https://seatunnel.apache.org/docs/connectors/source } transform { #如果你想了解更多关于如何配置seatunnel的信息,并查看转换插件的完整列表, #请前往https://seatunnel.apache.org/docs/transforms } sink { jdbc { url = 
"jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" username = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" } #如果你想了解更多关于如何配置seatunnel的信息,并查看完整的sink插件列表, #请前往https://seatunnel.apache.org/docs/connectors/sink } ``` ### 生成Sink SQL >此示例不需要编写复杂的sql语句,您可以配置数据库名称表名以自动为您生成add语句 ``` sink { jdbc { url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" username = "root" password = "123456" # Automatically generate sql statements based on database table names generate_sink_sql = true database = test table = test_table } } ``` ### 精确一次 为了准确的书写场景,我们保证精确一次 ``` sink { jdbc { url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" max_retries = 0 username = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" is_exactly_once = "true" xa_data_source_class_name = "com.mysql.cj.jdbc.MysqlXADataSource" } } ``` ### CDC(变更数据捕获)事件 >我们也支持CDC变更数据。在这种情况下,您需要配置数据库、表和主键。 ``` sink { jdbc { url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" username = "root" password = "123456" generate_sink_sql = true # You need to configure both database and table database = test table = sink_table primary_keys = ["id","name"] field_ide = UPPERCASE schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` ### 多表同步 #### 示例1:MySQL CDC 多表同步 > 通过 MySQL CDC 同步多张表到目标 MySQL 数据库,使用占位符实现动态表名映射 ``` env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { 
Mysql { url = "jdbc:mysql://localhost:3306?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" username = "root" password = "123456" generate_sink_sql = true database = "${database_name}_test" table = "${table_name}_test" primary_keys = ["${primary_key}"] } } ``` #### 示例2:JDBC Source 多表同步到 MySQL > 从 MySQL 使用 JDBC Source 批量同步多张表到另一个 MySQL 数据库 ``` env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = com.mysql.cj.jdbc.Driver url = "jdbc:mysql://localhost:3306/source_db" username = "root" password = "123456" table_list = [ { table_path = "source_db.table_1" }, { table_path = "source_db.table_2" } ] } } transform { } sink { Mysql { url = "jdbc:mysql://localhost:3306?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" username = "root" password = "123456" generate_sink_sql = true database = "${database_name}_target" table = "${table_name}_copy" primary_keys = ["${primary_key}"] } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Neo4j.md ================================================ import ChangeLog from '../changelog/connector-neo4j.md'; # Neo4j > Neo4j 写连接器 ## 描述 写数据到 `Neo4j`。 `neo4j-java-driver` version 4.4.9 ## 主要功能 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) ## 配置选项 | 名称 | 类型 | 是否必须 | 默认值 | |----------------------------|---------|------|----------| | uri | String | 是 | - | | username | String | 否 | - | | password | String | 否 | - | | max_batch_size | Integer | 否 | - | | write_mode | String | 否 | OneByOne | | bearer_token | String | 否 | - | | kerberos_ticket | String | 否 | - | | database | String | 是 | - | | query | String | 是 | - | | queryParamPosition | Object | 是 | - | | max_transaction_retry_time | Long | 否 | 30 | | max_connection_timeout | Long | 否 | 30 | | common-options | config | 否 | - | ### uri [string] `Neo4j`数据库的URI,参考配置: `neo4j://localhost:7687`。 ### username 
[string] `Neo4j`用户名。 ### password [string] `Neo4j`密码。如果提供了`username`,则必须提供密码。 ### max_batch_size [Integer] `max_batch_size` 是指写入数据时,单个事务中可以写入的最大数据条目数。 ### write_mode 默认值为 `oneByOne` ,如果您想批量写入,请将其设置为`Batch` ```cypher unwind $ttt as row create (n:Label) set n.name = row.name,n.age = row.age ``` `ttt`代表一批数据,`ttt`可以是任意字符串,只要它与配置的`batch_data_variable` 匹配。 ### bearer_token [string] `Neo4j`的`base64`编码`bearer token`用于鉴权。 ### kerberos_ticket [string] `Neo4j`的`base64`编码`kerberos ticket`用于鉴权。 ### database [string] 数据库名称。 ### query [string] 查询语句。包含在运行时用相应值替换的参数占位符。 ### queryParamPosition [object] 查询参数的位置映射信息。 键名是参数占位符名称。 关联值是字段在输入数据行中的位置。 ### max_transaction_retry_time [long] 最大事务重试时间(秒)。如果超过,则事务失败。 ### max_connection_timeout [long] 等待TCP连接建立的最长时间(秒)。 ### common options Sink插件常用参数, 详细信息请参考 [Sink公共配置](../common-options/sink-common-options.md) ## OneByOne模式写示例 ``` sink { Neo4j { uri = "neo4j://localhost:7687" username = "neo4j" password = "1234" database = "neo4j" max_transaction_retry_time = 10 max_connection_timeout = 10 query = "CREATE (a:Person {name: $name, age: $age})" queryParamPosition = { name = 0 age = 1 } } } ``` ## Batch模式写示例 > cypher提供的`unwind`关键字支持批量写入, > 批量数据的默认变量是batch。如果你写一个批处理写语句, > 那么你应该在cypher中声明 `unwind $batch as row` ``` sink { Neo4j { uri = "bolt://localhost:7687" username = "neo4j" password = "neo4j" database = "neo4j" max_batch_size = 1000 write_mode = "BATCH" max_transaction_retry_time = 3 max_connection_timeout = 10 query = "unwind $batch as row create(n:MyLabel) set n.name = row.name,n.age = row.age" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/ObsFile.md ================================================ import ChangeLog from '../changelog/connector-file-obs.md'; # ObsFile > Obs file sink 连接器 ## 支持这些引擎 > Spark > > Flink > > Seatunnel Zeta ## 主要特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] 
[精确一次](../../introduction/concepts/connector-v2-features.md) 默认情况下,我们使用2PC commit来确保“精确一次”` - [x] 文件格式类型 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] canal_json - [x] debezium_json - [x] maxwell_json ## 描述 将数据输出到华为云obs文件系统。 如果你使用spark/flink,为了使用这个连接器,你必须确保你的spark/flink集群已经集成了hadoop。测试的hadoop版本是2.x。 如果你使用SeaTunnel Engine,当你下载并安装SeaTunnel引擎时,它会自动集成hadoop jar。您可以在${SEATUNNEL_HOME}/lib下检查jar包以确认这一点。 为了支持更多的文件类型,我们进行了一些权衡,因此我们使用HDFS协议对OBS进行内部访问,而这个连接器需要一些hadoop依赖。 它只支持hadoop版本**2.9.X+**。 ## 所需Jar包列表 | jar | 支持的版本 | Maven下载链接 | |--------------------|-----------------------------|---------------------------------------------------------------------------------------------------| | hadoop-huaweicloud | support version >= 3.1.1.29 | [下载](https://repo.huaweicloud.com/artifactory/sdk_public/org/apache/hadoop/hadoop-huaweicloud/) | | esdk-obs-java | support version >= 3.19.7.3 | [下载](https://repo.huaweicloud.com/artifactory/sdk_public/com/huawei/storage/esdk-obs-java/) | | okhttp | support version >= 3.11.0 | [下载](https://repo1.maven.org/maven2/com/squareup/okhttp3/okhttp/) | | okio | support version >= 1.14.0 | [下载](https://repo1.maven.org/maven2/com/squareup/okio/okio/) | >请下载“Maven”对应的支持列表,并将其复制到“$SEATUNNEL_HOME/plugins/jdbc/lib/”工作目录。 > >并将所有jar复制到$SEATUNNEL_HOME/lib/ ## 参数 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |----------------------------------|---------|------|--------------------------------------------|-------------------------------------------------------------------------| | path | string | 是 | - | 目标目录路径。 | | bucket | string | 是 | - | obs文件系统的bucket地址,例如:`obs://obs-bucket-name`. 
| | access_key | string | 是 | - | obs文件系统的访问密钥。 | | access_secret | string | 是 | - | obs文件系统的访问私钥。 | | endpoint | string | 是 | - | obs文件系统的终端。 | | custom_filename | boolean | 否 | false | 是否需要自定义文件名。 | | file_name_expression | string | 否 | "${transactionId}" | 描述将在“路径”中创建的文件表达式。仅在custom_filename为true时使用。[提示](#file_name_expression) | | filename_time_format | string | 否 | "yyyy.MM.dd" | 指定“path”的时间格式。仅在custom_filename为true时使用。[提示](#filename_time_format) | | file_format_type | string | 否 | "csv" | 支持的文件类型。[提示](#file_format_type) | | field_delimiter | string | 否 | '\001' | 数据行中列之间的分隔符。仅在file_format为文本时使用。 | | row_delimiter | string | 否 | "\n" | 文件中行之间的分隔符。仅被 `text`、`csv`、`json` 文件格式需要。 | | have_partition | boolean | 否 | false | 是否需要处理分区。 | | partition_by | array | 否 | - | 根据所选字段对数据进行分区。只有在have_partition为true时才使用。 | | partition_dir_expression | string | 否 | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | 只有在have_partition为真true时才使用。[提示](#partition_dir_expression) | | is_partition_field_write_in_file | boolean | 否 | false | 只有在have_partition为true时才使用。[提示](#is_partition_field_write_in_file) | | sink_columns | array | 否 | | 当此参数为空时,所有字段都是接收列。[提示](#sink_columns) | | is_enable_transaction | boolean | 否 | true | [提示](#is_enable_transaction) | | batch_size | int | 否 | 1000000 | [提示](#batch_size) | | single_file_mode | boolean | 否 | false | 每个并行处理只会输出一个文件。启用此参数后,batch_size将不会生效。输出文件名没有文件块后缀。 | | create_empty_file_when_no_data | boolean | 否 | false | 当上游没有数据同步时,仍然会生成相应的数据文件。 | | compress_codec | string | 否 | none | [提示](#compress_codec) | | common-options | object | 否 | - | [提示](#common_options) | | max_rows_in_memory | int | 否 | - | 当文件格式为Excel时,内存中可以缓存的最大数据项数。仅在file_format为excel时使用。 | | sheet_name | string | 否 | Sheet${Random number} | 标签页。仅在file_format为excel时使用。 | | merge_update_event | boolean | 否 | false | 仅当file_format_type为canal_json、debezium_json、maxwell_json. 
| ### 提示 #### file_name_expression >仅在“custom_filename”为“true”时使用。 > >`file_name_expression`描述了将在`path`中创建的文件表达式。 > >我们可以在“file_name_expression”中添加变量“${now}”或“${uuid}”,类似于“test_${uuid}_${now}”, > >“${now}”表示当前时间,其格式可以通过指定选项“filename_time_format”来定义。 请注意,如果`is_enable_transaction`为`true`,我们将自动添加`${transactionId}_`在文件的开头。 #### filename_time_format >仅在“custom_filename”为“true”时使用。 > >当`file_name_expression`参数中的格式为`xxxx-${now}`时,`filename_time_format`可以指定路径的时间格式,默认值为`yyyy.MM.dd`。常用的时间格式如下: | Symbol | Description | |--------|--------------------| | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | #### file_format_type >我们支持以下文件类型: > > `text` `json` `csv` `orc` `parquet` `excel` `canal_json` `debezium_json` `maxwell_json` 请注意,最终文件名将以file_format的后缀结尾,文本文件的后缀为“txt”。 #### partition_dir_expression >仅在“have_partition”为“true”时使用。 > >如果指定了`partition_by`,我们将根据分区信息生成相应的分区目录,并将最终文件放置在分区目录中。 > >默认的`partition_dir_expression`是`${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`.`k0`是第一个分区字段,`v0`是第一个划分字段的值。 #### is_partition_field_write_in_file >仅在“have_partition”为“true”时使用。 > >如果`is_partition_field_write_in_file`为`true`,则分区字段及其值将写入数据文件。 > >例如,如果你想写一个Hive数据文件,它的值应该是“false”。 #### sink_columns >哪些列需要写入文件,默认值是从“Transform”或“Source”获取的所有列。 >字段的顺序决定了文件实际写入的顺序。 #### is_enable_transaction >如果`is_enable_transaction`为`true`,我们将确保数据在写入目标目录时不会丢失或重复。 > >请注意,如果`is_enable_transaction`为`true`,我们将自动添加`${transactionId}_`在文件的开头。现在只支持“true”。 #### batch_size >文件中的最大行数。对于SeaTunnel引擎,文件中的行数由“batch_size”和“checkpoint.interval”共同决定。如果“checkpoint.interval”的值足够大,sink writer将在文件中写入行,直到文件中的行大于“batch_size”。如果“checkpoint.interval”较小,则接收器写入程序将在新的检查点触发时创建一个新文件。 #### compress_codec >文件的压缩编解码器和支持的详细信息如下所示: > > - txt: `lzo` `none` > - json: `lzo` `none` > - csv: `lzo` `none` > - orc: `lzo` `snappy` `lz4` `zlib` `none` > - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` 请注意,excel类型不支持任何压缩格式 #### merge_update_event 
>仅当file_format_type为canal_json、debezium_json、maxwell_json时使用. >设置成true,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 会合并成 UPDATE; >设置成false,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 不会合并; #### common options >Sink插件常用参数,请参考[Sink common Options](../common-options/sink-common-options.md)了解详细信息。 ## 任务示例 ### text 文件 >对于具有“have_partition”、“custom_filename”和“sink_columns”的文本文件格式。 ```hocon ObsFile { path="/seatunnel/text" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "obs.xxxxxx.myhuaweicloud.com" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true } ``` ### parquet 文件 >适用于带有“have_partition”和“sink_columns”的拼花地板文件格式。 ```hocon ObsFile { path = "/seatunnel/parquet" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxx" endpoint = "obs.xxxxxx.myhuaweicloud.com" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true file_format_type = "parquet" sink_columns = ["name","age"] } ``` ### orc 文件 >对于orc文件格式的简单配置。 ```hocon ObsFile { path="/seatunnel/orc" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "obs.xxxxx.myhuaweicloud.com" file_format_type = "orc" } ``` ### json 文件 >对于json文件格式简单配置。 ```hcocn ObsFile { path = "/seatunnel/json" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "obs.xxxxx.myhuaweicloud.com" file_format_type = "json" } ``` ### excel 文件 >对于excel文件格式简单配置。 ```hcocn ObsFile { path = "/seatunnel/excel" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = 
"obs.xxxxx.myhuaweicloud.com" file_format_type = "excel" } ``` ### csv 文件 >对于csv文件格式简单配置。 ```hcocn ObsFile { path = "/seatunnel/csv" bucket = "obs://obs-bucket-name" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "obs.xxxxx.myhuaweicloud.com" file_format_type = "csv" } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/OceanBase.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # OceanBase > JDBC OceanBase Sink 连接器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) ## 描述 通过jdbc写入数据。支持批处理模式和流模式,支持并发写入,支持精确一次语义。 ## 支持的数据源信息 | 数据源 | 支持版本 | Driver | Url | Maven | |------------|---------------------|---------------------------|--------------------------------------|-------------------------------------------------------------------------------| | OceanBase | 所有OceanBase服务版本 | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2883/test | [Download](https://mvnrepository.com/artifact/com.oceanbase/oceanbase-client) | ## 数据库相关依赖 > 请下载“Maven”对应的支持列表,并将其复制到“$SEATUNNEL_HOME/plugins/jdbc/lib/”工作目录
    > 例如: cp oceanbase-client-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## 数据类型映射 ### Mysql模式 | Mysql Data type | SeaTunnel Data type | |-----------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| | BIT(1)
    INT UNSIGNED | BOOLEAN | | TINYINT
    TINYINT UNSIGNED
    SMALLINT
    SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | | INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | BIGINT | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(x,y)(获取指定列的指定列大小<38) | DECIMAL(x,y) | | DECIMAL(x,y)(获取指定列的指定列大小>38) | DECIMAL(38,18) | | DECIMAL UNSIGNED | DECIMAL((获取指定列的指定列大小)+1,
    (获取指定列小数点右侧的位数。))) | | FLOAT
    FLOAT UNSIGNED | FLOAT | | DOUBLE
    DOUBLE UNSIGNED | DOUBLE | | CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    JSON | STRING | | DATE | DATE | | TIME | TIME | | DATETIME
    TIMESTAMP | TIMESTAMP | | TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    BINARY
    VARBINARY
    BIT(n) | BYTES | | GEOMETRY
    UNKNOWN | Not supported yet | ### Oracle 模式 | Oracle Data type | SeaTunnel Data type | |-----------------------------------------------------------|---------------------| | Number(p), p <= 9 | INT | | Number(p), p <= 18 | BIGINT | | Number(p), p > 18 | DECIMAL(38,18) | | REAL
    BINARY_FLOAT | FLOAT | | BINARY_DOUBLE | DOUBLE | | CHAR
    NCHAR
    NVARCHAR2
    NCLOB
    CLOB
    ROWID | STRING | | DATE | DATE | | TIMESTAMP
    TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | | BLOB
    RAW
    LONG RAW
    BFILE | BYTES | | UNKNOWN | Not supported yet | ## Sink 选项 | Name | Type | Required | Default | Description | |------------------------------|---------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC连接的URL。参考案例: jdbc:oceanbase://localhost:2883/test | | driver | String | 是 | - | 用于连接到远程数据源的jdbc类名应为 `com.oceanbase.jdbc.Driver`. | | username | String | 否 | - | 连接实例用户名 | | password | String | 否 | - | 连接实例密码 | | query | String | 否 | - | 使用此sql将上游输入数据写入数据库。例如: `INSERT ...`,`query` 具有更高的优先级 | | compatible_mode | String | 是 | - | OceanBase的兼容模式可以是“mysql”或“oracle”。 | | database | String | 否 | - | 使用这个“database”和“table-name”自动生成sql并接收上游输入数据写入数据库
    此选项与“query”互斥,具有更高的优先级。 | | table | String | 否 | - | 使用数据库和此表名自动生成sql并接收上游输入数据写入数据库
    此选项与“query”互斥,并且具有更高的优先级。 | | primary_keys | Array | 否 | - | 此选项用于在自动生成sql时支持“insert”、“delete”和“update”等操作。 | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(秒)。 | | max_retries | Int | 否 | 0 | 提交失败的重试次数(executeBatch) | | batch_size | Int | 否 | 1000 | 对于批量写入,当缓冲记录的数量达到“batch_size”的数量或时间达到“checkpoint.interval”
    时,数据将被刷新到数据库中 | | generate_sink_sql | Boolean | 否 | false | 根据要写入的数据库表生成sql语句 | | max_commit_attempts | Int | 否 | 3 | 事务提交失败的重试次数 | | transaction_timeout_sec | Int | 否 | -1 | 事务打开后的超时,默认值为-1(永不超时)。请注意,设置超时可能会影响<br/>精确一次语义 | | auto_commit | Boolean | 否 | true | 默认情况下启用自动事务提交 | | properties | Map | 否 | - | 其他连接配置参数,当属性和URL具有相同的参数时,优先级由驱动程序的特定实现决定。例如,在MySQL中,属性优先于URL。 | | common-options | | 否 | - | Sink插件常用参数,详见[Sink common Options](../common-options/sink-common-options.md) | | enable_upsert | Boolean | 否 | true | 通过primary_keys存在启用upsert,如果任务没有键重复数据,将此参数设置为“false”可以加快数据导入 | ### 提示 > 如果未设置partition_column,它将以单并发运行,如果设置了partition_column,它将根据任务的并发数并行执行。 ## 任务示例 ### 简单示例 > 此示例定义了一个SeaTunnel同步任务,该任务通过FakeSource自动生成数据并将其发送到JDBC Sink。FakeSource总共生成16行数据(row.num=16),每行有两个字段,name(字符串类型)和age(int类型)。最终的目标表是test_table,表中也将有16行数据。在运行此作业之前,您需要在mysql中创建数据库测试和表test_table。如果您尚未安装和部署SeaTunnel,则需要按照[安装SeaTunnel](../../getting-started/locally/deployment.md)中的说明安装和部署SeaTunnel。然后按照[快速启动SeaTunnel引擎](../../getting-started/locally/quick-start-seatunnel-engine.md)中的说明运行此作业。 ``` # 定义运行环境 env { parallelism = 1 job.mode = "BATCH" } source { # 这是一个示例源插件,**仅用于测试和演示功能源插件** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } # 如果你想了解更多关于如何配置seatunnel的信息,并查看完整的source插件列表, # 请前往https://seatunnel.apache.org/docs/connectors/source } transform { # 如果你想了解更多关于如何配置seatunnel的信息,并查看transform插件的完整列表, # 请前往https://seatunnel.apache.org/docs/transforms } sink { jdbc { url = "jdbc:oceanbase://localhost:2883/test" driver = "com.oceanbase.jdbc.Driver" username = "root" password = "123456" compatible_mode = "mysql" query = "insert into test_table(name,age) values(?,?)" } # 如果你想了解更多关于如何配置seatunnel的信息,并查看完整的sink插件列表, # 请前往https://seatunnel.apache.org/docs/connectors/sink } ``` ### 生成 Sink SQL > 此示例不需要编写复杂的sql语句,您可以配置数据库名称表名以自动为您生成add语句 ``` sink { jdbc { url = "jdbc:oceanbase://localhost:2883/test" driver = "com.oceanbase.jdbc.Driver" username = 
"root" password = "123456" compatible_mode = "mysql" # 根据数据库表名自动生成sql语句 generate_sink_sql = true database = test table = test_table } } ``` ### CDC(Change Data Capture) 数据变更事件 > 我们也支持CDC变更数据。在这种情况下,您需要配置数据库、表和主键。 ``` sink { jdbc { url = "jdbc:oceanbase://localhost:3306/test" driver = "com.oceanbase.jdbc.Driver" username = "root" password = "123456" compatible_mode = "mysql" generate_sink_sql = true # 您需要同时配置数据库和表 database = test table = sink_table primary_keys = ["id","name"] } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Oracle.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Oracle > JDBC Oracle Sink 连接器 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 通过jdbc写入数据。支持批处理模式和流模式,支持并发写入,支持“精确一次”语义(使用XA事务保证)。 ## 依赖 ### 对于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc driver jar package](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8)已经添加到目录 `${SEATUNNEL_HOME}/plugins/`. ### 对于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc driver jar package](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) 已经添加到目录 `${SEATUNNEL_HOME}/lib/`. ## 主要特性 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) >使用“Xa事务”来确保“精确一次”。因此,只有支持“Xa事务”的数据库 >才支持“精确一次”。您可以设置`is_exactly_once=true`来启用它。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动器 | 网址 | Maven下载链接 | |------------|----------------------------------------------------------|--------------------------|----------------------------------------|--------------------------------------------------------------------| | Oracle | 不同的依赖版本具有不同的驱动程序类。 | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@datasource01:1523:xe | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | ## 数据库依赖关系 >请下载“Maven”对应的支持列表,并将其复制到“$SEATUNNEL_HOME/plugins/jdbc/lib/”工作目录
    >例如,Oracle数据源:cp ojdbc8-xxxx.jar $SEATUNNEL_HOME/lib/
    >要支持i18n字符集,请将orai18n.jar复制到$SEATUNNEL_HOME/lib/目录。 ## 数据类型映射 | Oracle 数据类型 | SeaTunnel 数据类型 | |--------------------------------------------------------------------------------------|---------------------| | INTEGER | INT | | FLOAT | DECIMAL(38, 18) | | NUMBER(precision <= 9, scale == 0) | INT | | NUMBER(9 < precision <= 18, scale == 0) | BIGINT | | NUMBER(18 < precision, scale == 0) | DECIMAL(38, 0) | | NUMBER(scale != 0) | DECIMAL(38, 18) | | BINARY_DOUBLE | DOUBLE | | BINARY_FLOAT
    REAL | FLOAT | | CHAR
    NCHAR
    NVARCHAR2
    VARCHAR2
    LONG
    ROWID
    NCLOB
    CLOB
    | STRING | | DATE | DATE | | TIMESTAMP
    TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | | BLOB
    RAW
    LONG RAW
    BFILE | BYTES | ## 参数 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |------------------------------|---------|----------|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC 连接的 URL。参见示例: jdbc:oracle:thin:@datasource01:1523:xe | | driver | String | 是 | - | 用于连接远程数据源的 JDBC 类名,
    如果使用 Oracle,值为 `oracle.jdbc.OracleDriver`。 | | username | String | 否 | - | 连接实例用户名。 | | password | String | 否 | - | 连接实例密码。 | | query | String | 否 | - | 使用此sql将上游输入数据写入数据库。例如: `INSERT ...`,`query` 具有更高的优先级 | | database | String | 否 | - | 使用此 `database` 和 `table-name` 自动生成sql并接收上游输入数据写入数据库。
    此选项与`query` 互斥,具有更高的优先级 | | table | String | 否 | - | 使用数据库和此表名自动生成sql并接收上游输入数据写入数据库。
    此选项与`query` 互斥,具有更高的优先级 | | primary_keys | Array | 否 | - | 此选项用于支持以下操作,例如 `insert`, `delete`, 和 `update` 当自动生成sql. | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(秒)。 | | max_retries | Int | 否 | 0 | 提交失败的重试次数(executeBatch) | | batch_size | Int | 否 | 1000 | 对于批量写入,当缓冲记录的数量达到“batch_size”的数量或时间达到“checkpoint.interval”
    时,数据将被刷新到数据库中。 | | batch_interval_ms | Int | 否 | 1000 | 对于批量写入,当缓冲记录的数量达到“batch_size”的数量或时间达到“batch_interval_ms”时,数据将被刷新到数据库中。 | | is_exactly_once | Boolean | 否 | false | 是否启用精确一次语义,这将使用Xa事务。如果启用,则需要
    设置`xa_data_source_class_name`。 | | generate_sink_sql | Boolean | 否 | false | 根据要写入的数据库表生成sql语句 | | xa_data_source_class_name | String | 否 | - | 数据库Driver的xa数据源类名,例如Oracle的类名是 `oracle.jdbc.xa.client.OracleXADataSource`,
    请参阅附录了解其他数据源 | | max_commit_attempts | Int | 否 | 3 | 事务提交失败的重试次数 | | transaction_timeout_sec | Int | 否 | -1 | 事务打开后的超时,默认值为-1(永不超时)。请注意,设置超时可能会影响<br/>精确一次语义 | | auto_commit | Boolean | 否 | true | 默认情况下启用自动事务提交 | | properties | Map | 否 | - | 其他连接配置参数,当属性和URL具有相同的参数时,优先级由驱动程序的特定实现决定。例如,在MySQL中,属性优先于URL。 | | common-options | | 否 | - | Sink插件常用参数,请参考 [Sink Common Options](../common-options/sink-common-options.md) | | schema_save_mode | Enum | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | 在启动同步任务之前,对目标侧的现有表面结构选择不同的处理方案。 | | data_save_mode | Enum | 否 | APPEND_DATA | 在启动同步任务之前,对目标端的现有数据选择不同的处理方案。 | | custom_sql | String | 否 | - | 当data_save_mode选择CUSTOM_PROCESSING时,您应该填写CUSTOM_SQL参数。此参数通常填充可以执行的SQL。SQL将在同步任务之前执行。 | | enable_upsert | Boolean | 否 | true | 通过primary_keys存在启用upstart,如果任务只有“插入”,将此参数设置为“false”可以加快数据导入 | ### 提示 >如果未设置partition_column,它将以单并发运行,如果设置了partition_column,它将根据任务的并发数并行执行。 ## 任务示例 ### 简单的例子 >此示例定义了一个SeaTunnel同步任务,该任务通过FakeSource自动生成数据并将其发送到JDBC Sink。FakeSource总共生成16行数据(row.num=16),每行有两个字段,name(字符串类型)和age(int类型)。最终的目标表是test_table,表中也将有16行数据。在运行此作业之前,您需要在Oracle中创建测试数据库和表test_table。如果您尚未安装和部署SeaTunnel,则需要按照[安装SeaTunnel](../../getting-started/locally/deployment.md)中的说明安装和部署SeaTunnel。然后按照[快速启动SeaTunnel引擎](../../getting-started/locally/quick-start-seatunnel-engine.md)中的说明运行此作业。 ``` # 定义运行环境 env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } #如果你想了解更多关于如何配置seatunnel的信息,并查看完整的源插件列表, #请前往https://seatunnel.apache.org/docs/connectors/source } transform { #如果你想了解更多关于如何配置seatunnel的信息,并查看转换插件的完整列表, #请前往https://seatunnel.apache.org/docs/transforms } sink { jdbc { url = "jdbc:oracle:thin:@datasource01:1523:xe" driver = "oracle.jdbc.OracleDriver" username = root password = 123456 query = "INSERT INTO TEST.TEST_TABLE(NAME,AGE) VALUES(?,?)" } #如果你想了解更多关于如何配置seatunnel的信息,并查看完整的sink插件列表, #请前往https://seatunnel.apache.org/docs/connectors/sink } ``` ### 生成Sink SQL 
>此示例不需要编写复杂的sql语句,您可以配置数据库名称表名以自动为您生成add语句 ``` sink { Jdbc { url = "jdbc:oracle:thin:@datasource01:1523:xe" driver = "oracle.jdbc.OracleDriver" username = root password = 123456 generate_sink_sql = true database = XE table = "TEST.TEST_TABLE" } } ``` ### 精确一次 为了准确的写入场景,我们保证一次准确 ``` sink { jdbc { url = "jdbc:oracle:thin:@datasource01:1523:xe" driver = "oracle.jdbc.OracleDriver" max_retries = 0 username = root password = 123456 query = "INSERT INTO TEST.TEST_TABLE(NAME,AGE) VALUES(?,?)" is_exactly_once = "true" xa_data_source_class_name = "oracle.jdbc.xa.client.OracleXADataSource" } } ``` ### CDC(变更数据捕获)事件 >我们也支持CDC更改数据。在这种情况下,您需要配置数据库、表和主键。 ``` sink { jdbc { url = "jdbc:oracle:thin:@datasource01:1523:xe" driver = "oracle.jdbc.OracleDriver" username = root password = 123456 generate_sink_sql = true # You need to configure both database and table database = XE table = "TEST.TEST_TABLE" primary_keys = ["ID"] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/OssFile.md ================================================ import ChangeLog from '../changelog/connector-file-oss.md'; # OssFile > Oss 文件 sink 连接器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 使用依赖性 ### 适用于Spark/Flink引擎 1. 您必须确保您的spark/flink集群已经集成了hadoop。测试的hadoop版本是2.x。 2. 您必须确保`${SEATUNNEL_HOME}/plugins/`目录中的`hadoop-aliyun-xx.jar`, `aliyun-sdk-oss-xx.jar`和`jdom-xx.jar`的版本与您在spark/flink中使用的hadoop版本匹配,`aliyun-sdk-oss-x.x.jar`和`jdom-xx.jar`的版本需要与`hadoop-aliyun`的版本相对应。例如:`hadoop-aliyun-3.1.4.jar`依赖`aliyun-sdk-oss-3.4.1.jar`和`jdom-1.1.jar`。 ### 适用于SeaTunnel Zeta引擎 1. 您必须确保在`${SEATUNNEL_HOME}/lib/`目录中有`seatunnel-hadoop3-3.1.4-uber.jar`、`aliyun-sdk-oss-3.4.1.jar`、`hadoop-aliyun-3.1.4.jar`和`jdom-1.1.jar`。 ## 关键特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 默认情况下,我们使用2PC commit来确保`精确一次` - [x] [支持多表写入](../../introduction/concepts/connector-v2-features.md) - [x] 文件格式类型 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] canal_json - [x] debezium_json - [x] maxwell_json ## 数据类型映射 如果写入`csv`、`text`文件类型,则所有列将为字符串。 ### Orc 文件类型 | SeaTunnel 数据类型 | Orc 数据类型 | |----------------------|-----------| | STRING | STRING | | BOOLEAN | BOOLEAN | | TINYINT | BYTE | | SMALLINT | SHORT | | INT | INT | | BIGINT | LONG | | FLOAT | FLOAT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | BYTES | BINARY | | DATE | DATE | | TIME
    TIMESTAMP | TIMESTAMP | | ROW | STRUCT | | NULL | 不支持的数据类型 | | ARRAY | LIST | | Map | Map | ### Parquet 文件类型 | SeaTunnel 数据类型 | Parquet 数据类型 | |----------------------|------------------| | STRING | STRING | | BOOLEAN | BOOLEAN | | TINYINT | INT_8 | | SMALLINT | INT_16 | | INT | INT32 | | BIGINT | INT64 | | FLOAT | FLOAT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | BYTES | BINARY | | DATE | DATE | | TIME
    TIMESTAMP | TIMESTAMP_MILLIS | | ROW | GroupType | | NULL | 不支持的数据类型 | | ARRAY | LIST | | Map | Map | ## 选项 | 名称 | 类型 | 必需 | 默认值 | 描述 | |---------------------------------------|---------|----|--------------------------------------------|-------------------------------------------------------------------| | path | string | 是 | 写入文件的oss路径。 | | | tmp_path | string | 否 | /tmp/seatunnel | 结果文件将首先写入tmp路径,然后使用`mv`将tmp-dir提交到目标dir。因此需要一个OSS目录。 | | bucket | string | 是 | - | | | access_key | string | 是 | - | | | access_secret | string | 是 | - | | | endpoint | string | 是 | - | | | custom_filename | boolean | 否 | false | 是否需要自定义文件名 | | file_name_expression | string | 否 | "${transactionId}" | 仅在custom_filename为true时使用 | | filename_time_format | string | 否 | "yyyy.MM.dd" | 仅在custom_filename为true时使用 | | file_format_type | string | 否 | "csv" | | | field_delimiter | string | 否 | '\001' | 仅当file_format_type为文本时使用 | | row_delimiter | string | 否 | "\n" | 仅当file_format_type为 `text`、`csv`、`json` 时使用 | | have_partition | boolean | 否 | false | 是否需要处理分区。 | | partition_by | array | 否 | - | 只有在have_partition为true时才使用 | | partition_dir_expression | string | 否 | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | 只有在have_partition为true时才使用 | | is_partition_field_write_in_file | boolean | 否 | false | 只有在have_partition为true时才使用 | | sink_columns | array | 否 | | 当此参数为空时,所有字段都是接收列 | | is_enable_transaction | boolean | 否 | true | | | batch_size | int | 否 | 1000000 | | | compress_codec | string | 否 | none | | | common-options | object | 否 | - | | | max_rows_in_memory | int | 否 | - | 仅当file_format_type为excel时使用。 | | sheet_name | string | 否 | Sheet${Random number} | 仅当file_format_type为excel时使用。 | | csv_string_quote_mode | enum | 否 | MINIMAL | 仅在file_format为csv时使用。 | | xml_root_tag | string | 否 | RECORDS | 仅在file_format为xml时使用。 | | xml_row_tag | string | 否 | RECORD | 仅在file_format为xml时使用。 | | xml_use_attr_format | boolean | 否 | - | 仅在file_format为xml时使用。 | | single_file_mode | boolean | 否 | false | 
每个并行处理只会输出一个文件。启用此参数后,batch_size将不会生效。输出文件名没有文件块后缀。 | | create_empty_file_when_no_data | boolean | 否 | false | 当上游没有数据同步时,仍然会生成相应的数据文件。 | | parquet_avro_write_timestamp_as_int96 | boolean | 否 | false | 仅在file_format为parquet时使用。 | | parquet_avro_write_fixed_as_int96 | array | 否 | - | 仅在file_format为parquet时使用。 | | enable_header_write | boolean | 否 | false | 仅当file_format_type为文本、csv时使用
    false:不写标头,true:写标头。 | | encoding | string | 否 | "UTF-8" | 仅当file_format_type为json、text、csv、xml时使用。 | | schema_save_mode | Enum | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | 在开启同步任务之前,对目标路径进行不同的处理 | | data_save_mode | Enum | 否 | APPEND_DATA | 在开启同步任务之前,对目标路径中的数据文件进行不同的处理 | | merge_update_event | boolean | 否 | false | 仅当file_format_type为canal_json、debezium_json、maxwell_json. | ### path [string] 目标目录路径是必需的。 ### bucket [string] oss文件系统的bucket地址,例如:`oss://tyrantlucifer-image-bed` ### access_key [string] oss文件系统的access_key。 ### access_secret [string] oss文件系统的access_secret。 ### endpoint [string] oss文件系统的endpoint端点。 ### custom_filename [boolean] 是否自定义文件名 ### file_name_expression [string] 仅在`custom_filename`为`true`时使用 `file_name_expression描述了将在`path`中创建的文件表达式。我们可以在`file_name_expression`中添加变量`${now}`或`${uuid}`,类似于`test_${uuid}_${now}`,`${now}`表示当前时间,其格式可以通过指定选项`filename_time_format`来定义。 请注意,如果`is_enable_transaction`为`true`,我们将自动添加`${transactionId}_`在文件的开头。 ### filename_time_format [String] 仅在`custom_filename`为`true`时使用` 当`file_name_expression`参数中的格式为`xxxx-${Now}时,`filename_time_format`可以指定路径的时间格式,默认值为`yyyy.MM.dd。常用的时间格式如下: | Symbol | Description | |--------|--------------------| | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | ### file_format_type [string] 我们支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `canal_json` `debezium_json` `maxwell_json` 请注意,最终文件名将以file_format_type的后缀结尾,文本文件的后缀为`txt`。 ### field_delimiter [string] 数据行中列之间的分隔符。只需要`文本`文件格式。 ### row_delimiter [string] 文件中行之间的分隔符。只需要 `text`、`csv`、`json` 文件格式。 ### have_partition [boolean] 是否需要处理分区。 ### partition_by [array] 仅当`have_partition`为`true`时使用。 根据所选字段对数据进行分区。 ### partition_dir_expression [string] 仅在`have_partition`为`true`时使用。 如果指定了`partition_by`,我们将根据分区信息生成相应的分区目录,并将最终文件放置在分区目录中。 默认的`partition_dir_expression`是`${k0}=${v0}/${k1}=${1v1}//${kn}=${vn}/``k0是第一个分区字段,v0是第一个划分字段的值。 ### is_partition_field_write_in_file [boolean] 
仅在`have_partition`为`true`时使用。 如果`is_partition_field_write_in_file`为`true`,则分区字段及其值将写入数据文件。 例如,如果你想写一个Hive数据文件,它的值应该是`false`。 ### sink_columns [array] 哪些列需要写入文件,默认值是从`Transform`或`Source`获取的所有列。 字段的顺序决定了文件实际写入的顺序。 ### is_enable_transaction [boolean] 如果`is_enable_transaction`为true,我们将确保数据在写入目标目录时不会丢失或重复。 请注意,如果`is_enable_transaction`为`true`,我们将自动添加`${transactionId}_`在文件的开头。 现在只支持`true`。 ### batch_size [int] 文件中的最大行数。对于SeaTunnel引擎,文件中的行数由`batch_size`和`checkpoint.interval`共同决定。如果`checkpoint.interval`的值足够大,sink writer将在文件中写入行,直到文件中的行大于`batch_size`。如果`checkpoint.interval`较小,则接收器写入程序将在新的检查点触发时创建一个新文件。 ### compress_codec [string] 文件的压缩编解码器和支持的详细信息如下所示: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc: `lzo` `snappy` `lz4` `zlib` `none` - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` 提示:excel类型不支持任何压缩格式 ### 通用选项 Sink插件常用参数,请参考[Sink common Options](../Sink common Options.md)了解详细信息。 ### max_rows_in_memory [int] 当文件格式为Excel时,内存中可以缓存的最大数据项数。 ### sheet_name [string] 编写工作簿的工作表 ### csv_string_quote_mode [string] 当文件格式为CSV时,CSV的字符串引用模式。 - ALL: 所有字符串字段都将被引用。 - MINIMAL: 引号字段包含特殊字符,如字段分隔符、引号字符或行分隔符字符串中的任何字符。 - NONE: 从不引用字段。当分隔符出现在数据中时,打印机会用转义符作为前缀。如果未设置转义符,格式验证将抛出异常。 ### xml_root_tag [string] 指定XML文件中根元素的标记名。 ### xml_row_tag [string] 指定XML文件中数据行的标记名称。 ### xml_use_attr_format [boolean] 指定是否使用标记属性格式处理数据。 ### parquet_avro_write_timestamp_as_int96 [boolean] 支持从时间戳写入Parquet INT96,仅适用于拼花地板文件。 ### parquet_avro_write_fixed_as_int96 [array] 支持从12-byte字段写入Parquet INT96,仅适用于拼花地板文件。 ### encoding [string] 仅当file_format_type为json、text、csv、xml时使用。 要写入的文件的编码。此参数将由`Charset.forName(encoding)`解析。 ### schema_save_mode [Enum] 在开启同步任务之前,对目标路径进行不同的处理。 选项介绍: `RECREATE_SCHEMA` :当路径不存在时创建。如果路径已存在,则删除路径并重新创建。 `CREATE_SCHEMA_WHEN_NOT_EXIST` :当路径不存在时创建,路径存在时使用路径。 `ERROR_WHEN_SCHEMA_NOT_EXIST` :当路径不存在时报错 `IGNORE` :忽略表的处理 ### data_save_mode [Enum] 在开启同步任务之前,对目标路径中的数据文件进行不同的处理。 选项介绍: `DROP_DATA`:使用路径但删除路径中的数据文件。 `APPEND_DATA`:使用路径,并在路径中添加新文件以写入数据。 
`ERROR_WHEN_DATA_EXISTS`:当路径中存在数据文件时,将报错。 ### merge_update_event [boolean] 仅当file_format_type为canal_json、debezium_json、maxwell_json时使用. 设置成true,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 会合并成 UPDATE; 设置成false,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 不会合并; ## 如何创建Oss数据同步作业 以下示例演示了如何创建从假数据源读取数据并写入的数据同步作业 把它发送到Oss: 对于具有`have_partition`、`custom_filename`和`sink_columns`的文本文件格式 ```bash # 设置要执行的任务的基本配置 env { parallelism = 1 job.mode = "BATCH" } # 创建产品数据源 source { FakeSource { schema = { fields { name = string age = int } } } } # 将数据写入Oss sink { OssFile { path="/seatunnel/sink" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` 适用于带有`have_partition`和`sink_columns`的parquet文件格式 ```bash # 设置要执行的任务的基本配置 env { parallelism = 1 job.mode = "BATCH" } # Create a source to product data source { FakeSource { schema = { fields { name = string age = int } } } } # 将数据写入Oss sink { OssFile { path = "/seatunnel/sink" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true file_format_type = "parquet" sink_columns = ["name","age"] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` 对于orc文件格式的简单配置 ```bash # 设置要执行的任务的基本配置 env { parallelism = 1 job.mode = "BATCH" } # Create a source to product data source { FakeSource { schema = { 
fields { name = string age = int } } } } # 将数据写入Oss sink { OssFile { path="/seatunnel/sink" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "orc" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` ### enable_header_write [boolean] 仅当file_format_type为`text` `csv`时使用。false:不写标头,true:写标头。 ### 多表 用于从上游提取source元数据, 您可以在路径中使用`${database_name}`, `${table_name}` 和 `${schema_name}`。 ```bash env { parallelism = 1 spark.app.name = "SeaTunnel" spark.executor.instances = 2 spark.executor.cores = 1 spark.executor.memory = "1g" spark.master = local job.mode = "BATCH" } source { FakeSource { tables_configs = [ { schema = { table = "fake1" fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp } } } }, { schema = { table = "fake2" fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp } } } } ] } } sink { OssFile { bucket = "oss://whale-ops" access_key = "xxxxxxxxxxxxxxxxxxx" access_secret 
= "xxxxxxxxxxxxxxxxxxx" endpoint = "https://oss-accelerate.aliyuncs.com" path = "/tmp/fake_empty/text/${table_name}" row_delimiter = "\n" partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true file_name_expression = "${transactionId}_${now}" file_format_type = "text" filename_time_format = "yyyy.MM.dd" is_enable_transaction = true compress_codec = "lzo" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` ### 提示 > 1.[SeaTunnel部署方案](../../getting-started/locally/deployment.md). ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/OssJindoFile.md ================================================ import ChangeLog from '../changelog/connector-file-oss-jindo.md'; # OssJindoFile > OssJindo file sink 连接器 ## 描述 使用jindo-api将数据输出到oss文件系统。 :::提示 您需要下载[jindosdk-4.6.1.tar.gz](https://jindodata-binary.oss-cn-shanghai.aliyuncs.com/release/4.6.1/jindosdk-4.6.1.tar.gz) 然后解压缩,将jindo-sdk-4.6.1.jar和jindo-core-4.6.1.jar从lib复制到${SEATUNNEL_HOME}/lib。 如果你使用spark/flink,为了使用这个连接器,你必须确保你的spark/flink集群已经集成了hadoop。测试的hadoop版本是2.x。 如果你使用SeaTunnel引擎,当你下载并安装SeaTunnel引擎时,它会自动集成hadoop jar。您可以在${SEATUNNEL_HOME}/lib下检查jar包以确认这一点。 为了支持更多的文件类型,我们进行了一些权衡,因此我们使用HDFS协议对OSS进行内部访问,而这个连接器需要一些hadoop依赖。它只支持hadoop版本**2.9.X+**。 ::: ## 关键特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 默认情况下,我们使用2PC commit来确保“精确一次” - [x] 文件格式类型 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] canal_json - [x] debezium_json - [x] maxwell_json ## 选项 | 名称 | 类型 | 必需 | 默认值 | 描述 | |---------------------------------------|---------|----|--------------------------------------------|-----------------------------------------------------------| | path | string | 是 | - | | | tmp_path | string | 否 | /tmp/seatunnel | 
结果文件将首先写入临时路径,然后使用`mv`将tmp-dir提交到目标目录。需要一个OSS 目录。 | | bucket | string | 是 | - | | | access_key | string | 是 | - | | | access_secret | string | 是 | - | | | endpoint | string | 是 | - | | | custom_filename | boolean | 否 | false | 是否需要自定义文件名 | | file_name_expression | string | 否 | "${transactionId}" | 仅在custom_filename为true时使用 | | filename_time_format | string | 否 | "yyyy.MM.dd" | 仅在custom_filename为true时使用 | | file_format_type | string | 否 | "csv" | | | field_delimiter | string | 否 | '\001' | 仅当file_format_type为text时使用 | | row_delimiter | string | 否 | "\n" | 仅当file_format_type为 `text`、`csv`、`json` 时使用 | | have_partition | boolean | 否 | false | 是否需要处理分区。 | | partition_by | array | 否 | - | 只有在have_partition为true时才使用 | | partition_dir_expression | string | 否 | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | 只有在have_partition为true时才使用 | | is_partition_field_write_in_file | boolean | 否 | false | 只有在have_partition为true时才使用 | | sink_columns | array | 否 | | 当此参数为空时,所有字段都是Sink列 | | is_enable_transaction | boolean | 否 | true | | | batch_size | int | 否 | 1000000 | | | compress_codec | string | 否 | none | | | common-options | object | 否 | - | | | max_rows_in_memory | int | 否 | - | 仅当file_format_type为excel时使用。 | | sheet_name | string | 否 | Sheet${Random number} | 仅当file_format_type为excel时使用。 | | csv_string_quote_mode | enum | 否 | MINIMAL | 仅在file_format为csv时使用。 | | xml_root_tag | string | 否 | RECORDS | 仅在file_format为xml时使用。 | | xml_row_tag | string | 否 | RECORD | 仅在file_format为xml时使用。 | | xml_use_attr_format | boolean | 否 | - | 仅在file_format为xml时使用。 | | single_file_mode | boolean | 否 | false | 每个并行处理只会输出一个文件。启用此参数后,batch_size将不会生效。输出文件名没有文件块后缀。 | | create_empty_file_when_no_data | boolean | 否 | false | 当上游没有数据同步时,仍然会生成相应的数据文件。 | | parquet_avro_write_timestamp_as_int96 | boolean | 否 | false | 仅在file_format为parquet时使用。 | | parquet_avro_write_fixed_as_int96 | array | 否 | - | 仅在file_format为parquet时使用。 | | encoding | string | 否 | "UTF-8" | 仅当file_format_type为json、text、csv、xml时使用。 | | 
merge_update_event | boolean | 否 | false | 仅当file_format_type为canal_json、debezium_json、maxwell_json时使用。 | ### path [string] 目标目录路径是必需的。 ### bucket [string] oss文件系统的bucket地址,例如:`oss://tyrantlucifer-image-bed` ### access_key [string] oss文件系统的access_key ### access_secret [string] oss文件系统的access_secret ### endpoint [string] oss文件系统的端点。 ### custom_filename [boolean] 是否自定义文件名 ### file_name_expression [string] 仅在“custom_filename”为“true”时使用 `file_name_expression`描述了将在`path`中创建的文件表达式。我们可以在“file_name_expression”中添加变量“${now}”或“${uuid}”,类似于`test_${uuid}_${now}`,`${now}`表示当前时间,其格式可以通过指定选项`filename_time_format`来定义。 请注意,如果`is_enable_transaction`为`true`,我们将自动添加`${transactionId}_`在文件的开头。 ### filename_time_format [string] 仅在“custom_filename”为“true”时使用 当`file_name_expression`参数中的格式为`xxxx-${now}`时,`filename_time_format`可以指定路径的时间格式,默认值为`yyyy.MM.dd`。常用的时间格式如下: | Symbol | Description | |--------|--------------------| | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | ### file_format_type [string] 我们支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `canal_json` `debezium_json` `maxwell_json` 请注意,最终文件名将以file_format_type的后缀结尾,文本文件的后缀为“txt”。 ### field_delimiter [string] 数据行中列之间的分隔符。只需要“text”文件格式。 ### row_delimiter [string] 文件中行之间的分隔符。只需要 `text`、`csv`、`json` 文件格式。 ### have_partition [boolean] 是否需要处理分区。 ### partition_by [array] 仅在“have_partition”为“true”时使用。 根据所选字段对数据进行分区。 ### partition_dir_expression [string] 仅在“have_partition”为“true”时使用。 如果指定了`partition_by`,我们将根据分区信息生成相应的分区目录,并将最终文件放置在分区目录中。 默认的`partition_dir_expression`是`${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`。`k0`是第一个分区字段,`v0`是第一个分区字段的值。 ### is_partition_field_write_in_file [boolean] 仅在“have_partition”为“true”时使用。 如果`is_partition_field_write_in_file`为`true`,则分区字段及其值将写入数据文件。 例如,如果你想写一个Hive数据文件,它的值应该是“false”。 ### sink_columns [array] 哪些列需要写入文件,默认值是从“Transform”或“Source”获取的所有列。 字段的顺序决定了文件实际写入的顺序。 ### is_enable_transaction [boolean]
如果`is_enable_transaction`为true,我们将确保数据在写入目标目录时不会丢失或重复。 请注意,如果`is_enable_transaction`为`true`,我们将自动添加`${transactionId}_`在文件的开头。 现在只支持“true”。 ### batch_size [int] 文件中的最大行数。对于SeaTunnel引擎,文件中的行数由“batch_size”和“checkpoint.interval”共同决定。如果“checkpoint.interval”的值足够大,sink writer将在文件中写入行,直到文件中的行大于“batch_size”。如果“checkpoint.interval”较小,则接收器写入程序将在新的检查点触发时创建一个新文件。 ### compress_codec [string] 文件的压缩编解码器和支持的详细信息如下所示: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc: `lzo` `snappy` `lz4` `zlib` `none` - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` 提示:excel类型不支持任何压缩格式 ### common options Sink插件常用参数,请参考[Sink common Options](../common-options/sink-common-options.md)了解详细信息。 ### max_rows_in_memory [int] 当文件格式为Excel时,内存中可以缓存的最大数据项数。 ### sheet_name [string] 编写工作簿的工作表 ### csv_string_quote_mode [string] 当文件格式为CSV时,CSV的字符串引用模式。 - ALL: 所有字符串字段都将被引用。 - MINIMAL: 引号字段包含特殊字符,如字段分隔符、引号字符或行分隔符字符串中的任何字符。 - NONE: 从不引用字段。当分隔符出现在数据中时,会在其前面加上转义符。如果未设置转义符,格式验证将抛出异常。 ### xml_root_tag [string] 指定XML文件中根元素的标记名。 ### xml_row_tag [string] 指定XML文件中数据行的标记名称。 ### xml_use_attr_format [boolean] 指定是否使用标记属性格式处理数据。 ### parquet_avro_write_timestamp_as_int96 [boolean] 支持从时间戳写入Parquet INT96,仅适用于parquet文件。 ### parquet_avro_write_fixed_as_int96 [array] 支持从12字节字段写入Parquet INT96,仅适用于parquet文件。 ### encoding [string] 仅当file_format_type为json、text、csv、xml时使用。 要写入的文件的编码。此参数将由`Charset.forName(encoding)`解析。 ### merge_update_event [boolean] 仅当file_format_type为canal_json、debezium_json、maxwell_json时使用。
设置成true,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 会合并成 UPDATE; 设置成false,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 不会合并; ## 例子 适用于具有“have_partition”、“custom_filename”和“sink_columns”的文本文件格式 ```hocon OssJindoFile { path="/seatunnel/sink" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true } ``` 适用于带有`sink_columns的parquet文件格式 ```hocon OssJindoFile { path = "/seatunnel/sink" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "parquet" sink_columns = ["name","age"] } ``` 对于orc文件格式的简单配置 ```bash OssJindoFile { path="/seatunnel/sink" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxx" access_secret = "xxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "orc" } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Paimon.md ================================================ import ChangeLog from '../changelog/connector-paimon.md'; # Paimon > Paimon 数据连接器 ## 描述 Apache Paimon数据连接器。支持cdc写以及自动建表。 ### SeaTunnel与Paimon版本对照 | Seatunnel Version | Paimon Version | |-------------------|------------------| | 2.3.2 - 2.3.3 | 0.4-SNAPSHOT | | 2.3.4 | 0.6-SNAPSHOT | | 2.3.5 - 2.3.11 | 0.7.0-incubating | | 2.3.12 - 2.3.13 | 1.1.1 | ### 从 0.7 版本升级到 1.1.1 版本的注意事项 1. **备份建议** 尽管存在兼容性保障,但在从 0.7 版本开始升级前,仍强烈建议备份关键数据,尤其是元数据目录。 2. **逐步升级流程** - **测试环境验证**:首先在测试环境中验证(从 0.7 版本开始的)升级过程。 - **更新 JAR 文件**:将 Paimon 的 JAR 文件替换为 1.1.1 版本。 - **自动格式升级**:系统会自动识别并升级 0.7 版本中使用的文件格式。 3. 
**配置检查** 检查配置以确认是否存在 0.7 版本适用的已弃用选项。尽管大多数配置保持向后兼容,但已弃用的设置可能需要更新以适配 1.1.1 版本。 4. **升级后验证** 从 0.7 版本升级到 1.1.1 版本后,需验证以下内容: - **读写操作**:确保基于 0.7 版本继承的数据结构,数据写入和读取流程正常运行。 - **查询性能**:考虑到 0.7 与 1.1.1 版本间底层机制(如分桶管理)的变化,确认查询响应时间符合预期。 - **新功能验证**:测试所有新增功能(如增强的压实机制、时间旅行等),确保其与从 0.7 版本迁移的数据兼容并正常工作。 **注意**:遵循这些步骤有助于降低风险,确保从 0.7 版本平稳过渡到稳定版本 1.1.1。 ## 支持的数据源信息 | 数据源 | 依赖 | Maven | |--------|-----------|---------------------------------------------------------------------------| | Paimon | hive-exec | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) | | Paimon | libfb303 | [Download](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) | ## 数据源依赖 > 为了兼容不同版本的Hadoop和Hive,在项目pom文件中Hive -exec的作用域为provided,所以如果您使用Flink引擎,首先可能需要将以下Jar包添加到/lib目录下,如果您使用Spark引擎并与Hadoop集成,则不需要添加以下Jar包。 ``` hive-exec-xxx.jar libfb303-xxx.jar ``` > 有些版本的hive-exec包没有libfb303-xxx.jar,所以您还需要手动导入Jar包。 ## 主要特性 - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## 连接器选项 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |------------------------------|------|------|------------------------------|------------------------------------------------------------------------------------------------------| | warehouse | 字符串 | 是 | - | Paimon warehouse路径 | | catalog_type | 字符串 | 否 | filesystem | Paimon的catalog类型,目前支持filesystem和hive | | catalog_uri | 字符串 | 否 | - | Paimon catalog的uri,仅当catalog_type为hive时需要配置 | | database | 字符串 | 是 | - | 数据库名称 | | table | 字符串 | 是 | - | 表名 | | user | 字符串 | 否 | - | paimon开启权限后,用户名 | | password | 字符串 | 否 | - | paimon开启权限后,用户名对应密码 | | hdfs_site_path | 字符串 | 否 | - | hdfs-site.xml文件路径 | | schema_save_mode | 枚举 | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | Schema保存模式 | | data_save_mode | 枚举 | 否 | APPEND_DATA | 数据保存模式 | | paimon.table.primary-keys | 字符串 | 否 | - | 主键字段列表,联合主键使用逗号分隔(注意:分区字段需要包含在主键字段中) | | paimon.table.partition-keys | 字符串 | 否 | - | 分区字段列表,多字段使用逗号分隔 | | paimon.table.write-props | Map | 否 | - | Paimon表初始化指定的属性, 
[参考](https://paimon.apache.org/docs/master/maintenance/configurations/#coreoptions) | | paimon.hadoop.conf | Map | 否 | - | Hadoop配置文件属性信息 | | paimon.hadoop.conf-path | 字符串 | 否 | - | Hadoop配置文件目录,用于加载'core-site.xml', 'hdfs-site.xml', 'hive-site.xml'文件配置 | | paimon.table.non-primary-key | Boolean | 否 | false | 控制创建主键表或者非主键表. 当为true时,创建非主键表, 为false时,创建主键表 | | branch | 字符串 | 否 | main | 要写入数据的Paimon表分支名称。如果指定的分支不存在,将抛出异常。 | ## 批模式下的checkpoint 当您在批处理模式下将`checkpoint.interval`设置为大于0的值时,在写入一定数量的记录后checkpoint触发时,paimon连接器将把数据提交到paimon表。此时,写入的数据是可见的。 但是,如果您没有在批处理模式下设置`checkpoint.interval`,则在写入所有记录之后,paimon sink连接器将提交数据。到批任务完成之前,写入的数据都是不可见的。 ## 更新日志 你必须配置`changelog-producer=input`来启用paimon表的changelog产生模式。如果你使用了paimon sink的自动建表功能,你可以在`paimon.table.write-props`中指定这个属性。 Paimon表的changelog产生模式有[四种](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/),分别是`none`、`input`、`lookup` 和 `full-compaction`。 目前支持全部`changelog-producer`模式。默认是`none`模式。 * [`none`](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/#none) * [`input`](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/#input) * [`lookup`](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/#lookup) * [`full-compaction`](https://paimon.apache.org/docs/master/primary-key-table/changelog-producer/#full-compaction) > 注意: > 当你使用流模式去读paimon表的数据时,不同模式将会产生[不同的结果](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/source/Paimon.md#changelog)。 ## 文件系统 Paimon连接器支持向多文件系统写入数据。目前支持的文件系统有hdfs和s3。 如果您使用s3文件系统,您可以在`paimon.hadoop.conf`选项中配置`fs.s3a.access-key`, `fs.s3a.secret-key`, `fs.s3a.endpoint`, `fs.s3a.path.style.access`, `fs.s3a.aws.credentials.provider`属性。 除此之外,warehouse应该以`s3a://`开头。 ## 模式演变 Cdc采集支持有限数量的模式更改。目前支持的模式更改包括: * 添加列。 * 修改列。更具体地说,如果修改列类型,则支持以下更改: * 将字符串类型(char、varchar、text)更改为另一种长度更长的字符串类型, * 将二进制类型(binary, varbinary, blob)更改为另一种长度更长的二进制类型, * 将整数类型(tinyint, smallint, int, bigint)更改为另一种范围更大的整数类型, *
将浮点类型(float、double)更改为另一种范围更大的浮点类型, > 注意: > > 如果{oldType}和{newType}属于同一个类型族,但旧类型的精度高于新类型。忽略这个转换。 * 删除列。 * 更改列。 ## 示例 ### 模式演变 ```hocon env { # You can set engine configuration here parallelism = 5 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { Paimon { warehouse = "file:///tmp/paimon" database = "mysql_to_paimon" table = "products" } } ``` ### 单表 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role"] } } transform { } sink { Paimon { catalog_name="seatunnel_test" warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" database="seatunnel" table="role" } } ``` ### 单表(基于S3文件系统) ```hocon env { execution.parallelism = 1 job.mode = "BATCH" } source { FakeSource { schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp } } } } sink { Paimon { warehouse = "s3a://test/" database = "seatunnel_namespace11" table = "st_test" paimon.hadoop.conf = { fs.s3a.access-key=G52pnxg67819khOZ9ezX fs.s3a.secret-key=SHJuAQqHsLrgZWikvMa3lJf5T0NfM5LMFliJh9HF fs.s3a.endpoint="http://minio4:9000" fs.s3a.path.style.access=true fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider } } } ``` ### 单表(指定hadoop HA配置和kerberos配置) ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = 
"******" table-names = ["seatunnel.role"] } } transform { } sink { Paimon { catalog_name="seatunnel_test" warehouse="hdfs:///tmp/seatunnel/paimon/hadoop-sink/" database="seatunnel" table="role" paimon.hadoop.conf = { fs.defaultFS = "hdfs://nameservice1" dfs.nameservices = "nameservice1" dfs.ha.namenodes.nameservice1 = "nn1,nn2" dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" dfs.client.use.datanode.hostname = "true" security.kerberos.login.principal = "your-kerberos-principal" security.kerberos.login.keytab = "your-kerberos-keytab-path" } } } ``` ### 单表(指定hadoop HA配置和指定hadoop用户名) ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role"] } } transform { } sink { Paimon { catalog_name="seatunnel_test" warehouse="hdfs:///tmp/seatunnel/paimon/hadoop-sink/" database="seatunnel" table="role" paimon.hadoop.conf = { hadoop_user_name = "hdfs" fs.defaultFS = "hdfs://nameservice1" dfs.nameservices = "nameservice1" dfs.ha.namenodes.nameservice1 = "nn1,nn2" dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" dfs.client.use.datanode.hostname = "true" security.kerberos.login.principal = "your-kerberos-principal" security.kerberos.login.keytab = "your-kerberos-keytab-path" } } } ``` ### 单表(使用Hive catalog) ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { schema = { fields { pk_id = bigint name = string score = int } primaryKey { name = "pk_id" columnNames = [pk_id] } } rows = [ { kind = INSERT fields = [1, "A", 
100] }, { kind = INSERT fields = [2, "B", 100] }, { kind = INSERT fields = [3, "C", 100] }, { kind = INSERT fields = [3, "C", 100] }, { kind = INSERT fields = [3, "C", 100] }, { kind = INSERT fields = [3, "C", 100] } { kind = UPDATE_BEFORE fields = [1, "A", 100] }, { kind = UPDATE_AFTER fields = [1, "A_1", 100] }, { kind = DELETE fields = [2, "B", 100] } ] } } sink { Paimon { schema_save_mode = "RECREATE_SCHEMA" catalog_name="seatunnel_test" catalog_type="hive" catalog_uri="thrift://hadoop04:9083" warehouse="hdfs:///tmp/seatunnel" database="seatunnel_test" table="st_test3" paimon.hadoop.conf = { fs.defaultFS = "hdfs://nameservice1" dfs.nameservices = "nameservice1" dfs.ha.namenodes.nameservice1 = "nn1,nn2" dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" dfs.client.use.datanode.hostname = "true" } } } ``` ### 指定paimon的写属性的单表 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role"] } } sink { Paimon { catalog_name="seatunnel_test" warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" database="seatunnel" table="role" paimon.table.write-props = { bucket = 2 file.format = "parquet" } paimon.table.partition-keys = "dt" paimon.table.primary-keys = "pk_id,dt" } } ``` #### 使用`changelog-producer`属性写入 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role"] } } sink { Paimon { catalog_name = "seatunnel_test" warehouse = "file:///tmp/seatunnel/paimon/hadoop-sink/" database = "seatunnel" table = "role" paimon.table.write-props = { changelog-producer = 
full-compaction changelog-tmp-path = /tmp/paimon/changelog } } } ``` ### 动态分桶paimon单表 只有在主键表并指定bucket = -1时才会生效 > 注意: > - 目前只支持普通动态桶模式(主键包含所有分区字段)。 > - 在集群环境下运行时`parallelism`必须为`1`, 否则可能存在数据重复问题。 #### 核心参数:[参考官网](https://paimon.apache.org/docs/master/primary-key-table/data-distribution/#dynamic-bucket) | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |--------------------------------|------|------|----------|------------------| | dynamic-bucket.target-row-num | long | 是 | 2000000L | 控制一个bucket的写入的行数 | | dynamic-bucket.initial-buckets | int | 否 | | 控制初始化桶的数量 | ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role"] } } sink { Paimon { catalog_name="seatunnel_test" warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" database="seatunnel" table="role" paimon.table.write-props = { bucket = -1 dynamic-bucket.target-row-num = 50000 } paimon.table.partition-keys = "dt" paimon.table.primary-keys = "pk_id,dt" } } ``` ### 多表 #### 示例1 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { Paimon { catalog_name="seatunnel_test" warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" database="${database_name}" table="${table_name}" } } ``` #### 示例2 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = oracle.jdbc.driver.OracleDriver url = "jdbc:oracle:thin:@localhost:1521/XE" user = testUser password = testPassword table_list = [ { table_path = "TESTSCHEMA.TABLE_1" }, { table_path = "TESTSCHEMA.TABLE_2" } ] } } transform { } sink { Paimon { catalog_name="seatunnel_test" warehouse="file:///tmp/seatunnel/paimon/hadoop-sink/" database="${schema_name}_test" table="${table_name}_test" } } ``` ###
paimon开启权限认证 #### 示例1 ```hocon env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { Mysql-CDC { url = "jdbc:mysql://127.0.0.1:3306/seatunnel" username = "root" password = "******" table-names = ["seatunnel.role","seatunnel.user","galileo.Bucket"] } } transform { } sink { Paimon { catalog_name = "seatunnel_test" warehouse = "file:///tmp/seatunnel/paimon/hadoop-sink/" database = "${database_name}" table = "${table_name}" user = "paimon" password = "******" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Phoenix.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Phoenix > Phoenix 数据接收器 ## 描述 该接收器是通过 [Jdbc数据连接器](Jdbc.md)来写Phoenix数据,支持批和流两种模式。测试的Phoenix版本为4.xx和5.xx。 在底层实现上,通过Phoenix的jdbc驱动,执行upsert语句向HBase写入数据。 使用Java JDBC连接Phoenix有两种方式:其一是使用JDBC连接zookeeper,其二是通过JDBC瘦客户端连接查询服务器。 > 提示1: 该接收器默认使用的是(thin)驱动jar包。如果需要使用(thick)驱动或者其他版本的Phoenix(thin)驱动,需要重新编译jdbc数据接收器模块。 > > 提示2: 该接收器还不支持精准一次语义(因为Phoenix还不支持XA事务)。 ## 主要特性 - [ ] [精准一次](../../introduction/concepts/connector-v2-features.md) ## 接收器选项 ### driver [string] phoenix(thick)驱动:`org.apache.phoenix.jdbc.PhoenixDriver` phoenix(thin)驱动:`org.apache.phoenix.queryserver.client.Driver` ### url [string] phoenix(thick)驱动:`jdbc:phoenix:localhost:2182/hbase` phoenix(thin)驱动:`jdbc:phoenix:thin:url=http://localhost:8765;serialization=PROTOBUF` ### common options Sink插件常用参数,请参考[Sink常用选项](../common-options/sink-common-options.md)获取更多细节信息。 ## 示例 thick驱动: ``` Jdbc { driver = org.apache.phoenix.jdbc.PhoenixDriver url = "jdbc:phoenix:localhost:2182/hbase" query = "upsert into test.sink(age, name) values(?, ?)" } ``` thin驱动: ``` Jdbc { driver = org.apache.phoenix.queryserver.client.Driver url = "jdbc:phoenix:thin:url=http://spark_e2e_phoenix_sink:8765;serialization=PROTOBUF" query = "upsert into test.sink(age, name) values(?, ?)" } ``` ## 变更日志 ================================================ FILE: 
docs/zh/connectors/sink/PostgreSql.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # PostgreSql > JDBC PostgreSql 数据接收器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 通过 JDBC 写入数据。支持批处理模式和流式模式,支持并发写入,支持精确一次语义(使用 XA 事务保证)。 ## 使用依赖 ### 对于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/org.postgresql/postgresql) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 ### 对于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/org.postgresql/postgresql) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 ## 主要特性 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [变更数据捕获(CDC)](../../introduction/concepts/connector-v2-features.md) > 使用 `XA 事务` 来确保 `精确一次`。因此,仅对支持 `XA 事务` 的数据库支持 `精确一次`。您可以设置 `is_exactly_once=true` 来启用此功能。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动 | URL | Maven | |--------------|-----------------------------------------------------|----------------------|---------------------------------------|--------------------------------------------------------------------------| | PostgreSQL | 不同的依赖版本有不同的驱动类。 | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [下载](https://mvnrepository.com/artifact/org.postgresql/postgresql) | | PostgreSQL | 如果您想在 PostgreSQL 中处理 GEOMETRY 类型。 | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [下载](https://mvnrepository.com/artifact/net.postgis/postgis-jdbc) | ## 数据库依赖 > 请下载与 'Maven' 对应的支持列表,并将其复制到 '$SEATUNNEL_HOME/plugins/jdbc/lib/' 工作目录中。
    > 例如 PostgreSQL 数据源:`cp postgresql-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/`
    > 如果您想在 PostgreSQL 中处理 GEOMETRY 类型,请将 `postgresql-xxx.jar` 和 `postgis-jdbc-xxx.jar` 添加到 `$SEATUNNEL_HOME/plugins/jdbc/lib/` 中。 ## 数据类型映射 | PostgreSQL 数据类型 | SeaTunnel 数据类型 | |--------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| | BOOL
    | BOOLEAN | | _BOOL
    | ARRAY<BOOLEAN> | | BYTEA
    | BYTES | | _BYTEA
    | ARRAY<TINYINT> | | INT2
    SMALLSERIAL
    INT4
    SERIAL
    | INT | | _INT2
    _INT4
    | ARRAY<INT> | | INT8
    BIGSERIAL
    | BIGINT | | _INT8
    | ARRAY<BIGINT> | | FLOAT4
    | FLOAT | | _FLOAT4
    | ARRAY<FLOAT> | | FLOAT8
    | DOUBLE | | _FLOAT8
    | ARRAY<DOUBLE> | | NUMERIC(指定列的列大小>0) | DECIMAL(指定列的列大小,获取指定列小数点右侧的数字位数) | | NUMERIC(指定列的列大小<0) | DECIMAL(38, 18) | | BPCHAR
    CHARACTER
    VARCHAR
    TEXT
    GEOMETRY
    GEOGRAPHY
    JSON
    JSONB
    UUID | STRING | | _BPCHAR
    _CHARACTER
    _VARCHAR
    _TEXT | ARRAY<STRING> | | TIMESTAMP
    | TIMESTAMP | | TIME
    | TIME | | DATE
    | DATE | | 其他数据类型 | 目前不支持 | ## 选项 | 名称 | 类型 | 必填 | 默认 | 描述 | |------------------------------|---------|------|------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC 连接的 URL。参见示例:jdbc:postgresql://localhost:5432/test
    如果您使用 json 或 jsonb 类型插入,请添加 jdbc url 字符串 `stringtype=unspecified` 选项。 | | driver | String | 是 | - | 用于连接远程数据源的 JDBC 类名,
    如果使用 PostgreSQL,则该值为 `org.postgresql.Driver`。 | | username | String | 否 | - | 连接实例的用户名。 | | password | String | 否 | - | 连接实例的密码。 | | query | String | 否 | - | 使用此 SQL 将上游输入数据写入数据库。例如 `INSERT ...`,`query` 的优先级更高。 | | database | String | 否 | - | 使用此 `database` 和 `table-name` 自动生成 SQL,并接收上游输入数据写入数据库。
    此选项与 `query` 互斥,并具有更高的优先级。 | | table | String | 否 | - | 使用数据库和此表名自动生成 SQL,并接收上游输入数据写入数据库。
    此选项与 `query` 互斥,并具有更高的优先级。表参数可以填写一个尚不存在的表的名称,最终将作为创建表的表名,并支持变量(`${table_name}`,`${schema_name}`)。替换规则: `${schema_name}` 将替换为传递给目标端的 SCHEMA 名称,`${table_name}` 将替换为传递给目标端的表名称。 | | primary_keys | Array | 否 | - | 此选项用于支持在自动生成 SQL 时进行 `insert`,`delete` 和 `update` 操作。 | | connection_check_timeout_sec | Int | 否 | 30 | 用于验证连接的数据库操作完成的等待时间(秒)。 | | max_retries | Int | 否 | 0 | 提交失败的重试次数(executeBatch)。 | | batch_size | Int | 否 | 1000 | 对于批量写入,当缓冲记录的数量达到 `batch_size` 或时间达到 `checkpoint.interval`
    时,数据将刷新到数据库。 | | is_exactly_once | Boolean | 否 | false | 是否启用精确一次语义,将使用 XA 事务。如果启用,您需要
    设置 `xa_data_source_class_name`。 | | generate_sink_sql | Boolean | 否 | false | 根据要写入的数据库表生成 SQL 语句。 | | xa_data_source_class_name | String | 否 | - | 数据库驱动的 XA 数据源类名,例如,PostgreSQL 是 `org.postgresql.xa.PGXADataSource`,并
    请参阅附录以获取其他数据源。 | | max_commit_attempts | Int | 否 | 3 | 事务提交失败的重试次数。 | | transaction_timeout_sec | Int | 否 | -1 | 事务开启后的超时时间,默认值为 -1(永不超时)。注意设置超时可能会影响
    精确一次语义。 | | auto_commit | Boolean | 否 | true | 默认启用自动事务提交。 | | field_ide | String | 否 | - | 识别字段在从源到 Sink 的同步时是否需要转换。`ORIGINAL` 表示无需转换;`UPPERCASE` 表示转换为大写;`LOWERCASE` 表示转换为小写。 | | properties | Map | 否 | - | 附加连接配置参数,当 properties 和 URL 具有相同参数时,优先级由
    驱动的具体实现决定。例如,在 MySQL 中,properties 优先于 URL。 | | common-options | | 否 | - | Sink 插件的公共参数,请参阅 [Sink 公共选项](../common-options/sink-common-options.md) 以获取详细信息。 | | schema_save_mode | Enum | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | 在同步任务开启之前,根据目标端现有表结构选择不同处理方案。 | | data_save_mode | Enum | 否 | APPEND_DATA | 在同步任务开启之前,根据目标端现有数据选择不同处理方案。 | | custom_sql | String | 否 | - | 当 `data_save_mode` 选择 `CUSTOM_PROCESSING` 时,您应该填写 `CUSTOM_SQL` 参数。此参数通常填入可执行的 SQL。SQL 将在同步任务之前执行。 | | enable_upsert | Boolean | 否 | true | 通过主键存在启用 upsert,如果任务没有重复数据,设置此参数为 `false` 可以加快数据导入。 | ### table [字符串] 使用 `database` 和此 `table-name` 自动生成 SQL,并接收上游输入数据写入数据库。 此选项与 `query` 互斥,并具有更高的优先级。 表参数可以填写一个不想的表的名称,最终将作为创建表的表名,并支持变量(`${table_name}`,`${schema_name}`)。替换规则:`${schema_name}` 将替换为传递给目标端的 SCHEMA 名称,`${table_name}` 将替换为传递给目标端的表名称。 例如: 1. `${schema_name}.${table_name}_test` 2. `dbo.tt_${table_name}_sink` 3. `public.sink_table` ### schema_save_mode [枚举] 在同步任务开启之前,根据目标端现有表结构选择不同处理方案。 选项介绍: `RECREATE_SCHEMA` :当表不存在时将创建,保存时删除并重建。 `CREATE_SCHEMA_WHEN_NOT_EXIST` :当表不存在时创建,保存时跳过。 `ERROR_WHEN_SCHEMA_NOT_EXIST` :当表不存在时报告错误。 `IGNORE` :忽略对表的处理。 ### data_save_mode [枚举] 在同步任务开启之前,根据目标端现有数据选择不同处理方案。 选项介绍: `DROP_DATA`:保留数据库结构并删除数据。 `APPEND_DATA`:保留数据库结构,保留数据。 `CUSTOM_PROCESSING`:用户定义处理。 `ERROR_WHEN_DATA_EXISTS`:当存在数据时报告错误。 ### custom_sql [字符串] 当 `data_save_mode` 选择 `CUSTOM_PROCESSING` 时,您应该填写 `CUSTOM_SQL` 参数。此参数通常填入可以执行的 SQL。SQL 将在同步任务之前执行。 ### 提示 > 如果未设置 `partition_column`,它将以单线程并发运行;如果设置了 `partition_column`,它将根据任务的并发性并行执行。 ## 任务示例 ### 简单示例 > 此示例定义了一个 SeaTunnel 同步任务,通过 FakeSource 自动生成数据并将其发送到 JDBC Sink。FakeSource 生成总共 16 行数据(`row.num=16`),每行有两个字段,`name`(字符串类型)和 `age`(整数类型)。最终目标表 `test_table` 也将包含 16 行数据。在运行此作业之前,您需要在 PostgreSQL 中创建数据库 `test` 和表 `test_table`。如果您还未安装和部署 SeaTunnel,请按照 [安装 SeaTunnel](../../getting-started/locally/deployment.md) 中的说明进行安装和部署。然后按照 [快速开始 SeaTunnel 引擎](../../getting-started/locally/quick-start-seatunnel-engine.md) 中的说明运行此作业。 ``` # Defining the runtime environment env { parallelism = 1 job.mode = 
"BATCH" } source { FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } # If you would like to get more information about how to configure seatunnel and see full list of source plugins, # please go to https://seatunnel.apache.org/docs/connectors/source } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transforms } sink { jdbc { # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option url = "jdbc:postgresql://localhost:5432/test" driver = "org.postgresql.Driver" username = root password = 123456 query = "insert into test_table(name,age) values(?,?)" } # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, # please go to https://seatunnel.apache.org/docs/connectors/sink } ``` ### 生成 Sink SQL > 此示例不需要编写复杂的 SQL 语句,您可以配置数据库名称和表名称,系统将自动为您生成添加语句。 ``` sink { Jdbc { # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option url = "jdbc:postgresql://localhost:5432/test" driver = org.postgresql.Driver username = root password = 123456 generate_sink_sql = true database = test table = "public.test_table" } } ``` ### 精确一次 > 对于精确写入场景,我们保证精确一次。 ``` sink { jdbc { # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option url = "jdbc:postgresql://localhost:5432/test" driver = "org.postgresql.Driver" max_retries = 0 username = root password = 123456 query = "insert into test_table(name,age) values(?,?)" is_exactly_once = "true" xa_data_source_class_name = "org.postgresql.xa.PGXADataSource" } } ``` ### CDC(变更数据捕获)事件 > 我们也支持 CDC 变更数据。在这种情况下,您需要配置数据库、表和主键。 ``` sink { jdbc { # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option url = "jdbc:postgresql://localhost:5432/test" driver = 
"org.postgresql.Driver" username = root password = 123456 generate_sink_sql = true # You need to configure both database and table database = test table = sink_table primary_keys = ["id","name"] field_ide = UPPERCASE } } ``` ### 保存模式功能 ``` sink { Jdbc { # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option url = "jdbc:postgresql://localhost:5432/test" driver = org.postgresql.Driver username = root password = 123456 generate_sink_sql = true database = test table = "public.test_table" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Prometheus.md ================================================ import ChangeLog from '../changelog/connector-prometheus.md'; # Prometheus > Prometheus 数据接收器 ## 引擎支持 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [support multiple table write](../../introduction/concepts/connector-v2-features.md) ## 描述 接收Source端传入的数据,利用数据触发 web hooks。 > 例如,来自上游的数据为 [`label: {"__name__": "test1"}, value: 1.2.3,time:2024-08-15T17:00:00`], 则body内容如下: `{"label":{"__name__": "test1"}, "value":"1.23","time":"2024-08-15T17:00:00"}` **Tips: Prometheus 数据接收器 仅支持 `post json` 类型的 web hook,source 数据将被视为 webhook 中的 body 内容。并且不支持传递过去太久的数据** ## 支持的数据源信息 想使用 Prometheus 连接器,需要安装以下必要的依赖。可以通过运行 install-plugin.sh 脚本或者从 Maven 中央仓库下载这些依赖 | 数据源 | 支持版本 | 依赖 | |------|-----------|------------------------------------------------------------------------------------------------------------------| | Http | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-prometheus) | ## 接收器选项 | Name | Type | Required | Default | Description | |-----------------------------|--------|----------|---------|-------------------------------------------------------------------| | url | String | Yes | - | Http 请求链接 | | headers | Map | No | - | Http 标头 | | retry | Int | No | - | 如果请求http返回`IOException`的最大重试次数 | | retry_backoff_multiplier_ms | Int | No | 100 | http请求失败,重试回退次数(毫秒)乘数 | | retry_backoff_max_ms | Int | No | 10000 | http请求失败,最大重试回退时间(毫秒) | | connect_timeout_ms | Int | No | 12000 | 连接超时设置,默认12s | | socket_timeout_ms | Int | No | 60000 | 套接字超时设置,默认为60s | | key_timestamp | Int | NO | - | prometheus时间戳的key. 
| | key_label | String | yes | - | prometheus标签的key | | key_value | Double | yes | - | prometheus值的key | | batch_size | Int | false | 1024 | prometheus批量写入大小 | | flush_interval | Long | false | 300000L | prometheus定时写入 | | common-options | | No | - | Sink插件常用参数,请参考 [Sink常用选项 ](../common-options/sink-common-options.md) 了解详情 | ## 示例 简单示例: ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { schema = { fields { c_map = "map" c_double = double c_timestamp = timestamp } } plugin_output = "fake" rows = [ { kind = INSERT fields = [{"__name__": "test1"}, 1.23, "2024-08-15T17:00:00"] }, { kind = INSERT fields = [{"__name__": "test2"}, 1.23, "2024-08-15T17:00:00"] } ] } } sink { Prometheus { url = "http://prometheus:9090/api/v1/write" key_label = "c_map" key_value = "c_double" key_timestamp = "c_timestamp" batch_size = 1 } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Pulsar.md ================================================ import ChangeLog from '../changelog/connector-pulsar.md'; # Pulsar > Pulsar 数据连接器 ## 引擎支持 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 核心特性 - [x] [精准一次](../../introduction/concepts/connector-v2-features.md) ## 描述 Apache Pulsar 的接收连接器。 ## 支持的数据源信息 | 数据源 | 支持的版本 | |--------|-----------| | Pulsar | Universal | ## 输出选项 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |----------------------|--------|------|---------------------|-------------------------------------------------| | topic | String | Yes | - | 输出到Pulsar主题名称. | | client.service-url | String | Yes | - | Pulsar 服务的服务 URL 提供者. | | admin.service-url | String | Yes | - | 管理端点的 Pulsar 服务 HTTP URL. | | auth.plugin-class | String | No | - | 身份验证插件的名称. | | auth.params | String | No | - | 身份验证插件的参数. | | format | String | No | json | 数据格式。默认格式为 json。可选的文本格式. | | field_delimiter | String | No | , | 自定义数据格式的字段分隔符. | | semantics | Enum | No | AT_LEAST_ONCE | 写入 pulsar 的一致性语义. | | transaction_timeout | Int | No | 600 | 默认情况下,事务超时指定为 10 分钟. | | pulsar.config | Map | No | - | 除了上述必须由 Pulsar 生产者客户端指定的参数外. | | message.routing.mode | Enum | No | RoundRobinPartition | 要分区的消息的默认路由模式. | | partition_key_fields | array | No | - | 配置哪些字段用作 pulsar 消息的键. | | common-options | config | no | - | 源插件常用参数,详见源码 [常用选项](../common-options/sink-common-options.md). | ## 参数解释 ### client.service-url [String] Pulsar 服务的 Service URL 提供程序。要使用客户端库连接到 Pulsar, 您需要指定一个 Pulsar 协议 URL。您可以将 Pulsar 协议 URL 分配给特定集群并使用 Pulsar 方案。 例如, `localhost`: `pulsar://localhost:6650,localhost:6651`. ### admin.service-url [String] 管理端点的 Pulsar 服务 HTTP URL. 例如, `http://my-broker.example.com:8080`, or `https://my-broker.example.com:8443` for TLS. 
### auth.plugin-class [String] 身份验证插件的名称。 ### auth.params [String] 身份验证插件的参数。 例如, `key1:val1,key2:val2` ### format [String] 数据格式。默认格式为 json。可选 text 格式。默认字段分隔符为","。如果自定义分隔符,请添加"field_delimiter"选项。 ### field_delimiter [String] 自定义数据格式的字段分隔符。默认field_delimiter为','。 ### semantics [Enum] 写入 pulsar 的一致性语义。可用选项包括 EXACTLY_ONCE、NON、AT_LEAST_ONCE,默认为 AT_LEAST_ONCE。 如果语义被指定为 EXACTLY_ONCE,我们将使用 2pc 来保证消息精确一次地发送到 pulsar。 如果语义指定为 NON,我们将直接将消息发送到 pulsar,如果作业重启/重试或网络错误,数据可能会重复/丢失。 ### transaction_timeout [Int] 默认情况下,事务超时指定为 10 分钟。如果事务未在指定的超时时间内提交,则事务将自动中止。因此,您需要确保超时大于检查点间隔。 ### pulsar.config [Map] 除了上述 Pulsar 生产者客户端必须指定的参数外,用户还可以为生产者客户端指定多个非强制性参数, 涵盖 Pulsar 官方文档中指定的所有生产者参数。 ### message.routing.mode [Enum] 要分区的消息的默认路由模式。可用选项包括 SinglePartition、RoundRobinPartition。 如果选择 SinglePartition,如果未提供键,分区生产者将随机选择一个分区并将所有消息发布到该分区中,如果消息上提供了键,则分区生产者将对键进行哈希处理并将消息分配给特定分区。 如果选择 RoundRobinPartition,则如果未提供键,则生产者将以循环方式跨所有分区发布消息,以实现最大吞吐量。请注意,轮询不是按单个消息完成的,而是设置为相同的批处理延迟边界,以确保批处理有效。 ### partition_key_fields [array] 配置哪些字段用作 pulsar 消息的键。 例如,如果要使用上游数据中的字段值作为键,则可以为此属性分配字段名称。 上游数据如下: | name | age | data | |------|-----|---------------| | Jack | 16 | data-example1 | | Mary | 23 | data-example2 | 如果将 name 设置为键,则 name 列的哈希值将确定消息发送到哪个分区。 如果未设置分区键字段,则发送的消息键为 null。 消息键的格式为 json,如果 name 设置为键,例如 '{"name":"Jack"}'。 所选字段必须是上游的现有字段。 ### 常见选项 Sink 插件常用参数,详见 [Sink 常用选项](../common-options/sink-common-options.md) . 
## 任务示例 ### 简单 > 该示例定义了一个 SeaTunnel 同步任务,该任务通过 FakeSource 自动生成数据并将其发送到 Pulsar Sink。FakeSource 总共生成 16 行数据 (row.num=16),每行有两个字段,name(字符串类型)和 age(int 类型)。最终目标主题是test_topic主题中还将有 16 行数据。 如果您尚未安装和部署 SeaTunnel,则需要按照[安装Seatunnel](../../getting-started/locally/deployment.md) SeaTunnel 中的说明安装和部署 SeaTunnel。然后按照 [SeaTunnel 引擎快速入门](../../getting-started/locally/quick-start-seatunnel-engine.md)中的说明运行此作业。 ```hocon # Defining the runtime environment env { # You can set flink configuration here execution.parallelism = 1 job.mode = "BATCH" } source { FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } sink { Pulsar { topic = "example" client.service-url = "localhost:pulsar://localhost:6650" admin.service-url = "http://my-broker.example.com:8080" plugin_output = "test" pulsar.config = { sendTimeoutMs = 30000 } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Qdrant.md ================================================ import ChangeLog from '../changelog/connector-qdrant.md'; # Qdrant > Qdrant 数据连接器 [Qdrant](https://qdrant.tech/) 是一个高性能的向量搜索引擎和向量数据库。 该连接器可用于将数据写入 Qdrant 集合。 ## 数据类型映射 | SeaTunnel 数据类型 | Qdrant 数据类型 | |---------------------|---------------| | TINYINT | INTEGER | | SMALLINT | INTEGER | | INT | INTEGER | | BIGINT | INTEGER | | FLOAT | DOUBLE | | DOUBLE | DOUBLE | | BOOLEAN | BOOL | | STRING | STRING | | ARRAY | LIST | | FLOAT_VECTOR | DENSE_VECTOR | | BINARY_VECTOR | DENSE_VECTOR | | FLOAT16_VECTOR | DENSE_VECTOR | | BFLOAT16_VECTOR | DENSE_VECTOR | | SPARSE_FLOAT_VECTOR | SPARSE_VECTOR | 主键列的值将用作 Qdrant 中的点 ID。如果没有主键,则将使用随机 UUID。 ## 选项 | 名称 | 类型 | 必填 | 默认值 | |-----------------|--------|----|-----------| | collection_name | string | 是 | - | | batch_size | int | 否 | 64 | | host | string | 否 | localhost | | port | int | 否 | 6334 | | api_key | string | 否 | - | | use_tls | bool | 否 | false | | common-options | | 否 | - | ### collection_name [string] 要从中读取数据的 
Qdrant 集合的名称。 ### batch_size [int] 每次向 Qdrant 发送 upsert 请求的批量大小。 ### host [string] Qdrant 实例的主机名。默认为 "localhost"。 ### port [int] Qdrant 实例的 gRPC 端口。 ### api_key [string] 用于身份验证的 API 密钥(如果设置)。 ### use_tls [bool] 是否使用 TLS(SSL)连接。如果使用 Qdrant 云(https),则必须启用。 ### 通用选项 接收插件的通用参数,请参考[Sink 通用选项](../common-options/sink-common-options.md)了解详情。 ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Rabbitmq.md ================================================ import ChangeLog from '../changelog/connector-rabbitmq.md'; # Rabbitmq > Rabbitmq 数据接收器 ## 描述 该数据接收器用于将数据写入 Rabbitmq。 ## 主要特性 - [ ] [精准一次](../../introduction/concepts/connector-v2-features.md) ## 接收器选项 | 名称 | 类型 | 是否必须 | 默认值 | |----------------------------|---------|------|-------| | host | string | yes | - | | port | int | yes | - | | virtual_host | string | yes | - | | username | string | yes | - | | password | string | yes | - | | queue_name | string | yes | - | | url | string | no | - | | network_recovery_interval | int | no | - | | topology_recovery_enabled | boolean | no | - | | automatic_recovery_enabled | boolean | no | - | | use_correlation_id | boolean | no | false | | connection_timeout | int | no | - | | rabbitmq.config | map | no | - | | common-options | | no | - | ### host [string] Rabbitmq服务器地址 ### port [int] Rabbitmq服务器端口 ### virtual_host [string] virtual host – 连接broker使用的vhost ### username [string] 连接broker时使用的用户名 ### password [string] 连接broker时使用的密码 ### url [string] 设置host、port、username、password和virtual host的简便方式。 ### queue_name [string] 数据写入的队列名。 ### schema [Config] #### fields [Config] 上游数据的模式字段。 ### network_recovery_interval [int] 自动恢复需等待多长时间才尝试重连,单位为毫秒。 ### topology_recovery_enabled [boolean] 设置为true,表示启用拓扑恢复。 ### automatic_recovery_enabled [boolean] 设置为true,表示启用连接恢复。 ### use_correlation_id [boolean] 接收到的消息是否都提供唯一ID,用于在失败的情况下删除重复消息以实现幂等。 ### connection_timeout [int] TCP连接建立的超时时间,单位为毫秒;0代表不限制。 ### rabbitmq.config [map] In addition to the above parameters that must be specified 
by the RabbitMQ client, the user can also specify multiple non-mandatory parameters for the client, covering [all the parameters specified in the official RabbitMQ document](https://www.rabbitmq.com/configure.html). 除了上面提及必须设置的RabbitMQ客户端参数,你也还可以为客户端指定多个非强制参数,参见 [RabbitMQ官方文档参数设置](https://www.rabbitmq.com/configure.html)。 ### common options Sink插件常用参数,请参考[Sink常用选项](../common-options/sink-common-options.md)获取更多细节信息。 ## 示例 simple: ```hocon sink { RabbitMQ { host = "rabbitmq-e2e" port = 5672 virtual_host = "/" username = "guest" password = "guest" queue_name = "test1" rabbitmq.config = { requested-heartbeat = 10 connection-timeout = 10 } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Redis.md ================================================ import ChangeLog from '../changelog/connector-redis.md'; # Redis > Redis sink connector ## 描述 用于将数据写入 Redis。 ## 主要功能 - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## 选项 | name | type | required | default value | |--------------------|---------|-----------------------|---------------| | host | string | `mode=single`时必须 | - | | port | int | 否 | 6379 | | key | string | 是 | - | | data_type | string | 是 | - | | batch_size | int | 否 | 10 | | user | string | 否 | - | | auth | string | 否 | - | | db_num | int | 否 | 0 | | mode | string | 否 | single | | nodes | list | `mode=cluster`时必须 | - | | format | string | 否 | json | | expire | long | 否 | -1 | | support_custom_key | boolean | 否 | false | | value_field | string | 否 | - | | hash_key_field | string | 否 | - | | hash_value_field | string | 否 | - | | field_delimiter | string | 否 | "," | | common-options | | 否 | - | ### host [string] Redis 主机地址 ### port [int] Redis 端口 ### key [string] 要写入 Redis 的键值。 例如,如果想使用上游数据中的某个字段值作为键值,可以将该字段名称指定给 key。 上游数据如下: | code | data | success | |------|------|---------| | 200 | 获取成功 | true | | 500 | 内部错误 | false | 如果将字段名称指定为 code 并将 data_type 设置为 key,将有两个数据写入 Redis: 1. 
`200 -> {code: 200, data: 获取成功, success: true}` 2. `500 -> {code: 500, data: 内部错误, success: false}` 如果将字段名称指定为 value 并将 data_type 设置为 key,则由于上游数据的字段中没有 value 字段,将只有一个数据写入 Redis: 1. `value -> {code: 500, data: 内部错误, success: false}` 请参见 data_type 部分以了解具体的写入规则。 当然,这里写入的数据格式只是以 json 为例,具体格式以用户配置的 `format` 为准。 ### data_type [string] Redis 数据类型,支持 `key` `hash` `list` `set` `zset` - key > 每个来自上游的数据都会更新到配置的 key,这意味着后面的数据会覆盖前面的数据,只有最后的数据会存储在该 key 中。 - hash > 每个来自上游的数据会根据字段拆分并写入 hash key,后面的数据会覆盖前面的数据。 - list > 每个来自上游的数据都会被添加到配置的 list key 中。 - set > 每个来自上游的数据都会被添加到配置的 set key 中。 - zset > 每个来自上游的数据都会以权重为 1 的方式添加到配置的 zset key 中。因此,zset 中数据的顺序基于数据的消费顺序。 ### user [string] Redis 认证用户,连接加密集群时需要 ### auth [string] Redis 认证密码,连接加密集群时需要 ### db_num [int] Redis 数据库索引 ID,默认连接到 db 0 ### mode [string] Redis 模式,`single` 或 `cluster`,默认是 `single` ### nodes [list] Redis 节点信息,在集群模式下使用,必须按如下格式: ["host1:port1", "host2:port2"] ### format [string] 上游数据的格式,目前只支持 `json`,`text`,默认 `json`。 当你指定格式为 `json` 时,例如: 上游数据如下: | code | data | success | |------|------|---------| | 200 | 获取成功 | true | 连接器会生成如下数据并写入 Redis: ```json {"code": 200, "data": "获取成功", "success": "true"} ``` 当你指定format为`text`,并设置field_delimiter为`#`时,连接器将生成如下数据并将其写入redis: ```text 200#get success#true ``` ### field_delimiter [string] 字段分隔符,用于告诉连接器如何分割字段。 目前仅当格式为text时需要配置。默认为","。 ### expire [long] 设置 Redis 的过期时间,单位为秒。默认值为 -1,表示键不会自动过期。 ### support_custom_key [boolean] 设置为true,表示启用自定义Key。 上游数据如下: | code | data | success | |------|------|---------| | 200 | 获取成功 | true | | 500 | 内部错误 | false | 可以使用`{`和`}`符号自定义Redis键名,`{}`中的字段名会被解析替换为上游数据中的某个字段值,例如:将字段名称指定为 `{code}` 并将 data_type 设置为 `key`,将有两个数据写入 Redis: 1. `200 -> {code: 200, data: 获取成功, success: true}` 2. `500 -> {code: 500, data: 内部错误, success: false}` Redis键名可以由固定部分和变化部分组成,通过Redis分组符号:连接,例如:将字段名称指定为 `code:{code}` 并将 data_type 设置为 `key`,将有两个数据写入 Redis: 1. `code:200 -> {code: 200, data: 获取成功, success: true}` 2. 
`code:500 -> {code: 500, data: 内部错误, success: false}` ### value_field [string] 要写入Redis的值的字段, `data_type` 支持 `key` `list` `set` `zset`. 当你指定Redis键名字段`key`指定为 `value`,值字段`value_field`指定为`data`,并将`data_type`指定为`key`时, 上游数据如下: | code | data | success | |------|------|---------| | 200 | 获取成功 | true | 如下的数据会被写入Redis: 1. `value -> 获取成功` ### hash_key_field [string] 要写入Redis的hash键字段, `data_type` 支持 `hash` ### hash_value_field [string] 要写入Redis的hash值字段, `data_type` 支持 `hash` 当你指定Redis键名字段`key`指定为 `value`,hash键字段`hash_key_field`指定为`data`,hash值字段`hash_value_field`指定为`success`,并将`data_type`指定为`hash`时, 上游数据如下: | code | data | success | |------|------|---------| | 200 | 获取成功 | true | 如下的数据会被写入Redis: 1. `value -> 获取成功 | true` ### common options Sink 插件通用参数,请参考 [Sink Common Options](../common-options/sink-common-options.md) 获取详情 ## 示例 简单示例: ```hocon Redis { host = localhost port = 6379 key = age data_type = list } ``` 自定义Key示例: ```hocon Redis { host = localhost port = 6379 key = "name:${name}" support_custom_key = true data_type = key } ``` 自定义Value示例: ```hocon Redis { host = localhost port = 6379 key = person value_field = "name" data_type = key } ``` 自定义HashKey和HashValue示例: ```hocon Redis { host = localhost port = 6379 key = person hash_key_field = "name" hash_value_field = "age" data_type = hash } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Redshift.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Redshift > JDBC Redshift 接收器连接器 ## 支持以下引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [更改数据捕获](../../introduction/concepts/connector-v2-features.md) > 使用 `Xa transactions` 确保 `exactly-once`。因此,只有支持 `Xa transactions` 的数据库 > 才支持 `exactly-once`。您可以设置 `is_exactly_once=true` 来启用它. ## 描述 通过jdbc写入数据. 支持批处理模式和流模式,支持并发写入,支持精确一次语义 (使用 XA 事务保证). ## 支持的数据源列表 | 数据源 | 支持版本 | 驱动 | url | maven | |------------|----------------------------------------------------------|---------------------------------|-----------------------------------------|------------------------------------------------------------------------------| | redshift | 不同的依赖版本有不同的驱动程序类. | com.amazon.redshift.jdbc.Driver | jdbc:redshift://localhost:5439/database | [下载](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) | ## 数据库依赖 ### 适用于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc driver jar package](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) 已放置在目录 `${SEATUNNEL_HOME}/plugins/`. ### 适用于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc driver jar package](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) 已放置在目录 `${SEATUNNEL_HOME}/lib/`. ## 数据类型映射 | SeaTunnel 数据类型 | Redshift 数据类型 | |-------------------------|--------------------| | BOOLEAN | BOOLEAN | | TINYINT
    SMALLINT | SMALLINT | | INT | INTEGER | | BIGINT | BIGINT | | FLOAT | REAL | | DOUBLE | DOUBLE PRECISION | | DECIMAL | NUMERIC | | STRING(<=65535) | CHARACTER VARYING | | STRING(>65535) | SUPER | | BYTES | BINARY VARYING | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | MAP
    ARRAY
    ROW | SUPER | ## 任务示例 ### 简单示例 ``` sink { jdbc { url = "jdbc:redshift://localhost:5439/mydatabase" driver = "com.amazon.redshift.jdbc.Driver" username = "myUser" password = "myPassword" generate_sink_sql = true schema = "public" table = "sink_table" } } ``` ### CDC(更改数据捕获) 事件 > 我们也支持CDC更改数据。在这种情况下,您需要配置数据库、表和主键. ``` sink { jdbc { url = "jdbc:redshift://localhost:5439/mydatabase" driver = "com.amazon.redshift.jdbc.Driver" username = "myUser" password = "mypassword" generate_sink_sql = true schema = "public" table = "sink_table" # config update/delete primary keys primary_keys = ["id","name"] } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/RocketMQ.md ================================================ import ChangeLog from '../changelog/connector-rocketmq.md'; # RocketMQ > RocketMQ sink 连接器 ## 支持Apache RocketMQ版本 - 4.9.0 (或更新版本,供参考) ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 默认情况下,我们将使用2pc来保证消息精确一次到RocketMQ。 ## 描述 将数据行写入Apache RocketMQ主题 ## Sink 参数 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |----------------------|---------|----------|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------| | topic | string | 是 | - | `RocketMQ topic` 名称. | | name.srv.addr | string | 是 | - | `RocketMQ`名称服务器集群地址。 | | acl.enabled | Boolean | 否 | false | false | | access.key | String | 否 | | 当ACL_ENABLED为true时,access key不能为空。 | | secret.key | String | 否 | | 当ACL_ENABLED为true时, secret key 不能为空。 | | producer.group | String | 否 | SeaTunnel-producer-Group | SeaTunnel-producer-Group | | tag | String | 否 | - | `RocketMQ`消息标签。 | | partition.key.fields | array | 否 | - | - | | format | String | 否 | json | 数据格式。默认格式为json。可选text格式。默认字段分隔符为“,”。如果自定义分隔符,请添加“field_delimiter”选项。 | | field.delimiter | String | 否 | , | 自定义数据格式的字段分隔符。 | | producer.send.sync | Boolean | 否 | false | 如果为 true, 则消息将同步发送。 | | common-options | config | 否 | - | Sink插件常用参数,请参考[sink common options](../common-options/sink-common-options.md)了解详细信息。 | ### partition.key.fields [array] 配置哪些字段用作RocketMQ消息的键。 例如,如果要使用上游数据中的字段值作为键,可以为此属性指定字段名。 上游数据如下: | name | age | data | |------|-----|---------------| | Jack | 16 | data-example1 | | Mary | 23 | data-example2 | 如果name被设置为主键,那么name列的哈希值将决定消息被发送到哪个分区。 ## 任务示例 ### Fake 到 RocketMQ 简单示例 >数据是随机生成的,并异步发送到测试主题 ```hocon env { parallelism = 1 } source { FakeSource { schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } transform { #如果你想了解更多关于如何配置seatunnel的信息,并查看转换插件的完整列表, #请前往https://seatunnel.apache.org/docs/category/transform } sink { 
Rocketmq { name.srv.addr = "localhost:9876" topic = "test_topic" } } ``` ### Rocketmq 到 Rocketmq 简单示例 > 使用RocketMQ时,会向c_int字段写入哈希数,该哈希数表示写入不同分区的分区数量。这是默认的异步写入方式 ```hocon env { parallelism = 1 } source { Rocketmq { name.srv.addr = "localhost:9876" topics = "test_topic" plugin_output = "rocketmq_table" schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } sink { Rocketmq { name.srv.addr = "localhost:9876" topic = "test_topic_sink" partition.key.fields = ["c_int"] } } ``` ### 时间戳消费写入示例 >这是流消费中特定的时间戳消费,当添加新分区时,程序将定期刷新感知和消费,并写入另一个主题类型 ```hocon env { parallelism = 1 job.mode = "STREAMING" } source { Rocketmq { name.srv.addr = "localhost:9876" topics = "test_topic" plugin_output = "rocketmq_table" start.mode = "CONSUME_FROM_FIRST_OFFSET" batch.size = "400" consumer.group = "test_topic_group" format = "json" format = json schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } transform { #如果你想了解更多关于如何配置seatunnel的信息,并查看转换插件的完整列表, #请前往https://seatunnel.apache.org/docs/category/transform } sink { Rocketmq { name.srv.addr = "localhost:9876" topic = "test_topic" partition.key.fields = ["c_int"] producer.send.sync = true } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/S3-Redshift.md ================================================ import ChangeLog from '../changelog/connector-s3-redshift.md'; # S3Redshift >S3Redshift的作用是将数据写入S3,然后使用Redshift的COPY命令将数据从S3导入Redshift。 ## 描述 将数据输出到AWS Redshift。 >提示: >我们基于[S3File](S3File.md)来实现这个连接器。因此,您可以使用与S3File相同的配置。 
>为了支持更多的文件类型,我们进行了一些权衡,因此我们使用HDFS协议对S3进行内部访问,而这个连接器需要一些hadoop依赖。 >它只支持hadoop版本**2.6.5+**。 ## 主要特性 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 默认情况下,我们使用2PC commit来确保“精确一次”` - [x] 文件格式类型 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json ## 参数 | 名称 | 类型 | 是否必填 | 默认值 | |----------------------------------|---------|----------|-----------------------------------------------------------| | jdbc_url | string | 是 | - | | jdbc_user | string | 是 | - | | jdbc_password | string | 是 | - | | execute_sql | string | 是 | - | | path | string | 是 | - | | bucket | string | 是 | - | | access_key | string | 否 | - | | access_secret | string | 否 | - | | hadoop_s3_properties | map | 否 | - | | file_name_expression | string | 否 | "${transactionId}" | | file_format_type | string | 否 | "text" | | filename_time_format | string | 否 | "yyyy.MM.dd" | | field_delimiter | string | 否 | '\001' | | row_delimiter | string | 否 | "\n" | | partition_by | array | 否 | - | | partition_dir_expression | string | 否 | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | | is_partition_field_write_in_file | boolean | 否 | false | | sink_columns | array | 否 | 当此参数为空时,所有字段都是sink列 | | is_enable_transaction | boolean | 否 | true | | batch_size | int | 否 | 1000000 | | common-options | | 否 | - | ### jdbc_url 连接到Redshift数据库的JDBC URL。 ### jdbc_user 连接到Redshift数据库的用户名。 ### jdbc_password 连接到Redshift数据库的密码。 ### execute_sql 数据写入S3后要执行的SQL。 示例: ```sql COPY target_table FROM 's3://yourbucket${path}' IAM_ROLE 'arn:XXX' REGION 'your region' format as json 'auto'; ``` `target_table`是Redshift中的表名。 `${path}`是写入S3的文件的路径。请确认您的sql包含此变量。并且不需要替换它。我们将在执行sql时替换它。 IAM_ROLE是有权访问S3的角色。 format是写入S3的文件的格式。请确认此格式与您在配置中设置的文件格式相同。 请参阅[Redshift COPY](https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html)了解更多详情。 请确认该角色有权访问S3。 ### path [string] 目标目录路径是必填项。 ### bucket [string] s3文件系统的bucket地址,例如:`s3n://seatunnel-test`,如果使用`s3a`协议,则此参数应为`s3a://seatunnel-test`。 ### access_key [string] 
s3文件系统的access_key。如果未设置此参数,请确认凭据提供程序链可以正确进行身份验证,您可以检查这个[hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) ### access_secret [string] s3文件系统的access_secret。如果未设置此参数,请确认凭据提供程序链可以正确进行身份验证,您可以检查这个[hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) ### hadoop_s3_properties [map] 如果您需要添加其他选项,可以在此处添加并参考[Hadoop-AWS](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) ``` hadoop_s3_properties { "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" } ``` ### file_name_expression [string] `file_name_expression`描述了将在`path`中创建的文件表达式。我们可以在`file_name_expression`中添加变量`${now}`或`${uuid}`,类似于`test_${uuid}_${now}`, `${now}`表示当前时间,其格式可以通过指定选项`filename_time_format`来定义。 请注意,如果`is_enable_transaction`为`true`,我们将自动添加`${transactionId}_`在文件的开头。 ### file_format_type [string] 我们支持以下文件类型: `text` `csv` `parquet` `orc` `json` 请注意,最终文件名将以file_format_type的后缀结尾,文本文件的后缀为“txt”。 ### filename_time_format [string] 当`file_name_expression`参数中的格式为`xxxx-${now}`时,`filename_time_format`可以指定路径的时间格式,默认值为`yyyy.MM.dd`。常用的时间格式如下: | Symbol | Description | |--------|--------------------| | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | 请参阅[Java SimpleDateFormat](https://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html)了解详细的时间格式语法。 ### field_delimiter [string] 数据行中列之间的分隔符。仅被“text”和“csv”文件格式需要。 ### row_delimiter [string] 文件中行之间的分隔符。仅被“text”和“csv”文件格式需要。 ### partition_by [array] 基于选定字段对数据进行分区 ### partition_dir_expression [string] 如果指定了`partition_by`,我们将根据分区信息生成相应的分区目录,并将最终文件放置在分区目录中。 默认的`partition_dir_expression`是 `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`。`k0`是第一个分区字段,`v0`是第一个划分字段的值。 ### is_partition_field_write_in_file [boolean] 如果`is_partition_field_write_in_file`为`true`,则分区字段及其值将写入数据文件。 例如,如果你想写一个Hive数据文件,它的值应该是“false”。 ### sink_columns [array] 哪些列需要写入文件,默认值是从“Transform”或“Source”获取的所有列。 
字段的顺序决定了文件实际写入的顺序。 ### is_enable_transaction [boolean] 如果`is_enable_transaction`为true,我们将确保数据在写入目标目录时不会丢失或重复。 请注意,如果`is_enable_transaction`为`true`,我们将自动添加`${transactionId}_`在文件的开头。 现在只支持“true”。 ### batch_size [int] 文件中的最大行数。对于SeaTunnel引擎,文件中的行数由“batch_size”和“checkpoint.interval”共同决定。如果“checkpoint.interval”的值足够大,sink writer将在文件中写入行,直到文件中的行大于“batch_size”。如果“checkpoint.interval”较小,则接收器写入程序将在新的检查点触发时创建一个新文件。 ### common options Sink插件常用参数,请参考[Sink Common Options](../common-options/sink-common-options.md)了解详细信息。 ## 示例 用于 text 文件格式 ```hocon S3Redshift { jdbc_url = "jdbc:redshift://xxx.amazonaws.com.cn:5439/xxx" jdbc_user = "xxx" jdbc_password = "xxxx" execute_sql="COPY table_name FROM 's3://test${path}' IAM_ROLE 'arn:aws-cn:iam::xxx' REGION 'cn-north-1' removequotes emptyasnull blanksasnull maxerror 100 delimiter '|' ;" access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel" path="/seatunnel/text" row_delimiter="\n" partition_dir_expression="${k0}=${v0}" is_partition_field_write_in_file=true file_name_expression="${transactionId}_${now}" file_format_type = "text" filename_time_format="yyyy.MM.dd" is_enable_transaction=true hadoop_s3_properties { "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" } } ``` 用于 parquet 文件格式 ```hocon S3Redshift { jdbc_url = "jdbc:redshift://xxx.amazonaws.com.cn:5439/xxx" jdbc_user = "xxx" jdbc_password = "xxxx" execute_sql="COPY table_name FROM 's3://test${path}' IAM_ROLE 'arn:aws-cn:iam::xxx' REGION 'cn-north-1' format as PARQUET;" access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel" path="/seatunnel/parquet" row_delimiter="\n" partition_dir_expression="${k0}=${v0}" is_partition_field_write_in_file=true file_name_expression="${transactionId}_${now}" file_format_type = "parquet" filename_time_format="yyyy.MM.dd" is_enable_transaction=true hadoop_s3_properties { 
"fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" } } ``` 用于 orc 文件格式 ```hocon S3Redshift { jdbc_url = "jdbc:redshift://xxx.amazonaws.com.cn:5439/xxx" jdbc_user = "xxx" jdbc_password = "xxxx" execute_sql="COPY table_name FROM 's3://test${path}' IAM_ROLE 'arn:aws-cn:iam::xxx' REGION 'cn-north-1' format as ORC;" access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel" path="/seatunnel/orc" row_delimiter="\n" partition_dir_expression="${k0}=${v0}" is_partition_field_write_in_file=true file_name_expression="${transactionId}_${now}" file_format_type = "orc" filename_time_format="yyyy.MM.dd" is_enable_transaction=true hadoop_s3_properties { "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/S3File.md ================================================ import ChangeLog from '../changelog/connector-file-s3.md'; # S3File > S3 文件 Sink 连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 默认情况下,我们使用 2PC 提交来确保 `精确一次`。 - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) - [x] [支持多表写入](../../introduction/concepts/connector-v2-features.md) - [x] 文件格式类型 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] canal_json - [x] debezium_json - [x] maxwell_json ## 描述 将数据输出到 AWS S3 文件系统。 ## 支持的数据源信息 | 数据源 | 支持的版本 | |--------|------------| | S3 | 当前版本 | ## 数据库依赖 > 如果您使用 Spark/Flink,为了使用此连接器,您必须确保您的 Spark/Flink 集群已经集成了 Hadoop。测试的 Hadoop 版本为 2.x。 > > 如果您使用 SeaTunnel引擎,当您下载并安装 SeaTunnel引擎时,它会自动集成 Hadoop jar 包。您可以在 `${SEATUNNEL_HOME}/lib` 下检查 jar 包以确认这一点。 > 要使用此连接器,您需要将 `hadoop-aws-3.1.4.jar` 和 `aws-java-sdk-bundle-1.12.692.jar` 放在 `${SEATUNNEL_HOME}/lib` 目录下。 ## 数据类型映射 如果写入 `csv`、`text` 文件类型,所有列都将为字符串类型。 ### Orc 文件类型 | SeaTunnel 数据类型 | Orc 数据类型 | |--------------------|---------------------| | STRING | STRING | | BOOLEAN | BOOLEAN | | TINYINT | BYTE | | SMALLINT | SHORT | | INT | INT | | BIGINT | LONG | | FLOAT | FLOAT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | BYTES | BINARY | | DATE | DATE | | TIME
    TIMESTAMP | TIMESTAMP | | ROW | STRUCT | | NULL | 不支持的数据类型 | | ARRAY | LIST | | Map | Map | ### Parquet 文件类型 | SeaTunnel 数据类型 | Parquet 数据类型 | |--------------------|---------------------| | STRING | STRING | | BOOLEAN | BOOLEAN | | TINYINT | INT_8 | | SMALLINT | INT_16 | | INT | INT32 | | BIGINT | INT64 | | FLOAT | FLOAT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | BYTES | BINARY | | DATE | DATE | | TIME
    TIMESTAMP | TIMESTAMP_MILLIS | | ROW | GroupType | | NULL | 不支持的数据类型 | | ARRAY | LIST | | Map | Map | ## Sink 选项 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |---------------------------------------|---------|------|-------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| | path | string | 是 | - | | | tmp_path | string | 否 | /tmp/seatunnel | 结果文件将首先写入临时路径,然后使用 `mv` 将临时目录提交到目标目录。需要一个 S3 目录。 | | bucket | string | 是 | - | | | fs.s3a.endpoint | string | 是 | - | | | fs.s3a.aws.credentials.provider | string | 是 | com.amazonaws.auth.InstanceProfileCredentialsProvider | 认证 s3a 的方式。目前仅支持 `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` 和 `com.amazonaws.auth.InstanceProfileCredentialsProvider`。 | | access_key | string | 否 | - | 仅当 fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider 时使用 | | secret_key | string | 否 | - | 仅当 fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider 时使用 | | custom_filename | boolean | 否 | false | 是否需要自定义文件名 | | file_name_expression | string | 否 | "${transactionId}" | 仅当 custom_filename 为 true 时使用 | | filename_time_format | string | 否 | "yyyy.MM.dd" | 仅当 custom_filename 为 true 时使用 | | file_format_type | string | 否 | "csv" | | | field_delimiter | string | 否 | '\001' | 仅当 file_format 为 text 时使用 | | row_delimiter | string | 否 | "\n" | 仅当 file_format 为 `text`、`csv`、`json` 时使用 | | have_partition | boolean | 否 | false | 是否需要处理分区。 | | partition_by | array | 否 | - | 仅当 have_partition 为 true 时使用 | | partition_dir_expression | string | 否 | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | 仅当 have_partition 为 true 时使用 | | is_partition_field_write_in_file | boolean | 否 | false | 仅当 have_partition 为 true 时使用 | | sink_columns | array | 否 | | 当此参数为空时,所有字段均为 sink 列 | | is_enable_transaction | boolean | 否 | true | | | batch_size | int | 否 | 1000000 | | | compress_codec | 
string | 否 | none | | | common-options | object | 否 | - | | | max_rows_in_memory | int | 否 | - | 仅当 file_format 为 excel 时使用 | | sheet_name | string | 否 | Sheet${Random number} | 仅当 file_format 为 excel 时使用 | | csv_string_quote_mode | enum | 否 | MINIMAL | 仅当 file_format 为 csv 时使用 | | xml_root_tag | string | 否 | RECORDS | 仅当 file_format 为 xml 时使用,指定 XML 文件中根元素的标签名称。 | | xml_row_tag | string | 否 | RECORD | 仅当 file_format 为 xml 时使用,指定 XML 文件中数据行的标签名称。 | | xml_use_attr_format | boolean | 否 | - | 仅当 file_format 为 xml 时使用,指定是否使用标签属性格式处理数据。 | | single_file_mode | boolean | 否 | false | 每个并行度只会输出一个文件。当此参数开启时,batch_size 将不会生效。输出文件名不会有文件块后缀。 | | create_empty_file_when_no_data | boolean | 否 | false | 当上游没有数据同步时,仍然会生成相应的数据文件。 | | parquet_avro_write_timestamp_as_int96 | boolean | 否 | false | 仅当 file_format 为 parquet 时使用 | | parquet_avro_write_fixed_as_int96 | array | 否 | - | 仅当 file_format 为 parquet 时使用 | | hadoop_s3_properties | map | 否 | | 如果您需要添加其他选项,可以在此处添加,并参考此[链接](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) | | schema_save_mode | Enum | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | 在开启同步任务之前,对目标路径进行不同的处理 | | data_save_mode | Enum | 否 | APPEND_DATA | 在开启同步任务之前,对目标路径中的数据文件进行不同的处理 | | enable_header_write | boolean | 否 | false | 仅当 file_format_type 为 text,csv 时使用。
    false: 不写入表头, true: 写入表头。 | | encoding | string | 否 | "UTF-8" | 仅当 file_format_type 为 json,text,csv,xml 时使用。 | | merge_update_event | boolean | 否 | false | 仅当file_format_type为canal_json、debezium_json、maxwell_json. | ### path [string] 存储数据文件的路径,支持变量替换。例如:path=/test/${database_name}/${schema_name}/${table_name} ### hadoop_s3_properties [map] 如果您需要添加其他选项,可以在此处添加,并参考此[链接](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) ``` hadoop_s3_properties { "fs.s3a.buffer.dir" = "/data/st_test/s3a" "fs.s3a.fast.upload.buffer" = "disk" } ``` ### custom_filename [boolean] 是否自定义文件名 ### file_name_expression [string] 仅当 `custom_filename` 为 `true` 时使用 `file_name_expression` 描述了将创建到 `path` 中的文件表达式。我们可以在 `file_name_expression` 中添加变量 `${now}` 或 `${uuid}`,例如 `test_${uuid}_${now}`, `${now}` 表示当前时间,其格式可以通过指定选项 `filename_time_format` 来定义。 请注意,如果 `is_enable_transaction` 为 `true`,我们会在文件头部自动添加 `${transactionId}_`。 ### filename_time_format [string] 仅当 `custom_filename` 为 `true` 时使用 当 `file_name_expression` 参数中的格式为 `xxxx-${now}` 时,`filename_time_format` 可以指定路径的时间格式,默认值为 `yyyy.MM.dd`。常用的时间格式如下: | 符号 | 描述 | |------|--------------------| | y | 年 | | M | 月 | | d | 日 | | H | 小时 (0-23) | | m | 分钟 | | s | 秒 | ### file_format_type [string] 我们支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `canal_json` `debezium_json` `maxwell_json` 请注意,最终文件名将以文件格式类型的后缀结尾,文本文件的后缀为 `txt`。 ### field_delimiter [string] 行数据中列之间的分隔符。仅在 `text` 文件格式中需要。 ### row_delimiter [string] 文件中行之间的分隔符。仅在 `text`、`csv`、`json` 文件格式中需要。 ### have_partition [boolean] 是否需要处理分区。 ### partition_by [array] 仅当 `have_partition` 为 `true` 时使用。 根据选定的字段对数据进行分区。 ### partition_dir_expression [string] 仅当 `have_partition` 为 `true` 时使用。 如果指定了 `partition_by`,我们将根据分区信息生成相应的分区目录,最终文件将放置在分区目录中。 默认的 `partition_dir_expression` 为 `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`。`k0` 是第一个分区字段,`v0` 是第一个分区字段的值。 ### is_partition_field_write_in_file [boolean] 仅当 `have_partition` 为 `true` 时使用。 如果 
`is_partition_field_write_in_file` 为 `true`,分区字段及其值将被写入数据文件。 例如,如果您想写入 Hive 数据文件,其值应为 `false`。 ### sink_columns [array] 哪些列需要写入文件,默认值为从 `Transform` 或 `Source` 获取的所有列。 字段的顺序决定了文件实际写入的顺序。 ### is_enable_transaction [boolean] 如果 `is_enable_transaction` 为 true,我们将确保在将数据写入目标目录时不会丢失或重复。 请注意,如果 `is_enable_transaction` 为 `true`,我们会在文件头部自动添加 `${transactionId}_`。 目前仅支持 `true`。 ### batch_size [int] 文件中的最大行数。对于 SeaTunnel Engine,文件中的行数由 `batch_size` 和 `checkpoint.interval` 共同决定。如果 `checkpoint.interval` 的值足够大,sink writer 将一直写入文件,直到文件中的行数超过 `batch_size`。如果 `checkpoint.interval` 较小,sink writer 将在新的 checkpoint 触发时创建一个新文件。 ### compress_codec [string] 文件的压缩编解码器,支持的详细信息如下: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc: `lzo` `snappy` `lz4` `zlib` `none` - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` 提示:excel 类型不支持任何压缩格式 ### common options Sink 插件通用参数,请参考 [Sink 通用选项](../common-options/sink-common-options.md) 获取详细信息。 ### max_rows_in_memory [int] 当文件格式为 Excel 时,内存中可以缓存的最大数据项数。 ### sheet_name [string] 写入工作表的名称 ### csv_string_quote_mode [string] 当文件格式为 CSV 时,CSV 的字符串引用模式。 - ALL: 所有字符串字段都会被引用。 - MINIMAL: 引用包含特殊字符的字段,如字段分隔符、引用字符或行分隔符字符串中的任何字符。 - NONE: 从不引用字段。当数据中出现分隔符时,打印机会在其前面加上转义字符。如果未设置转义字符,格式验证将抛出异常。 ### xml_root_tag [string] 指定 XML 文件中根元素的标签名称。 ### xml_row_tag [string] 指定 XML 文件中数据行的标签名称。 ### xml_use_attr_format [boolean] 指定是否使用标签属性格式处理数据。 ### parquet_avro_write_timestamp_as_int96 [boolean] 支持将时间戳写入 Parquet INT96,仅对 parquet 文件有效。 ### parquet_avro_write_fixed_as_int96 [array] 支持将 12-byte 字段写入 Parquet INT96,仅对 parquet 文件有效。 ### schema_save_mode [Enum] 在开启同步任务之前,对目标路径进行不同的处理。 选项介绍: `RECREATE_SCHEMA` :当路径不存在时创建。如果路径已存在,则删除路径并重新创建。 `CREATE_SCHEMA_WHEN_NOT_EXIST` :当路径不存在时创建,路径存在时使用路径。 `ERROR_WHEN_SCHEMA_NOT_EXIST` :当路径不存在时报错 `IGNORE` :忽略表的处理 ### data_save_mode [Enum] 在开启同步任务之前,对目标路径中的数据文件进行不同的处理。 选项介绍: `DROP_DATA`:使用路径但删除路径中的数据文件。 `APPEND_DATA`:使用路径,并在路径中添加新文件以写入数据。 `ERROR_WHEN_DATA_EXISTS`:当路径中存在数据文件时,将报错。 ### encoding [string] 仅当 file_format_type 为 
json,text,csv,xml 时使用。 写入文件的编码。此参数将由 `Charset.forName(encoding)` 解析。 ### merge_update_event [boolean] 仅当file_format_type为canal_json、debezium_json、maxwell_json时使用. 设置成true,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 会合并成 UPDATE; 设置成false,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 不会合并; ## 示例 ### 简单示例 > 此示例定义了一个 SeaTunnel 同步任务,通过 FakeSource 自动生成数据并将其发送到 S3File Sink。FakeSource 总共生成 16 行数据 (row.num=16),每行有两个字段,name (字符串类型) 和 age (int 类型)。最终的目标 s3 目录将创建一个文件,并将所有数据写入其中。 > 在运行此作业之前,您需要创建 s3 路径:/seatunnel/text。如果您尚未安装和部署 SeaTunnel,您需要按照 [安装 SeaTunnel](../../getting-started/locally/deployment.md) 中的说明安装和部署 SeaTunnel。然后按照 [使用 SeaTunnel Engine 快速入门](../../getting-started/locally/quick-start-seatunnel-engine.md) 中的说明运行此作业。 ``` # 定义运行时环境 env { parallelism = 1 job.mode = "BATCH" } source { # 这是一个示例源插件,仅用于测试和演示功能源插件 FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { c_map = "map>" c_array = "array" name = string c_boolean = boolean age = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(16, 1)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } # 如果您想了解更多关于如何配置SeaTunnel以及查看完整的源插件列表, # 请访问 https://seatunnel.apache.org/docs/connectors/source source { } transform { # 如果您想了解更多关于如何配置SeaTunnel以及查看完整的转换插件列表, # 请访问 https://seatunnel.apache.org/docs/transforms } sink { S3File { bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel" path="/seatunnel/text" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction=true hadoop_s3_properties { 
"fs.s3a.buffer.dir" = "/data/st_test/s3a" "fs.s3a.fast.upload.buffer" = "disk" } } # 如果您想了解更多关于如何配置SeaTunnel以及查看完整的接收插件列表, # 请访问 https://seatunnel.apache.org/docs/connectors/sink } ``` 对于文本文件格式,包含 `have_partition`、`custom_filename`、`sink_columns` 和 `com.amazonaws.auth.InstanceProfileCredentialsProvider` ```hocon S3File { bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel" path="/seatunnel/text" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction=true hadoop_s3_properties { "fs.s3a.buffer.dir" = "/data/st_test/s3a" "fs.s3a.fast.upload.buffer" = "disk" } } ``` 对于Parquet文件格式,简单配置使用 `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` ```hocon S3File { bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel" path="/seatunnel/parquet" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" file_format_type = "parquet" hadoop_s3_properties { "fs.s3a.buffer.dir" = "/data/st_test/s3a" "fs.s3a.fast.upload.buffer" = "disk" } } ``` 对于ORC文件格式,简单配置使用 `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` ```hocon S3File { bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel" path="/seatunnel/orc" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" file_format_type = "orc" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" 
data_save_mode="APPEND_DATA" } ``` 多表写入和保存模式 ```hocon env { "job.name"="SeaTunnel_job" "job.mode"=STREAMING } source { MySQL-CDC { database-names=[ "wls_t1" ] table-names=[ "wls_t1.mysqlcdc_to_s3_t3", "wls_t1.mysqlcdc_to_s3_t4", "wls_t1.mysqlcdc_to_s3_t5", "wls_t1.mysqlcdc_to_s3_t1", "wls_t1.mysqlcdc_to_s3_t2" ] password="xxxxxx" username="xxxxxxxxxxxxx" url="jdbc:mysql://localhost:3306/qa_source" } } transform { } sink { S3File { bucket = "s3a://seatunnel-test" tmp_path = "/tmp/seatunnel/${table_name}" path="/test/${table_name}" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" file_format_type = "orc" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" } } ``` ### enable_header_write [boolean] 仅在 file_format_type 为 text 或 csv 时使用。false:不写入表头,true:写入表头。 ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/SelectDB-Cloud.md ================================================ import ChangeLog from '../changelog/connector-selectdb-cloud.md'; # SelectDB Cloud > SelectDB Cloud Sink 连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) ## 描述 用于将数据发送到 SelectDB Cloud。支持流式和批处理模式。 SelectDB Cloud 接收器连接器的内部实现是在批量缓存后上传数据,并提交 CopyInto SQL 以将数据加载到表中。 ## 支持的数据源信息 :::提示 支持的版本 * 支持的 `SelectDB Cloud 版本 >= 2.2.x` ::: ## 接收器选项 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |--------------------|--------|----------|------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | load-url | String | 是 | - | `SelectDB Cloud` 仓库的 HTTP 地址,格式为 `warehouse_ip:http_port` | | jdbc-url | String | 是 | - | `SelectDB Cloud` 仓库的 JDBC 地址,格式为 `warehouse_ip:mysql_port` | | cluster-name | String | 是 | - | `SelectDB Cloud` 集群名称 | | username | String | 是 | - | `SelectDB Cloud` 用户名 | | password | String | 是 | - | `SelectDB Cloud` 用户密码 | | sink.enable-2pc | bool | 否 | true | 是否启用两阶段提交(2pc),默认为 true,以确保 Exactly-Once 语义。SelectDB 使用缓存文件加载数据。当数据量较大时,缓存数据可能会失效(默认过期时间为 1 小时)。如果遇到大量数据写入丢失的情况,请将 sink.enable-2pc 配置为 false。 | | table.identifier | String | 是 | - | `SelectDB Cloud` 表的名称,格式为 `database.table` | | sink.enable-delete | bool | 否 | false | 是否启用删除功能。此选项要求 SelectDB Cloud 表启用批量删除功能,并且仅支持 Unique 模型。 | | sink.max-retries | int | 否 | 3 | 写入数据库失败时的最大重试次数 | | sink.buffer-size | int | 否 | 10 * 1024 * 1024 (1MB) | 用于流式加载的数据缓存缓冲区大小 | | sink.buffer-count | int | 否 | 10000 | 用于流式加载的数据缓存缓冲区数量 | | selectdb.config | map | 是 | - | 此选项用于在自动生成 SQL 时支持 `insert`、`delete` 和 `update` 等操作,并支持多种格式。 | ## 数据类型映射 | SelectDB Cloud 数据类型 | SeaTunnel 数据类型 | |--------------------------|-----------------------------------------| | BOOLEAN | BOOLEAN | | TINYINT | TINYINT | | SMALLINT | SMALLINT
    TINYINT | | INT | INT
    SMALLINT
    TINYINT | | BIGINT | BIGINT
    INT
    SMALLINT
    TINYINT | | LARGEINT | BIGINT
    INT
    SMALLINT
    TINYINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE
    FLOAT | | DECIMAL | DECIMAL
    DOUBLE
    FLOAT | | DATE | DATE | | DATETIME | TIMESTAMP | | CHAR | STRING | | VARCHAR | STRING | | STRING | STRING | | ARRAY | ARRAY | | MAP | MAP | | JSON | STRING | | HLL | 尚未支持 | | BITMAP | 尚未支持 | | QUANTILE_STATE | 尚未支持 | | STRUCT | 尚未支持 | #### 支持的导入数据格式 支持的格式包括 CSV 和 JSON ## 任务示例 ### 简单示例 > 以下示例描述了将多种数据类型写入 SelectDBCloud,用户需要在下游创建相应的表 ```hocon env { parallelism = 1 job.mode = "BATCH" checkpoint.interval = 10000 } source { FakeSource { row.num = 10 map.size = 10 array.size = 10 bytes.length = 10 string.length = 10 schema = { fields { c_map = "map<string, array<int>>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(16, 1)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } sink { SelectDBCloud { load-url = "warehouse_ip:http_port" jdbc-url = "warehouse_ip:mysql_port" cluster-name = "Cluster" table.identifier = "test.test" username = "admin" password = "******" selectdb.config { file.type = "json" } } } ``` ### 使用 JSON 格式导入数据 ``` sink { SelectDBCloud { load-url = "warehouse_ip:http_port" jdbc-url = "warehouse_ip:mysql_port" cluster-name = "Cluster" table.identifier = "test.test" username = "admin" password = "******" selectdb.config { file.type = "json" } } } ``` ### 使用 CSV 格式导入数据 ``` sink { SelectDBCloud { load-url = "warehouse_ip:http_port" jdbc-url = "warehouse_ip:mysql_port" cluster-name = "Cluster" table.identifier = "test.test" username = "admin" password = "******" selectdb.config { file.type = "csv" file.column_separator = "," file.line_delimiter = "\n" } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/SensorsData.md ================================================ import ChangeLog from '../changelog/connector-sensorsdata.md'; # SensorsData > SensorsData Sink 连接器 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) ## 描述 一个 Sink 插件,使用 SensorsData SDK 发送数据记录。 ## Sink 选项 | 参数名 | 类型 | 必须 | 默认值 | |---------------------------|---------|------|--------| | server_url | string | 是 | - | | bulk_size | int | 否 | 50 | | max_cache_row_size | int | 否 | 0 | | consumer | string | 否 | batch | | entity_name | string | 是 | users | | record_type | string | 是 | users | | schema | string | 是 | users | | distinct_id_column | string | 是 | - | | identity_fields | array | 是 | - | | property_fields | array | 是 | - | | event_name | string | 是 | - | | time_column | string | 是 | - | | time_free | boolean | 否 | false | | detail_id_column | string | 否 | - | | item_id_column | string | 否 | - | | item_type_column | string | 否 | - | | skip_error_record | boolean | 否 | false | | instant_events | array | 否 | - | | distinct_id_by_identities | boolean | 否 | false | | null_as_profile_unset | boolean | 否 | false | | common-options | | 否 | - | ## 参数解释 ### server_url [string] SensorsData 数据 Sink 地址,格式为 `https://${host}:8106/sa?project=${project}` ### bulk_size [int] SensorsData SDK 中触发刷新操作的阈值。当内存缓存队列达到此值时,缓存中的数据将被发送。默认值为 50。 ### max_cache_row_size [int] SensorsData SDK 的最大缓存刷新大小。如果超过此值,将立即触发刷新操作。默认值为 0,取决于 bulkSize。 ### consumer [string] 当 consumer 设置为 "console" 时,数据将输出到控制台而不是发送到服务器。 ### entity_name [string] 接收数据记录的 SensorsData 实体数据模型的实体名称。 ### record_type [string] SensorsData 实体数据模型的记录类型。 ### schema [string] SensorsData 实体数据模型的模式名称。 ### distinct_id_column [string] 用户实体的 distinct id 列。 ### identity_fields [array] 用户实体的身份字段。 ### property_fields [array] 数据记录的属性字段。支持的类型: - BOOLEAN - DECIMAL - INT - BIGINT - FLOAT - DOUBLE - NUMBER - STRING - DATE - TIMESTAMP - LIST - LIST_COMMA - LIST_SEMICOLON ### event_name [string] 目前支持两种格式: 1. 填入事件记录的名称。 2. 
使用来自上游数据的字段值作为事件名称,格式为 `${your field name}`,其中事件名称是上游数据列的值。 例如,上游数据如下: | name | prop1 | prop2 | |----------|-------|---------------| | Purchase | 16 | data-example1 | | Order | 23 | data-example2 | 如果将 `${name}` 设置为事件名称,第一行的事件名称为 "Purchase",第二行的事件名称为 "Order"。 ### time_column [string] 事件记录的时间列。 ### time_free [boolean] 启用历史数据模式。 ### detail_id_column [string] 用户实体的详细 id 列。 ### item_id_column [string] 项目实体的项目 id 列。 ### item_type_column [string] 项目实体的项目类型列。 ### skip_error_record [boolean] 是否忽略转换数据记录中的错误。 ### instant_events [array] 给定事件名称列表,将事件标记为即时事件。 ### distinct_id_by_identities [boolean] 启用后,此选项在 distinct_id_column 值为 null 时,自动使用 identity_fields 列中的值填充 distinct_id。这确保 SensorsData 接收到所需的非 null distinct_id 值。 ### null_as_profile_unset [boolean] 启用后,配置文件属性中的 null 值将转换为配置文件取消设置操作,有效地从配置文件中删除现有值。 ### 通用选项 Sink 插件通用参数,请参考 [Sink 通用选项](common-options.md) 详见 ## 示例 ### 基本事件跟踪 ```hocon sink { SensorsData { server_url = "http://10.1.136.63:8106/sa?project=default" time_free = true record_type = events schema = events event_name = "$AppStart" time_column = col_date distinct_id_column = col_id identity_fields = [ { source = col_id, target = "$identity_login_id" } { source = col_id, target = "$identity_distinct_id" } ] property_fields = [ { target = prop1, source = col1, type = INT } { target = prop2, source = col2, type = BIGINT } { target = prop3, source = col3, type = STRING } { target = prop4, source = col4, type = BOOLEAN } ] skip_error_record = true } } ``` ### 动态事件名称 ```hocon sink { SensorsData { server_url = "http://10.1.136.63:8106/sa?project=default" time_free = true record_type = events schema = events event_name = "${event_type}" # 使用来自数据的动态事件名称 time_column = event_timestamp distinct_id_column = user_id identity_fields = [ { source = user_id, target = "$identity_login_id" } { source = user_id, target = "$identity_distinct_id" } ] property_fields = [ { target = "price", source = amount, type = DECIMAL } { target = "category", source = product_category, type = STRING } { 
target = "device", source = device_type, type = STRING } ] instant_events = ["$AppStart", "$AppEnd"] # 将特定事件标记为即时事件 } } ``` ### 配置文件属性更新 ```hocon sink { SensorsData { server_url = "http://10.1.136.63:8106/sa?project=default" time_free = true entity_name = users record_type = profile schema = users distinct_id_column = user_id identity_fields = [ { source = email, target = "$identity_email" } { source = phone, target = "$identity_phone" } ] property_fields = [ { target = "name", source = full_name, type = STRING } { target = "age", source = user_age, type = INT } { target = "gender", source = user_gender, type = STRING } { target = "location", source = user_location, type = STRING } ] null_as_profile_unset = true # 当为 null 时删除属性 } } ``` ### 项目跟踪 ```hocon sink { SensorsData { server_url = "http://10.1.136.63:8106/sa?project=default" time_free = true record_type = items schema = items event_name = "$ItemViewed" time_column = view_time distinct_id_column = user_id identity_fields = [ { source = user_id, target = "$identity_login_id" } ] property_fields = [ { target = "view_duration", source = duration, type = INT } { target = "referrer", source = referrer_url, type = STRING } ] item_id_column = product_id item_type_column = product_type } } ``` ### 控制台输出(用于测试) ```hocon sink { SensorsData { server_url = "http://10.1.136.63:8106/sa?project=default" consumer = "console" # 输出到控制台而不是发送到服务器 record_type = events schema = events event_name = "$TestEvent" time_column = timestamp distinct_id_column = test_id property_fields = [ { target = "test", source = test_field, type = STRING } ] } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Sentry.md ================================================ import ChangeLog from '../changelog/connector-sentry.md'; # Sentry ## 描述 给哨兵写入消息. 
## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必需 | 默认值 | |-----------------------------|---------|----|---------------| | dsn | string | 是 | - | | env | string | 否 | - | | release | string | 否 | - | | cacheDirPath | string | 否 | - | | enableExternalConfiguration | boolean | 否 | - | | maxCacheItems | number | 否 | - | | flushTimeoutMillis | number | 否 | - | | maxQueueSize | number | 否 | - | | common-options | | 否 | - | ### dsn [string] DSN告诉SDK将事件发送到何处. ### env [string] 指定环境 ### release [string] 指定版本 ### cacheDirPath [string] 缓存脱机事件的缓存目录路径 ### enableExternalConfiguration [boolean] 是否启用从外部源加载属性。 ### maxCacheItems [number] 用于限制事件数量的最大缓存项,默认值为30 ### flushTimeoutMillis [number] 控制刷新前等待的毫秒数。Sentry SDK缓存来自后台队列的事件,并为该队列提供一定数量的待处理事件。默认值为15000=15s ### maxQueueSize [number] 将事件/信封刷新到磁盘之前的最大队列大小 ### common options 接收器插件常用参数,详见 [Sink 常见选项](../common-options/sink-common-options.md) ## 示例 ``` Sentry { dsn = "https://xxx@sentry.xxx.com:9999/6" enableExternalConfiguration = true maxCacheItems = 1000 env = prod } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/SftpFile.md ================================================ import ChangeLog from '../changelog/connector-file-sftp.md'; # SftpFile > Sftp file Sink 连接器 ## 描述 将数据输出到Sftp。 :::tip 如果你使用spark/flink,为了使用这个连接器,你必须确保你的spark/flink集群已经集成了hadoop。测试的hadoop版本是2.x。 如果你使用SeaTunnel引擎,当你下载并安装SeaTunnel引擎时,它会自动集成hadoop jar包。您可以在${SEATUNNEL_HOME}/lib下找到jar包。 ::: ## 主要特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 默认情况下,我们使用2PC commit来确保`精确一次` - [x] 文件格式类型 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] canal_json - [x] debezium_json - [x] maxwell_json ## 参数 | 名称 | 类型 | 是否必填 | 默认值 | 备注 | 
|---------------------------------------|---------|------|--------------------------------------------|-----------------------------------------------------------| | host | string | 是 | - | | | port | int | 是 | - | | | user | string | 是 | - | | | password | string | 是 | - | | | path | string | 是 | - | | | tmp_path | string | 是 | /tmp/seatunnel | 结果文件将首先写入临时路径,然后使用`mv`将临时目录剪切到目标目录。需要一个FTP目录。 | | custom_filename | boolean | 否 | false | 是否需要自定义文件名 | | file_name_expression | string | 否 | "${transactionId}" | 仅在custom_filename为true时使用 | | filename_time_format | string | 否 | "yyyy.MM.dd" | 仅在custom_filename为true时使用 | | file_format_type | string | 否 | "csv" | | | field_delimiter | string | 否 | '\001' | 仅当file_format_type为text时使用 | | row_delimiter | string | 否 | "\n" | 仅当file_format_type为 `text`、`csv`、`json` 时使用 | | have_partition | boolean | 否 | false | 是否需要处理分区。 | | partition_by | array | 否 | - | 只有在have_partition为true时才使用 | | partition_dir_expression | string | 否 | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | 只有在have_partition为true时才使用 | | is_partition_field_write_in_file | boolean | 否 | false | 只有在have_partition为true时才使用 | | sink_columns | array | 否 | | 当此参数为空时,所有字段都是sink列 | | is_enable_transaction | boolean | 否 | true | | | batch_size | int | 否 | 1000000 | | | compress_codec | string | 否 | none | | | common-options | object | 否 | - | | | max_rows_in_memory | int | 否 | - | 仅当file_format_type为excel时使用。 | | sheet_name | string | 否 | Sheet${Random number} | 仅当file_format_type为excel时使用。 | | csv_string_quote_mode | enum | 否 | MINIMAL | 仅当file_format_type为csv时使用。 | | xml_root_tag | string | 否 | RECORDS | 仅当file_format_type为xml时使用 | | xml_row_tag | string | 否 | RECORD | 仅当file_format_type为xml时使用 | | xml_use_attr_format | boolean | 否 | - | 仅当file_format_type为xml时使用 | | single_file_mode | boolean | 否 | false | 每个并行处理只会输出一个文件。启用此参数后,batch_size将不会生效。输出文件名没有文件块后缀。 | | create_empty_file_when_no_data | boolean | 否 | false | 当上游没有数据同步时,仍然会生成相应的数据文件。 | | 
parquet_avro_write_timestamp_as_int96 | boolean | 否 | false | 仅当file_format_type为parquet时使用 | | enable_header_write | boolean | 否 | false | 仅当file_format_type为text、csv时使用
    false:不写标头,true:写标头。 | | parquet_avro_write_fixed_as_int96 | array | 否 | - | 仅当file_format_type为parquet时使用 | | encoding | string | 否 | "UTF-8" | 仅当file_format_type为json、text、csv、xml时使用。 | | schema_save_mode | string | 否 | CREATE_SCHEMA_WHEN_NOT_EXIST | 现有目录处理方式 | | data_save_mode | string | 否 | APPEND_DATA | 现有数据处理方式 | | merge_update_event | boolean | 否 | false | 仅当file_format_type为canal_json、debezium_json、maxwell_json. | ### host [string] 目标sftp主机,必填。 ### port [int] 目标sftp端口,必填。 ### user [string] 目标sftp用户,必填。 ### password [string] 目标sftp密码,必填。 ### path [string] 目标目录路径,必填。 ### custom_filename [boolean] 是否自定义文件名 ### file_name_expression [string] 仅在`custom_filename`为`true`时使用。 `file_name_expression`描述了将在`path`中创建的文件表达式。我们可以在`file_name_expression`中添加变量`${now}`或`${uuid}`,类似于`test_${uuid}_${now}`, `${now}`表示当前时间,其格式可以通过指定选项`filename_time_format`来定义。 请注意,如果`is_enable_transaction`为`true`,我们将自动添加`${transactionId}_`在文件的开头。 ### filename_time_format [string] 仅在`custom_filename`为`true`时使用。 当`file_name_expression`参数中的格式为`xxxx-${now}`时,`filename_time_format`可以指定路径的时间格式,默认值为`yyyy.MM.dd`。常用的时间格式如下: | Symbol | Description | |--------|--------------------| | y | Year | | M | Month | | d | Day of month | | H | Hour in day (0-23) | | m | Minute in hour | | s | Second in minute | ### file_format_type [string] 我们支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `canal_json` `debezium_json` `maxwell_json` 请注意,最终文件名将以file_format_type的后缀结尾,文本文件的后缀为`txt`。 ### field_delimiter [string] 数据行中列之间的分隔符。仅在`text`文件格式中需要。 ### row_delimiter [string] 文件中行之间的分隔符。仅在 `text`、`csv`、`json` 文件格式中需要。 ### have_partition [boolean] 是否需要处理分区。 ### partition_by [array] 仅在`have_partition`为`true`时使用。 根据所选字段对数据进行分区。 ### partition_dir_expression [string] 仅在`have_partition`为`true`时使用。 如果指定了`partition_by`,我们将根据分区信息生成相应的分区目录,并将最终文件放置在分区目录中。 默认的`partition_dir_expression`是`${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`。`k0`是第一个分区字段,`v0`是第一个划分字段的值。 ### is_partition_field_write_in_file [boolean] 
仅在`have_partition`为`true`时使用。 如果`is_partition_field_write_in_file`为`true`,则分区字段及其值将写入数据文件。 例如,如果你想写一个Hive数据文件,它的值应该是`false`。 ### sink_columns [array] 哪些列需要写入文件,默认值是从`Transform`或`Source`获取的所有列。 字段的顺序决定了文件实际写入的顺序。 ### is_enable_transaction [boolean] 如果`is_enable_transaction`为`true`,我们将确保数据在写入目标目录时不会丢失或重复。 请注意,如果`is_enable_transaction`为`true`,我们将自动添加`${transactionId}_`在文件的开头。 现在只支持`true`。 ### batch_size [int] 文件中的最大行数。对于SeaTunnel引擎,文件中的行数由`batch_size`和`checkpoint.interval`共同决定。如果`checkpoint.interval`的值足够大,sink writer将在文件中写入行,直到文件中的行大于`batch_size`。如果`checkpoint.interval`较小,则接收器写入程序将在新的检查点触发时创建一个新文件。 ### compress_codec [string] 文件的压缩编解码器和支持的详细信息如下所示: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc: `lzo` `snappy` `lz4` `zlib` `none` - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` 提示:excel类型不支持任何压缩格式 ### common options Sink插件常用参数,请参考[Sink common Options](../common-options/sink-common-options.md)了解详细信息。 ### max_rows_in_memory 当文件格式为Excel时,内存中可以缓存的最大数据项数。 ### sheet_name 编写工作簿的工作表 ### csv_string_quote_mode [string] 当文件格式为CSV时,CSV的字符串引用模式。 - ALL:所有字符串字段都将被引用。 - MINIMAL:包含特殊字符的引号字段,如字段分隔符、引号字符或行分隔符字符串中的任何字符。 - NONE:从不引用字段。当分隔符出现在数据中时,打印机会用转义符作为前缀。如果未设置转义符,格式验证将抛出异常。 ### xml_root_tag [string] 指定XML文件中根元素的标记名。 ### xml_row_tag [string] 指定XML文件中数据行的标记名称。 ### xml_use_attr_format [boolean] 指定是否使用标记属性格式处理数据。 ### parquet_avro_write_timestamp_as_int96 [boolean] 支持从时间戳写入Parquet INT96,仅适用于parquet文件。 ### parquet_avro_write_fixed_as_int96 [array] 支持从12-byte字段写入Parquet INT96,仅适用于parquet文件。 ### enable_header_write [boolean] 仅当file_format_type为text、csv时使用。false:不写标头,true:写标头。 ### encoding [string] 仅当file_format_type为json、text、csv、xml时使用。 要写入的文件的编码。此参数将由`Charset.forName(encoding)`解析。 ### schema_save_mode [string] 现有的目录处理方法。 - RECREATE_SCHEMA:当目录不存在时创建,当目录存在时删除并重新创建 - CREATE_SCHEMA_WHEN_NOT_EXIST:当目录不存在时创建,当目录存在时跳过 - ERROR_WHEN_SCHEMA_NOT_EXIST:当目录不存在时,将报告错误 - IGNORE:忽略对表的处理 ### data_save_mode [string] 现有的数据处理方法。 -DROP_DATA:保留目录并删除数据文件 -APPEND_DATA:保留目录,保留数据文件 
-ERROR_WHEN_DATA_EXISTS:当有数据文件时,会报告错误 ### merge_update_event [boolean] 仅当file_format_type为canal_json、debezium_json、maxwell_json时使用. 设置成true,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 会合并成 UPDATE; 设置成false,序列化数据时,UPDATE_AFTER 和 UPDATE_BEFORE 不会合并; ## 示例 对于具有`have_partition`、`custom_filename`和`sink_columns`的文本文件格式 ```bash SftpFile { host = "xxx.xxx.xxx.xxx" port = 22 user = "username" password = "password" path = "/data/sftp/seatunnel/job1" tmp_path = "/data/sftp/seatunnel/tmp" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true } ``` 当我们的源端是多个表,并且希望不同的表达式到不同的目录时,我们可以这样配置 ```hocon SftpFile { host = "xxx.xxx.xxx.xxx" port = 22 user = "username" password = "password" path = "/data/sftp/seatunnel/job1/${table_name}" tmp_path = "/data/sftp/seatunnel/tmp" file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true custom_filename = true file_name_expression = "${transactionId}_${now}" filename_time_format = "yyyy.MM.dd" sink_columns = ["name","age"] is_enable_transaction = true schema_save_mode=RECREATE_SCHEMA data_save_mode=DROP_DATA } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Slack.md ================================================ import ChangeLog from '../changelog/connector-slack.md'; # Slack > Slack 接收器连接器 ## 支持以下引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) ## 描述 用于将数据发送到Slack Channel。同时支持流处理和批处理模式。 > 例如,如果来自上游的数据是 [`age: 12, name: huan`], 则发送到Slack Channel的内容如下: `{"name":"huan","age":12}` ## 数据类型映射 所有数据类型都映射到字符串. ## 选项 | 名称 | 类型 | 必需 | 默认值 | 描述 | |----------------|--------|----------|---------|----------------------------------------------------------------| | webhooks_url | String | Yes | - | Slack webhook 的 url | | oauth_token | String | Yes | - | 用于实际身份验证的Slack oauth令牌 | | slack_channel | String | Yes | - | 用于数据写入的slack channel | | common-options | | no | - | 接收器插件常用参数, 详见 [Sink 常见选项](../common-options/sink-common-options.md) | ## 任务示例 ### 简单示例 ```hocon sink { SlackSink { webhooks_url = "https://hooks.slack.com/services/xxxxxxxxxxxx/xxxxxxxxxxxx/xxxxxxxxxxxxxxxx" oauth_token = "xoxp-xxxxxxxxxx-xxxxxxxx-xxxxxxxxx-xxxxxxxxxxx" slack_channel = "channel name" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Sls.md ================================================ import ChangeLog from '../changelog/connector-sls.md'; # Sls > Sls sink connector ## Support Those Engines > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) ## 描述 Sink connector for Aliyun Sls. 从写入数据到阿里云Sls日志服务 为了使用Sls连接器,需要以下依赖关系。 它们可以通过install-plugin.sh或Maven中央存储库下载。 | Datasource | Supported Versions | Maven | |------------|--------------------|-----------------------------------------------------------------------------------| | Sls | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-sls) | ## 支持的数据源信息 | Name | Type | Required | Default | Description | |-------------------------------------|----------|----------|-------------------|------------------------------------------------------------------------------------------------------------------------------------| | project | String | Yes | - | [阿里云 Sls 项目](https://help.aliyun.com/zh/sls/user-guide/manage-a-project?spm=a2c4g.11186623.0.0.6f9755ebyfaYSl) | | logstore | String | Yes | - | [阿里云 Sls 日志库](https://help.aliyun.com/zh/sls/user-guide/manage-a-logstore?spm=a2c4g.11186623.0.0.13137c08nfuiBC) | | endpoint | String | Yes | - | [阿里云访问服务点](https://help.aliyun.com/zh/sls/developer-reference/api-sls-2020-12-30-endpoint?spm=a2c4g.11186623.0.0.548945a8UyJULa) | | access_key_id | String | Yes | - | [阿里云访问用户ID](https://help.aliyun.com/zh/ram/user-guide/create-an-accesskey-pair?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#task-2245479) | | access_key_secret | String | Yes | - | [阿里云访问用户密码](https://help.aliyun.com/zh/ram/user-guide/create-an-accesskey-pair?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#task-2245479) | | source | String | No | SeaTunnel-Source | 在sls中数据来源标记 | | topic | String | No | SeaTunnel-Topic | 在sls中数据主题标记 | ## 任务示例 ### 简单示例 > 此示例写入sls的logstore1的数据。如果您尚未安装和部署SeaTunnel,则需要按照安装SeaTunnel中的说明安装和部署SeaTunnel。然后按照[快速启动SeaTunnel引擎](../../getting-started/locally/quick-start-seatunnel-engine.md)中的说明运行此作业。 
[创建RAM用户及授权](https://help.aliyun.com/zh/sls/create-a-ram-user-and-authorize-the-ram-user-to-access-log-service?spm=a2c4g.11186623.0.i4), 请确认RAM用户有足够的权限来读取及管理数据,参考:[RAM自定义授权示例](https://help.aliyun.com/zh/sls/use-custom-policies-to-grant-permissions-to-a-ram-user?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#reference-s3z-m1l-z2b) ```hocon # Defining the runtime environment env { parallelism = 2 job.mode = "STREAMING" checkpoint.interval = 30000 } source { FakeSource { row.num = 10 map.size = 10 array.size = 10 bytes.length = 10 string.length = 10 schema = { fields = { id = "int" name = "string" description = "string" weight = "string" } } } } sink { Sls { endpoint = "cn-hangzhou-intranet.log.aliyuncs.com" project = "project1" logstore = "logstore1" access_key_id = "xxxxxxxxxxxxxxxxxxxxxxxx" access_key_secret = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Snowflake.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Snowflake > JDBC Snowflake Sink连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [(CDC)](../../introduction/concepts/connector-v2-features.md) ## 描述 通过JDBC写入数据。支持批处理模式和流处理模式,支持并发写入。 ## 支持的数据源列表 | 数据源 | 支持的版本 | 驱动类 | URL | Maven | |------------|--------------------------------------------------------------|---------------------------------------------|--------------------------------------------------------------|---------------------------------------------------------------------------| | Snowflake | 不同依赖版本对应不同的驱动类。 | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | [下载](https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc) | ## 数据库依赖 > 请下载支持列表中对应的'Maven'依赖,并将其复制到'$SEATUNNEL_HOME/plugins/jdbc/lib/'工作目录下
    > 例如Snowflake数据源:cp snowflake-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## 数据类型映射 | Snowflake 数据类型 | SeaTunnel 数据类型 | |--------------------------------------------------------------------------|--------------------| | BOOLEAN | BOOLEAN | | TINYINT
    SMALLINT
    BYTEINT
    | SHORT_TYPE | | INT
    INTEGER
    | INT | | BIGINT | LONG | | DECIMAL
    NUMERIC
    NUMBER
    | DECIMAL(x,y) | | DECIMAL(x,y)(获取指定列的大小>38) | DECIMAL(38,18) | | REAL
    FLOAT4 | FLOAT | | DOUBLE
    DOUBLE PRECISION
    FLOAT8
    FLOAT
    | DOUBLE | | CHAR
    CHARACTER
    VARCHAR
    STRING
    TEXT
    VARIANT
    OBJECT| STRING | | DATE | DATE | | TIME | TIME | | DATETIME
    TIMESTAMP
    TIMESTAMP_LTZ
    TIMESTAMP_NTZ
    TIMESTAMP_TZ | TIMESTAMP | | BINARY
    VARBINARY
    GEOGRAPHY
    GEOMETRY | BYTES | ## 配置选项 | 名称 | 类型 | 必填 | 默认值 | 描述 | |------------------------------|---------|------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC连接的URL。参考示例:jdbc:snowflake://.snowflakecomputing.com | | driver | String | 是 | - | 用于连接远程数据源的JDBC类名,
    如果使用Snowflake,值为`net.snowflake.client.jdbc.SnowflakeDriver`。 | | username | String | 否 | - | 连接实例的用户名 | | password | String | 否 | - | 连接实例的密码 | | query | String | 否 | - | 使用此SQL将上游输入数据写入数据库。例如`INSERT ...`,`query`具有更高的优先级 | | database | String | 否 | - | 使用此`database`和`table-name`自动生成SQL并接收上游输入数据写入数据库。
    此选项与`query`互斥,且具有更高的优先级。 | | table | String | 否 | - | 使用`database`和此`table-name`自动生成SQL并接收上游输入数据写入数据库。
    此选项与`query`互斥,且具有更高的优先级。 | | primary_keys | Array | 否 | - | 此选项用于在自动生成SQL时支持`insert`、`delete`和`update`等操作。 | | connection_check_timeout_sec | Int | 否 | 30 | 用于验证连接的操作的等待时间(秒)。 | | max_retries | Int | 否 | 0 | 提交失败(executeBatch)的重试次数 | | batch_size | Int | 否 | 1000 | 对于批处理写入,当缓冲的记录数达到`batch_size`或时间达到`checkpoint.interval`时,
    数据将被刷新到数据库中 | | max_commit_attempts | Int | 否 | 3 | 事务提交失败的重试次数 | | transaction_timeout_sec | Int | 否 | -1 | 事务打开后的超时时间,默认为-1(永不超时)。注意,设置超时可能会影响
    精确一次语义 | | auto_commit | Boolean | 否 | true | 默认启用自动事务提交 | | properties | Map | 否 | - | 额外的连接配置参数,当properties和URL中有相同参数时,优先级由驱动程序的
    具体实现决定。例如,在MySQL中,properties优先于URL。 | | common-options | | 否 | - | 接收器插件通用参数,详情请参考[接收器通用选项](../common-options/sink-common-options.md) | | enable_upsert | Boolean | 否 | true | 通过主键存在启用upsert,如果任务没有键重复数据,将此参数设置为`false`可以加快数据导入速度 | ## 提示 > 如果未设置`partition_column`,将以单并发运行,如果设置了`partition_column`,将根据任务的并发度并行执行。 ## 任务示例 ### 简单示例 > 此示例定义了一个SeaTunnel同步任务,通过FakeSource自动生成数据并发送到JDBC Sink。FakeSource总共生成16行数据(row.num=16),每行有两个字段,name(字符串类型)和age(int类型)。最终目标表`test_table`中也将有16行数据。在运行此作业之前,您需要在Snowflake数据库中创建数据库`test`和表`test_table`。如果您尚未安装和部署SeaTunnel,请按照[安装SeaTunnel](../../getting-started/locally/deployment.md)中的说明进行安装和部署。然后按照[使用SeaTunnel Engine快速入门](../../getting-started/locally/quick-start-seatunnel-engine.md)中的说明运行此作业。 ``` # 定义运行时环境 env { parallelism = 1 job.mode = "BATCH" } source { # 这是一个示例源插件,**仅用于测试和演示功能源插件** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } # 如果您想了解更多关于如何配置SeaTunnel的信息,并查看完整的源插件列表, # 请访问 https://seatunnel.apache.org/docs/connectors/source } transform { # 如果您想了解更多关于如何配置SeaTunnel的信息,并查看完整的转换插件列表, # 请访问 https://seatunnel.apache.org/docs/transforms } sink { jdbc { url = "jdbc:snowflake://.snowflakecomputing.com" driver = "net.snowflake.client.jdbc.SnowflakeDriver" username = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" } # 如果您想了解更多关于如何配置SeaTunnel的信息,并查看完整的接收器插件列表, # 请访问 https://seatunnel.apache.org/docs/connectors/sink } ``` ### CDC(变更数据捕获)事件 > 我们也支持CDC变更数据。在这种情况下,您需要配置`database`、`table`和`primary_keys`。 ``` sink { jdbc { url = "jdbc:snowflake://.snowflakecomputing.com" driver = "net.snowflake.client.jdbc.SnowflakeDriver" username = "root" password = "123456" generate_sink_sql = true # 您需要同时配置database和table database = test table = sink_table primary_keys = ["id","name"] } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Socket.md ================================================ import ChangeLog from 
'../changelog/connector-socket.md'; # Socket > Socket 数据接收器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## 描述 用于向Socket Server发送数据。同时支持流处理和批处理模式。 > 例如,如果来自上游的数据是[`age:17,name:jared`],则发送到Socket服务器的内容如下:`{"name":"jared","age":17}` ## Sink 选项 | 名称 | 类型 | 是否必传 | 默认值 | 描述 | |----------------|---------|----------|---------|-----------------------------------------------------------------------------------------------------------------| | host | String | 是 | | socket 服务器主机 | | port | Integer | 是 | | socket 服务器端口 | | max_retries | Integer | 否 | 3 | 发送记录失败时的重试次数 | | common-options | | 否 | - | Sink插件常用参数,详见[Sink common Options](../common-options/sink-common-options.md) | ## 任务示例 > 这是写入Socket端的随机生成数据 ```hocon env { parallelism = 1 job.mode = "STREAMING" } source { FakeSource { plugin_output = "fake" schema = { fields { name = "string" age = "int" } } } } sink { Socket { host = "localhost" port = 9999 } } ``` * 启动端口侦听 ```shell nc -l -v 9999 ``` * 启动SeaTunnel任务 * Socket 服务器控制台打印数据 ```text {"name":"jared","age":17} ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/SqlServer.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # SQLServer > JDBC SQLServer Sink 连接器 ## 支持的 SQL Server 版本 - server:2008(或更高版本,仅供参考) ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 通过 JDBC 写入数据。支持批处理和流处理模式,支持并发写入,支持精确一次语义(使用 XA 事务保证)。 ## 使用依赖 ### 对于 Spark/Flink 引擎 > 1. 需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 ### 对于 SeaTunnel Zeta 引擎 > 1. 需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 ## 主要特性 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) > 使用 `Xa 事务` 来保证 `精确一次`。因此仅支持支持 `Xa 事务` 的数据库。可以通过设置 `is_exactly_once=true` 来启用。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动类名 | URL 格式 | Maven 依赖 | |-----------|--------------------------|-----------------------------------------------|--------------------------------------------|---------------------------------------------------------------------------------------------| | SQL Server | 支持版本 >= 2008 | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433 | [下载](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) | ## 数据库依赖 > 请下载支持列表中对应的 'Maven' 依赖,并将其复制到 `$SEATUNNEL_HOME/plugins/jdbc/lib/` 工作目录中。
    > 例如 SQL Server 数据源:`cp mssql-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/` ## 数据类型映射 | SQL Server 数据类型 | SeaTunnel 数据类型 | |-----------------------------------------|------------------------------------------------------------------------------------------------------| | BIT | BOOLEAN | | TINYINT
    SMALLINT | SHORT | | INTEGER | INT | | BIGINT | LONG | | DECIMAL
    NUMERIC
    MONEY
    SMALLMONEY | DECIMAL((获取指定列的列大小)+1,
    (获取指定列的小数点右侧的位数)) | | REAL | FLOAT | | FLOAT | DOUBLE | | CHAR
    NCHAR
    VARCHAR
    NTEXT
    NVARCHAR
    TEXT | STRING | | DATE | LOCAL_DATE | | TIME | LOCAL_TIME | | DATETIME
    DATETIME2
    SMALLDATETIME
    DATETIMEOFFSET | LOCAL_DATE_TIME | | TIMESTAMP
    BINARY
    VARBINARY
    IMAGE
    UNKNOWN | 尚未支持 | ## 接收器选项 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |------------------------------|---------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC 连接的 URL。参考示例:`jdbc:sqlserver://localhost:1433;databaseName=mydatabase` | | driver | String | 是 | - | 用于连接远程数据源的 JDBC 类名,如果使用 SQL Server,值为 `com.microsoft.sqlserver.jdbc.SQLServerDriver`。 | | username | String | 否 | - | 连接实例的用户名 | | password | String | 否 | - | 连接实例的密码 | | query | String | 否 | - | 使用此 SQL 将上游输入数据写入数据库。例如 `INSERT ...`,`query` 优先级更高。 | | database | String | 否 | - | 使用此 `database` 和 `table-name` 自动生成 SQL 并接收上游输入数据写入数据库。此选项与 `query` 互斥,且优先级更高。 | | table | String | 否 | - | 使用 `database` 和此 `table-name` 自动生成 SQL 并接收上游输入数据写入数据库。此选项与 `query` 互斥,且优先级更高。 | | primary_keys | Array | 否 | - | 此选项用于在自动生成 SQL 时支持 `insert`、`delete` 和 `update` 等操作。 | | connection_check_timeout_sec | Int | 否 | 30 | 用于验证连接完成的数据库操作的等待时间(秒)。 | | max_retries | Int | 否 | 0 | 提交失败(executeBatch)的重试次数。 | | batch_size | Int | 否 | 1000 | 对于批量写入,当缓冲的记录数达到 `batch_size` 或时间达到 `checkpoint.interval` 时,数据将被刷新到数据库中。 | | is_exactly_once | Boolean | 否 | false | 是否启用精确一次语义,将使用 Xa 事务。如果启用,需要设置 `xa_data_source_class_name`。 | | generate_sink_sql | Boolean | 否 | false | 根据要写入的数据库表生成 SQL 语句。 | | xa_data_source_class_name | String | 否 | - | 数据库驱动的 XA 数据源类名,例如 SQL Server 为 `com.microsoft.sqlserver.jdbc.SQLServerXADataSource`,其他数据源请参考附录。 | | max_commit_attempts | Int | 否 | 3 | 事务提交失败的重试次数。 | | transaction_timeout_sec | Int | 否 | -1 | 事务打开后的超时时间,默认为 -1(永不超时)。注意:设置超时可能会影响精确一次语义。 | | auto_commit | Boolean | 否 | true | 默认启用自动事务提交。 | | common-options | | 否 | - | 接收器插件通用参数,详情请参考 [Sink Common Options](../common-options/sink-common-options.md)。 | | enable_upsert | Boolean | 否 | true | 通过主键存在启用 upsert。如果任务中没有键重复数据,将此参数设置为 `false` 可以加快数据导入速度。 | ## 提示 > 如果未设置 
`partition_column`,将以单并发运行;如果设置了 `partition_column`,将根据任务的并发度并行执行。 ## 任务示例 ### 简单示例 > 这是一个读取 SQL Server 数据并直接插入到另一个表的示例 ``` env { # 可以在此设置引擎配置 parallelism = 10 } source { # 这是一个示例源插件,**仅用于测试和演示功能** Jdbc { driver = com.microsoft.sqlserver.jdbc.SQLServerDriver url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = SA password = "Y.sa123456" query = "select * from column_type_test.dbo.full_types_jdbc" # 并行分片读取字段 partition_column = "id" # 分片数量 partition_num = 10 } # 如果想了解更多关于如何配置 SeaTunnel 的信息,并查看完整的源插件列表, # 请访问 https://seatunnel.apache.org/docs/connectors/source/Jdbc } transform { # 如果想了解更多关于如何配置 SeaTunnel 的信息,并查看完整的转换插件列表, # 请访问 https://seatunnel.apache.org/docs/transforms/sql } sink { Jdbc { driver = com.microsoft.sqlserver.jdbc.SQLServerDriver url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = SA password = "Y.sa123456" query = "insert into full_types_jdbc_sink( id, val_char, val_varchar, val_text, val_nchar, val_nvarchar, val_ntext, val_decimal, val_numeric, val_float, val_real, val_smallmoney, val_money, val_bit, val_tinyint, val_smallint, val_int, val_bigint, val_date, val_time, val_datetime2, val_datetime, val_smalldatetime ) values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? 
)" } # 如果想了解更多关于如何配置 SeaTunnel 的信息,并查看完整的接收器插件列表, # 请访问 https://seatunnel.apache.org/docs/connectors/sink/Jdbc } ``` ### CDC(变更数据捕获)事件 > 我们也支持 CDC 变更数据。在这种情况下,需要配置 `database`、`table` 和 `primary_keys`。 ``` Jdbc { plugin_input = "customers" driver = com.microsoft.sqlserver.jdbc.SQLServerDriver url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = SA password = "Y.sa123456" generate_sink_sql = true database = "column_type_test" table = "dbo.full_types_sink" batch_size = 100 primary_keys = ["id"] } ``` ### 精确一次接收器 > 事务性写入可能较慢,但数据更准确 ``` Jdbc { driver = com.microsoft.sqlserver.jdbc.SQLServerDriver url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = SA password = "Y.sa123456" query = "insert into full_types_jdbc_sink( id, val_char, val_varchar, val_text, val_nchar, val_nvarchar, val_ntext, val_decimal, val_numeric, val_float, val_real, val_smallmoney, val_money, val_bit, val_tinyint, val_smallint, val_int, val_bigint, val_date, val_time, val_datetime2, val_datetime, val_smalldatetime ) values( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )" is_exactly_once = "true" xa_data_source_class_name = "com.microsoft.sqlserver.jdbc.SQLServerXADataSource" } # 如果想了解更多关于如何配置 SeaTunnel 的信息,并查看完整的接收器插件列表, # 请访问 https://seatunnel.apache.org/docs/connectors/sink/Jdbc ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/StarRocks.md ================================================ import ChangeLog from '../changelog/connector-starrocks.md'; # StarRocks > StarRocks 数据接收器 ## 引擎支持 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [ ] [精准一次](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) ## 描述 该接收器用于将数据写入到StarRocks中。支持批和流两种模式。 StarRocks数据接收器内部实现采用了缓存,通过stream load将数据批导入。 ## 依赖 ### 对于 Spark/Flink > 1. 你需要下载 [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 并添加到目录 `${SEATUNNEL_HOME}/plugins/`. ### 对于 SeaTunnel Zeta > 1. 你需要下载 [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 并添加到目录 `${SEATUNNEL_HOME}/lib/`. ## 接收器选项 | 名称 | 类型 | 是否必须 | 默认值 | Description | |-----------------------------|---------|------|------------------------------|---------------------------------------------------------------------------------------------------------------------| | nodeUrls | list | yes | - | `StarRocks`集群地址, 格式为 `["fe_ip:fe_http_port", ...]` | | base-url | string | yes | - | JDBC URL样式的连接信息。如:`jdbc:mysql://localhost:9030/` 或 `jdbc:mysql://localhost:9030` 或 `jdbc:mysql://localhost:9030/db` | | username | string | yes | - | 目标`StarRocks` 用户名 | | password | string | yes | - | 目标`StarRocks` 密码 | | database | string | yes | - | 指定目标 StarRocks 表所在的数据库的名称 | | table | string | no | - | 指定目标 StarRocks 表的名称, 如果没有设置该值,则表名与上游表名相同 | | labelPrefix | string | no | - | StarRocks stream load作业标签前缀 | | batch_max_rows | long | no | 1024 | 在批写情况下,当缓冲区数量达到`batch_max_rows`数量或`batch_max_bytes`字节大小或者时间达到`checkpoint.interval`时,数据会被刷新到StarRocks | | batch_max_bytes | int | no | 5 * 1024 * 1024 | 在批写情况下,当缓冲区数量达到`batch_max_rows`数量或`batch_max_bytes`字节大小或者时间达到`checkpoint.interval`时,数据会被刷新到StarRocks | | max_retries | int | no | - | 数据写入StarRocks失败后的重试次数 | | retry_backoff_multiplier_ms | int | no | - | 用作生成下一个退避延迟的乘数 | | max_retry_backoff_ms | int | no | - | 向StarRocks发送重试请求之前的等待时长 | | enable_upsert_delete | boolean | no | false | 是否开启upsert/delete事件的同步,仅仅支持主键模型的表 | | save_mode_create_template | string | no | 参见表下方的说明 | 参见表下方的说明 | | starrocks.config | map | no | - | stream load 
`data_desc`参数 | | http_socket_timeout_ms | int | no | 180000 | http socket超时时间,默认为3分钟 | | schema_save_mode | Enum | no | CREATE_SCHEMA_WHEN_NOT_EXIST | 在同步任务打开之前,针对目标端已存在的表结构选择不同的处理方法 | | data_save_mode | Enum | no | APPEND_DATA | 在同步任务打开之前,针对目标端已存在的数据选择不同的处理方法 | | custom_sql | String | no | - | 当data_save_mode设置为CUSTOM_PROCESSING时,必须同时设置CUSTOM_SQL参数。CUSTOM_SQL的值为可执行的SQL语句,在同步任务开启前SQL将会被执行 | ### save_mode_create_template StarRocks数据接收器使用模板,在有需要的时候也可以修改模板,并结合上游数据类型和结构生成表的创建语句来自动创建StarRocks表。当前仅在多表模式下有效。 默认模板如下: ```sql CREATE TABLE IF NOT EXISTS `${database}`.`${table_name}` ( ${rowtype_primary_key}, ${rowtype_fields} ) ENGINE=OLAP PRIMARY KEY (${rowtype_primary_key}) COMMENT '${comment}' DISTRIBUTED BY HASH (${rowtype_primary_key})PROPERTIES ( "replication_num" = "1" ) ``` 在模板中添加自定义字段,比如说加上`id`字段的修改模板如下: ```sql CREATE TABLE IF NOT EXISTS `${database}`.`${table_name}` ( id, ${rowtype_fields} ) ENGINE = OLAP COMMENT '${comment}' DISTRIBUTED BY HASH (${rowtype_primary_key}) PROPERTIES ( "replication_num" = "1" ); ``` StarRocks数据接收器根据上游数据自动获取相应的信息来填充模板,并且会移除`rowtype_fields`中的id字段信息。使用此方法可用来为自定义字段修改类型及相关属性。 可以使用的占位符有: - database: 上游数据模式的库名称 - table_name: 上游数据模式的表名称 - rowtype_fields: 上游数据模式的所有字段信息,连接器会将字段信息自动映射到StarRocks对应的类型 - rowtype_primary_key: 上游数据模式的主键信息,结果可能是列表 - rowtype_unique_key: 上游数据模式的唯一键信息,结果可能是列表 - comment: 上游数据模式的注释信息 ### table [string] 使用选项参数`database`和`table-name`自动生成SQL,并接收上游输入数据写入StarRocks中。 此选项与 `query` 是互斥的,且具有更高的优先级。 table选项参数可以填入任意表名,这个名字最终会被用作目标表的表名,并且支持变量(`${table_name}`,`${schema_name}`)。 替换规则如下:`${schema_name}` 将替换传递给目标端的 SCHEMA 名称,`${table_name}` 将替换传递给目标端的表名。 例如: 1. test_${schema_name}_${table_name}_test 2. sink_sinktable 3. 
ss_${table_name} ### schema_save_mode [Enum] 在同步任务打开之前,针对目标端已存在的表结构选择不同的处理方法。可选值有: `RECREATE_SCHEMA` :不存在的表会直接创建,已存在的表会删除并根据参数重新创建 `CREATE_SCHEMA_WHEN_NOT_EXIST` :忽略已存在的表,不存在的表会直接创建 `ERROR_WHEN_SCHEMA_NOT_EXIST` :当有不存在的表时会直接报错 `IGNORE` :忽略对表的处理 ### data_save_mode [Enum] 在同步任务打开之前,针对目标端已存在的数据选择不同的处理方法。可选值有: `DROP_DATA`: 保存数据库结构,但是会删除表中存量数据 `APPEND_DATA`:保存数据库结构和相关的表存量数据 `CUSTOM_PROCESSING`:自定义处理 `ERROR_WHEN_DATA_EXISTS`:当对应表存在数据时直接报错 ### custom_sql [String] 当data_save_mode设置为CUSTOM_PROCESSING时,必须同时设置CUSTOM_SQL参数。CUSTOM_SQL的值为可执行的SQL语句,在同步任务开启前SQL将会被执行。 ## 数据类型映射 | StarRocks数据类型 | SeaTunnel数据类型 | |---------------|---------------| | BOOLEAN | BOOLEAN | | TINYINT | TINYINT | | SMALLINT | SMALLINT | | INT | INT | | BIGINT | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | DATE | STRING | | TIME | STRING | | DATETIME | STRING | | STRING | STRING | | ARRAY | STRING | | MAP | STRING | | BYTES | STRING | #### 支持导入的数据格式 StarRocks数据接收器支持的格式有CSV和JSON格式。 ## 任务示例 ### 简单示例 > 接下来给出一个示例,该示例包含多种数据类型的数据写入,且用户需要为目标端下游创建相应表 ```hocon env { parallelism = 1 job.mode = "BATCH" checkpoint.interval = 10000 } source { FakeSource { row.num = 10 map.size = 10 array.size = 10 bytes.length = 10 string.length = 10 schema = { fields { c_map = "map>" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(16, 1)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] base-url = "jdbc:mysql://e2e_starRocksdb:9030/" username = root password = "" database = "test" table = "e2e_table_sink" batch_max_rows = 10 starrocks.config = { format = "JSON" strip_outer_array = true } } } ``` ### 支持写入cdc变更事件(INSERT/UPDATE/DELETE)示例 ```hocon sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] base-url = "jdbc:mysql://e2e_starRocksdb:9030/" username = root password = "" database = "test" 
table = "e2e_table_sink" ... // 支持upsert/delete事件的同步(需要将选项参数enable_upsert_delete设置为true),仅支持表引擎为主键模型 enable_upsert_delete = true } } ``` ### JSON格式数据导入示例 ``` sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] base-url = "jdbc:mysql://e2e_starRocksdb:9030/" username = root password = "" database = "test" table = "e2e_table_sink" batch_max_rows = 10 starrocks.config = { format = "JSON" strip_outer_array = true } } } ``` ### CSV格式数据导入示例 ``` sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] base-url = "jdbc:mysql://e2e_starRocksdb:9030/" username = root password = "" database = "test" table = "e2e_table_sink" batch_max_rows = 10 starrocks.config = { format = "CSV" column_separator = "\\x01" row_delimiter = "\\x02" } } } ``` ### 使用save_mode的示例 ``` sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] base-url = "jdbc:mysql://e2e_starRocksdb:9030/" username = root password = "" database = "test" table = "test_${schema_name}_${table_name}" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode="APPEND_DATA" batch_max_rows = 10 starrocks.config = { format = "CSV" column_separator = "\\x01" row_delimiter = "\\x02" } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/TDengine.md ================================================ import ChangeLog from '../changelog/connector-tdengine.md'; # TDengine > TDengine 数据接收器 ## 描述 用于将数据写入TDengine。 ## 主要特性 - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 是否必传 | 默认值 | |----------|--------|----------|---------------| | url | string | 是 | - | | username | string | 是 | - | | password | string | 是 | - | | database | string | 是 | | | stable | string | 是 | - | | timezone | string | 否 | UTC | | write_columns | list | 否 | - | ### url [string] TDengine的url 例如 ``` jdbc:TAOS-RS://localhost:6041/ ``` ### username [string] TDengine的用户名 ### password [string] TDengine的密码 ### 
database [string] TDengine的数据库 ### stable [string] TDengine的超级表 ### timezone [string] TDengine服务器的时区,对ts字段很重要 ### write_columns [list] TDengine的写入列,默认为所有列。无需包含 TAGS 字段,插件会自动处理 TAGS 字段的写入。 ## 示例 ### sink ```hocon sink { TDengine { url : "jdbc:TAOS-RS://localhost:6041/" username : "root" password : "taosdata" database : "power2" stable : "meters2" timezone: UTC write_columns: ["ts", "voltage", "current", "power"] } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Tablestore.md ================================================ import ChangeLog from '../changelog/connector-tablestore.md'; # Tablestore > Tablestore 数据接收器 ## 描述 用于将数据写入 Tablestore ## 主要特性 - [ ] [exactly-once](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 是否必填 | 默认值 | |-------------------|--------|----------|---------------| | end_point | string | 是 | - | | instance_name | string | 是 | - | | access_key_id | string | 是 | - | | access_key_secret | string | 是 | - | | table | string | 是 | - | | primary_keys | array | 是 | - | | batch_size | string | 否 | 25 | | common-options | config | 否 | - | ### end_point [string] endPoint 用于写入Tablestore。 ### instance_name [string] Tablestore 的实例名称。 ### access_key_id [string] Tablestore 访问的id。 ### access_key_secret [string] Tablestore 访问的密钥。 ### table [string] Tablestore的表。 ### primary_keys [array] Tablestore 的主键。 ### common 选项 [ config ] Sink插件常用参数,请参考[Sink common Options](../common-options/sink-common-options.md)了解详细信息。 ## 示例 ```bash Tablestore { end_point = "xxxx" instance_name = "xxxx" access_key_id = "xxxx" access_key_secret = "xxxx" table = "sink" primary_keys = ["pk_1","pk_2","pk_3","pk_4"] } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Typesense.md ================================================ import ChangeLog from '../changelog/connector-typesense.md'; # Typesense ## 描述 输出数据到 `Typesense` ## 主要特性 - [ ] 
[精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [cdc](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 是否必须 | 默认值 | |------------------|--------|------|------------------------------| | hosts | array | 是 | - | | collection | string | 是 | - | | schema_save_mode | string | 是 | CREATE_SCHEMA_WHEN_NOT_EXIST | | data_save_mode | string | 是 | APPEND_DATA | | primary_keys | array | 否 | | | key_delimiter | string | 否 | `_` | | api_key | string | 否 | | | max_retry_count | int | 否 | 3 | | max_batch_size | int | 否 | 10 | | common-options | | 否 | - | ### hosts [array] Typesense的访问地址,格式为 `host:port`,例如:["typesense-01:8108"] ### collection [string] 要写入的集合名,例如:“seatunnel” ### primary_keys [array] 主键字段用于生成文档 `id`。 ### key_delimiter [string] 设定复合键的分隔符(默认为 `_`)。 ### api_key [config] typesense 安全认证的 api_key。 ### max_retry_count [int] 批次批量请求最大尝试大小 ### max_batch_size [int] 批次批量文档最大大小 ### common options Sink插件常用参数,请参考 [Sink常用选项](../common-options/sink-common-options.md) 了解详情 ### schema_save_mode 在启动同步任务之前,针对目标侧已有的表结构选择不同的处理方案
    选项介绍:
    `RECREATE_SCHEMA` :当表不存在时会创建,当表已存在时会删除并重建
    `CREATE_SCHEMA_WHEN_NOT_EXIST` :当表不存在时会创建,当表已存在时则跳过创建
    `ERROR_WHEN_SCHEMA_NOT_EXIST` :当表不存在时将抛出错误
    ### data_save_mode 在启动同步任务之前,针对目标侧已存在的数据选择不同的处理方案
    选项介绍:
    `DROP_DATA`: 保留数据库结构,删除数据
    `APPEND_DATA`:保留数据库结构,保留数据
    `ERROR_WHEN_DATA_EXISTS`:当有数据时抛出错误
    ## 示例 简单示例 ```bash sink { Typesense { plugin_input = "typesense_test_table" hosts = ["localhost:8108"] collection = "typesense_to_typesense_sink_with_query" max_retry_count = 3 max_batch_size = 10 api_key = "xyz" primary_keys = ["num_employees","id"] key_delimiter = "=" schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/sink/Vertica.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Vertica > JDBC Vertica Sink 连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 通过 JDBC 写入数据。支持批处理和流处理模式,支持并发写入,支持精确一次语义(使用 XA 事务保证)。 ## 使用依赖 ### 对于 Spark/Flink 引擎 > 1. 需要确保 [jdbc 驱动 jar 包](https://www.vertica.com/download/vertica/client-drivers/) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 ### 对于 SeaTunnel Zeta 引擎 > 1. 需要确保 [jdbc 驱动 jar 包](https://www.vertica.com/download/vertica/client-drivers/) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 ## 主要特性 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [cdc](../../introduction/concepts/connector-v2-features.md) > 使用 `Xa 事务` 来保证 `精确一次`。因此仅支持支持 `Xa 事务` 的数据库。可以通过设置 `is_exactly_once=true` 来启用。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动类名 | URL 格式 | Maven 依赖 | |-----------|--------------------------------|------------------------------|--------------------------------------|---------------------------------------------------------------------------------------------| | Vertica | 不同依赖版本有不同的驱动类名 | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433/vertica | [下载](https://www.vertica.com/download/vertica/client-drivers/) | ## 数据库依赖 > 请下载支持列表中对应的 'Maven' 依赖,并将其复制到 `$SEATUNNEL_HOME/plugins/jdbc/lib/` 工作目录中。
    > 例如 Vertica 数据源:`cp vertica-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/` ## 数据类型映射 | Vertica 数据类型 | SeaTunnel 数据类型 | |------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------| | BIT(1)
    INT UNSIGNED | BOOLEAN | | TINYINT
    TINYINT UNSIGNED
    SMALLINT
    SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | | INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | BIGINT | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(x,y)(获取指定列的列大小 <38) | DECIMAL(x,y) | | DECIMAL(x,y)(获取指定列的列大小 >38) | DECIMAL(38,18) | | DECIMAL UNSIGNED | DECIMAL((获取指定列的列大小)+1,
    (获取指定列的小数点右侧的位数)) | | FLOAT
    FLOAT UNSIGNED | FLOAT | | DOUBLE
    DOUBLE UNSIGNED | DOUBLE | | CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    JSON | STRING | | DATE | DATE | | TIME | TIME | | DATETIME
    TIMESTAMP | TIMESTAMP | | TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    BINARY
    VARBINARY
    BIT(n) | BYTES | | GEOMETRY
    UNKNOWN | 尚未支持 | ## 接收器选项 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |------------------------------|---------|----------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC 连接的 URL。参考示例:`jdbc:vertica://localhost:5433/vertica` | | driver | String | 是 | - | 用于连接远程数据源的 JDBC 类名,如果使用 Vertica,值为 `com.vertica.jdbc.Driver`。 | | username | String | 否 | - | 连接实例的用户名 | | password | String | 否 | - | 连接实例的密码 | | query | String | 否 | - | 使用此 SQL 将上游输入数据写入数据库。例如 `INSERT ...`,`query` 优先级更高。 | | database | String | 否 | - | 使用此 `database` 和 `table-name` 自动生成 SQL 并接收上游输入数据写入数据库。此选项与 `query` 互斥,且优先级更高。 | | table | String | 否 | - | 使用 `database` 和此 `table-name` 自动生成 SQL 并接收上游输入数据写入数据库。此选项与 `query` 互斥,且优先级更高。 | | primary_keys | Array | 否 | - | 此选项用于在自动生成 SQL 时支持 `insert`、`delete` 和 `update` 等操作。 | | connection_check_timeout_sec | Int | 否 | 30 | 用于验证连接完成的数据库操作的等待时间(秒)。 | | max_retries | Int | 否 | 0 | 提交失败(executeBatch)的重试次数。 | | batch_size | Int | 否 | 1000 | 对于批量写入,当缓冲的记录数达到 `batch_size` 或时间达到 `checkpoint.interval` 时,数据将被刷新到数据库中。 | | is_exactly_once | Boolean | 否 | false | 是否启用精确一次语义,将使用 Xa 事务。如果启用,需要设置 `xa_data_source_class_name`。 | | generate_sink_sql | Boolean | 否 | false | 根据要写入的数据库表生成 SQL 语句。 | | xa_data_source_class_name | String | 否 | - | 数据库驱动的 XA 数据源类名,例如 Vertica 为 `com.vertical.cj.jdbc.VerticalXADataSource`,其他数据源请参考附录。 | | max_commit_attempts | Int | 否 | 3 | 事务提交失败的重试次数。 | | transaction_timeout_sec | Int | 否 | -1 | 事务打开后的超时时间,默认为 -1(永不超时)。注意:设置超时可能会影响精确一次语义。 | | auto_commit | Boolean | 否 | true | 默认启用自动事务提交。 | | properties | Map | 否 | - | 额外的连接配置参数,当 properties 和 URL 中有相同的参数时,优先级由驱动的具体实现决定。例如,在 MySQL 中,properties 优先于 URL。 | | common-options | | 否 | - | 接收器插件通用参数,详情请参考 [Sink Common Options](../common-options/sink-common-options.md)。 | | enable_upsert | Boolean | 否 | true | 通过主键存在启用 
upsert。如果任务中没有键重复数据,将此参数设置为 `false` 可以加快数据导入速度。 | ### 提示 > 如果未设置 `partition_column`,将以单并发运行;如果设置了 `partition_column`,将根据任务的并发度并行执行。 ## 任务示例 ### 简单示例 > 此示例定义了一个 SeaTunnel 同步任务,通过 FakeSource 自动生成数据并发送到 JDBC Sink。FakeSource 总共生成 16 行数据(row.num=16),每行有两个字段,name(字符串类型)和 age(int 类型)。最终目标表 test_table 中也将有 16 行数据。在运行此任务之前,您需要在 Vertica 中创建数据库 test 和表 test_table。如果您尚未安装和部署 SeaTunnel,请按照 [安装 SeaTunnel](../../getting-started/locally/deployment.md) 中的说明进行安装和部署。然后按照 [使用 SeaTunnel Engine 快速开始](../../getting-started/locally/quick-start-seatunnel-engine.md) 中的说明运行此任务。 ``` # 定义运行时环境 env { parallelism = 1 job.mode = "BATCH" } source { # 这是一个示例源插件,**仅用于测试和演示功能** FakeSource { parallelism = 1 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } # 如果想了解更多关于如何配置 SeaTunnel 的信息,并查看完整的源插件列表, # 请访问 https://seatunnel.apache.org/docs/connectors/source } transform { # 如果想了解更多关于如何配置 SeaTunnel 的信息,并查看完整的转换插件列表, # 请访问 https://seatunnel.apache.org/docs/transforms } sink { jdbc { url = "jdbc:vertica://localhost:5433/vertica" driver = "com.vertica.jdbc.Driver" username = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" } # 如果想了解更多关于如何配置 SeaTunnel 的信息,并查看完整的接收器插件列表, # 请访问 https://seatunnel.apache.org/docs/connectors/sink } ``` ### 生成接收器 SQL > 此示例不需要编写复杂的 SQL 语句,您可以通过配置数据库名称和表名称自动生成插入语句。 ``` sink { jdbc { url = "jdbc:vertica://localhost:5433/vertica" driver = "com.vertica.jdbc.Driver" username = "root" password = "123456" # 根据数据库表名自动生成 SQL 语句 generate_sink_sql = true database = test table = test_table } } ``` ### 精确一次 > 对于精确写入场景,我们保证精确一次语义。 ``` sink { jdbc { url = "jdbc:vertica://localhost:5433/vertica" driver = "com.vertica.jdbc.Driver" max_retries = 0 username = "root" password = "123456" query = "insert into test_table(name,age) values(?,?)" is_exactly_once = "true" xa_data_source_class_name = "com.vertical.cj.jdbc.VerticalXADataSource" } } ``` ## 变更日志 ================================================ FILE: 
docs/zh/connectors/source/Airtable.md ================================================ import ChangeLog from '../changelog/connector-http-airtable.md'; # Airtable > Airtable 源连接器 ## 描述 用于从 Airtable 读取数据。 ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 选项 | 参数名 | 类型 | 必须 | 默认值 | |--------|------|------|--------| | token | String | 是 | - | | base_id | String | 是 | - | | table | String | 是 | - | | api_base_url | String | 否 | https://api.airtable.com | | view | String | 否 | - | | fields | List | 否 | - | | filter_by_formula | String | 否 | - | | max_records | int | 否 | - | | page_size | int | 否 | - | | sort | String | 否 | - | | cell_format | String | 否 | - | | return_fields_by_field_id | boolean | 否 | - | | record_metadata | List | 否 | - | | time_zone | String | 否 | - | | user_locale | String | 否 | - | | request_interval_ms | int | 否 | 220 | | rate_limit_backoff_ms | int | 否 | 30000 | | rate_limit_max_retries | int | 否 | 3 | | schema | Config | 否 | - | | schema.fields | Config | 否 | - | | format | String | 否 | text | | content_field | String | 否 | - | | json_field | Config | 否 | - | | common-options | config | 否 | - | ### token [String] Airtable 个人访问令牌。可在 https://airtable.com/create/tokens 创建。 ### base_id [String] Airtable Base ID(以 `app` 开头)。 ### table [String] 要读取的表名或表 ID。 ### api_base_url [String] Airtable API 基础 URL,默认 `https://api.airtable.com`。 ### view [String] 视图名称或 ID,仅返回该视图中可见的记录。 ### fields [List] 要包含在响应中的字段名列表。 ### filter_by_formula [String] Airtable 公式表达式,用于过滤记录。参考 [Airtable 公式文档](https://support.airtable.com/docs/formula-field-reference)。 ### max_records [int] 返回的最大记录总数。 ### page_size [int] 
每页记录数(1-100)。 ### sort [String] 排序定义 JSON 数组,例如 `[{"field":"Name","direction":"asc"}]`。 ### cell_format [String] 单元格值格式,`json` 或 `string`。 ### return_fields_by_field_id [boolean] 如果为 true,响应中的字段键将使用字段 ID 而非字段名。 ### record_metadata [List] 要返回的额外记录元数据,例如 `["commentCount"]`。 ### time_zone [String] 用于格式化日期/时间值的时区。 ### user_locale [String] 用于格式化值的用户区域设置。 ### request_interval_ms [int] API 请求之间的最小间隔(毫秒),默认 220ms(以保持在 Airtable 每秒 5 次请求的限制内)。 ### rate_limit_backoff_ms [int] 收到 429(限流)响应时的基础退避时间(毫秒),默认 30000ms。 ### rate_limit_max_retries [int] 收到 429 响应后的最大重试次数,默认 3。 ### schema [Config] #### fields [Config] 上游数据的模式字段。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 ### format [String] 上游数据的格式,支持 `json` 和 `text`,默认 `text`。 ### content_field [String] 用于从响应中提取数据的 JsonPath 表达式。对于 Airtable,通常使用 `$.records[*].fields` 来提取每条记录的字段。 ### json_field [Config] 此参数帮助您配置模式,必须与 schema 一起使用。 ### common options 源插件通用参数,请参考 [Source Common Options](../common-options/source-common-options.md)。 ## 示例 读取 Airtable 表并输出原始文本: ```hocon source { Airtable { token = "patXXXXXXXX.XXXXXXXX" base_id = "appXXXXXXXX" table = "Shipments" format = "text" max_records = 10 } } ``` 指定 schema 并提取记录字段: ```hocon source { Airtable { token = "patXXXXXXXX.XXXXXXXX" base_id = "appXXXXXXXX" table = "Shipments" content_field = "$.records[*].fields" filter_by_formula = "{Status} = 'Shipped'" schema = { fields { Name = string Status = string Weight = float } } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/AmazonDynamoDB.md ================================================ import ChangeLog from '../changelog/connector-amazondynamodb.md'; # AmazonDynamoDB > AmazonDynamoDB 源连接器 ## 描述 从 Amazon DynamoDB 读取数据. 
## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必需 | 默认值 | |-----------------------|--------|-------|---------------| | url | string | 是 | - | | region | string | 是 | - | | access_key_id | string | 是 | - | | secret_access_key | string | 是 | - | | table | string | 是 | - | | schema | config | 是 | - | | common-options | | 是 | - | | scan_item_limit | | 否 | - | | parallel_scan_threads | | 否 | - | ### url [string] 读取Amazon Dynamodb的URL. ### region [string] Amazon DynamoDB 的分区. ### access_key_id [string] Amazon DynamoDB的访问id. ### secret_access_key [string] Amazon DynamoDB的访问密钥. ### table [string] Amazon DynamoDB 的表名. ### schema [Config] #### fields [config] Amazon Dynamodb是一个支持键值存储和文档数据结构的NOSQL数据库服务,无法获取数据类型。因此,我们必须配置模式。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 例如: ``` schema { fields { id = int key_aa = string key_bb = string } } ``` ### common options 源插件常用参数,详见 [Source Plugin](../common-options/source-common-options.md) ### scan_item_limit 每个扫描请求应返回的项目数 ### parallel_scan_threads 并行扫描的逻辑段数 ## 例子 ```bash Amazondynamodb { url = "http://127.0.0.1:8000" region = "us-east-1" access_key_id = "dummy-key" secret_access_key = "dummy-secret" table = "TableName" schema = { fields { artist = string c_map = "map>" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/AmazonSqs.md 
================================================ import ChangeLog from '../changelog/connector-amazonsqs.md'; # AmazonSqs > AmazonSqs 源连接器 ## 支持以下引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) ## 描述 从 Amazon SQS 读取数据. ## 源选项 | 名称 | 类型 | 必需 | 默认值 | 描述 | |-------------------------|--------|----|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | 从 Amazon SQS 读取的队列 URL. | | region | String | 否 | - | SQS 服务的 AWS 区域 | | schema | Config | 否 | - | 数据的结构,包括字段名和字段类型。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | format | String | 否 | json | 数据格式。默认格式为 json,可选 text、canal-json 和 debezium-json 格式。如果使用 json 或 text 格式,默认字段分隔符为 ", ". 如果自定义分隔符,请添加"field_delimiter"选项。如果使用 canal 格式,详见[canal-json](../formats/canal-json.md).如果使用 debezium 格式,详见[debezium-json](../formats/debezium-json.md). | | format_error_handle_way | String | 否 | fail | 数据格式错误的处理方法. 默认值为fail,可选值为(fail,skip). 当选择 fail 时,遇到数据格式错误将中止任务并引发异常. 当选择 skip 时,将跳过格式错误的这一行数据. | | field_delimiter | String | 否 | , | 自定义数据格式的字段分隔符. 
| | common-options | | 否 | - | 源插件常用参数, 详见 [源通用选项](../common-options/source-common-options.md) | ## 任务示例 ```bash source { AmazonSqs { url = "http://127.0.0.1:4566" region = "us-east-1" format = text field_delimiter = "#" schema = { fields { artist = string c_map = "map>" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } transform { # 如果你想了解更多关于如何配置seatunnel的信息,并查看转换插件的完整列表, # 请前往 https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Cassandra.md ================================================ import ChangeLog from '../changelog/connector-cassandra.md'; # Cassandra > Cassandra 源连接器 ## 描述 从 Apache Cassandra 读取数据. ## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必需 | 默认值 | |-------------------|--------|----|---------------| | host | String | 是 | - | | keyspace | String | 是 | - | | cql | String | 是 | - | | username | String | 否 | - | | password | String | 否 | - | | datacenter | String | 否 | datacenter1 | | consistency_level | String | 否 | LOCAL_ONE | ### host [string] `Cassandra` 的集群地址, 格式为 `host:port` , 允许指定多个 `hosts` . 例如 `"cassandra1:9042,cassandra2:9042"`. ### keyspace [string] `Cassandra` 的键空间. ### cql [String] 查询cql,用于通过Cassandra会话搜索数据. ### username [string] `Cassandra` 用户的用户名. ### password [string] `Cassandra` 用户的密码. 
### datacenter [String] `Cassandra` 数据中心, 默认为 `datacenter1`. ### consistency_level [String] `Cassandra` 的写入一致性级别, 默认为 `LOCAL_ONE`. ## 示例 ```hocon source { Cassandra { host = "localhost:9042" username = "cassandra" password = "cassandra" datacenter = "datacenter1" keyspace = "test" cql = "select * from source_table" plugin_output = "source_table" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Clickhouse.md ================================================ import ChangeLog from '../changelog/connector-clickhouse.md'; # Clickhouse > Clickhouse source 连接器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 核心特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列映射](../../introduction/concepts/connector-v2-features.md) - [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义拆分](../../introduction/concepts/connector-v2-features.md) - [x] [支持多表读](../../introduction/concepts/connector-v2-features.md) > 支持查询SQL,可以实现投影效果。 ## 描述 用于从Clickhouse读取数据。 ## 支持的数据源信息 为了使用 Clickhouse 连接器,需要以下依赖项。它们可以通过 install-plugin.sh 或从 Maven 中央存储库下载。 | 数据源 | 支持的版本 | 依赖 | |------------|--------------------|------------------------------------------------------------------------------------------| | Clickhouse | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-clickhouse) | ## 数据类型映射 | Clickhouse 数据类型 | SeaTunnel 数据类型 | |-----------------------------------------------------------------------------------------------------------------------------------------------|---------------------| | String / Int128 / UInt128 / Int256 / UInt256 / Point / Ring / Polygon MultiPolygon | STRING | | Int8 / UInt8 / Int16 / UInt16 / Int32 | INT | | UInt64 / Int64 / IntervalYear / IntervalQuarter / IntervalMonth / IntervalWeek / IntervalDay / IntervalHour / IntervalMinute / IntervalSecond | BIGINT | | Float64 | DOUBLE | | Decimal | DECIMAL | | Float32 | FLOAT | | Date | DATE | | DateTime | TIME | | Array | ARRAY | | Map | MAP | ## Source 选项 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |-------------------|--------|----------|------------------------|-----------------------------------------------------------------------------------| | host | String | 是 | - | `ClickHouse` 集群地址, 格式是`host:port` , 允许多个`hosts`配置. 例如 `"host1:8123,host2:8123"` . | | username | String | 是 | - | `ClickHouse` user 用户账号. | | password | String | 是 | - | `ClickHouse` user 用户密码. | | table_list | Array | NO | - | 要读取的数据表列表,支持配置多表. 
| | clickhouse.config | Map | 否 | - | 除了上述必须由 `clickhouse-jdbc` 指定的必填参数外,用户还可以指定多个可选参数,这些参数涵盖了 `clickhouse-jdbc` 提供的所有[参数](https://github.com/ClickHouse/clickhouse-jdbc/tree/master/clickhouse-client#configuration). | | server_time_zone | String | 否 | ZoneId.systemDefault() | 数据库服务中的会话时区。如果未设置,则使用ZoneId.systemDefault()设置服务时区. | | common-options | | 否 | - | 源插件常用参数,详见 [源通用选项](../common-options/source-common-options.md). | 多表配置: | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |----------------|--------|------|------|--------------------------------------------------------------------------------------| | table_path | String | 否 | - | 数据表的完整路径, 例如: `default.table`. | | sql | String | 否 | - | 用于通过Clickhouse服务搜索数据的查询sql. | | filter_query | String | 否 | - | 数据过滤条件. 格式为: "field = value", 例如 : filter_query = "id > 2 and type = 1" | | partition_list | Array | 否 | - | 指定分区列表过滤数据. 如果是分区表,该字段可以配置为过滤指定分区的数据。. 例如: partition_list = ["20250615", "20250616"] | | batch_size | int | 否 | 1024 | 从Clickhouse读取一次可以获得的最大数据行数。 | 注意: 当此配置对应于单个表时,您可以将table_list中的配置项展平到外层。 ## 并行读取 Clickhouse源连接器支持并行读取数据。 当仅指定`table_path`参数时,连接器根据从`system.parts`系统表中获取的数据表的part文件实现并行读取。 当仅指定`sql`参数时,连接器在集群的每个分片上基于本地表执行查询来实现并发读取。如果`sql`参数指定了一个分布式表,则会根据分布式表引擎的集群名获取分片列表执行并发读取。如果`sql`指定了一个本地表,那么`host`参数配置的节点列表将被视作集群分片列表执行并发读取。 如果同时设置了`table_path`和`sql`参数,则将在sql模式下执行。推荐在指定`sql`参数时同时配置`table_path`参数以更好地识别表的元数据。 ## Tips 当指定`table_path`参数时,如果不想读取整个表,可以指定`partition_list`或`filter_query`参数过滤指定条件或分区的数据。 * `partition_list`: 过滤指定分区的数据 * `filter_query`: 根据指定条件对数据进行过滤 `batch_size`参数可用于控制每次查询读取的数据量,以避免在读取大量数据时出现OOM异常。适当增加这个值将有助于提高读取过程的性能。 当读取单个表的数据时,建议使用`table_path`参数替代`sql`参数。 ## 如何创建Clickhouse数据同步作业 ### 单表配置 下面的示例演示了如何创建一个数据同步作业,该作业从Clickhouse读取数据并在本地客户端上打印数据 **案例1:基于part文件读取策略的并行读取** ```hocon env { job.mode = "BATCH" parallelism = 5 } source { Clickhouse { host = "localhost:8123" username = "xxx" password = "xxx" table_path = "default.table" server_time_zone = "UTC" partition_list = ["20250615", "20250616"] filter_query = "id > 2 and type = 1" 
batch_size = 1024 clickhouse.config = { "socket_timeout": "300000" } } } # Console printing of the read Clickhouse data sink { Console { parallelism = 1 } } ``` **案例2:基于SQL读取策略的并行读取** > 注意:SQL模式下的并行读取方式目前仅支持单表和where条件查询 ```hocon env { job.mode = "BATCH" parallelism = 5 } source { Clickhouse { host = "localhost:8123" username = "xxx" password = "xxx" table_path = "default.table" server_time_zone = "UTC" sql = "select * from default.table where id > 2 and type = 1" batch_size = 1024 clickhouse.config = { "socket_timeout": "300000" } } } # Console printing of the read Clickhouse data sink { Console { parallelism = 1 } } ``` **案例3:针对复杂SQL场景的单并发读取** 当执行复杂SQL查询场景(例如带有join、group by、子查询等的查询)时,连接器将自动切换到单并发执行方式,即使配置了更高的并行度值。 ```hocon env { job.mode = "BATCH" parallelism = 1 } source { Clickhouse { host = "localhost:8123" username = "xxx" password = "xxx" server_time_zone = "UTC" sql = "select t1.id, t2.category from default.table1 t1 global join default.table2 t2 on t1.id = t2.id where t1.age > 18" batch_size = 1024 clickhouse.config = { "socket_timeout": "300000" } } } # Console printing of the read Clickhouse data sink { Console { parallelism = 1 } } ``` ### 多表配置 ```hocon env { job.mode = "BATCH" parallelism = 5 } source { Clickhouse { host = "localhost:8123" username = "xxx" password = "xxx" table_list = [ { table_path = "default.table1" sql = "select * from default.table1 where id > 2 and type = 1" }, { table_path = "default.table2" sql = "select * from default.table2 where age > 18" } ] server_time_zone = "UTC" clickhouse.config = { "socket_timeout": "300000" } } } # Console printing of the read Clickhouse data sink { Console { parallelism = 1 } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Cloudberry.md ================================================ import ChangeLog from '../changelog/connector-cloudberry.md'; # Cloudberry > JDBC Cloudberry源连接器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 使用依赖关系 ### 适用于 Spark/Flink 引擎 > 1. 您需要确保[jdbc驱动程序jar包](https://mvnrepository.com/artifact/org.postgresql/postgresql)已放置在目录`${SEATUNNEL_HOME}/plugins/`中。 ### 适用于 SeaTunnel Zeta 引擎 > 1. 您需要确保[jdbc驱动程序jar包](https://mvnrepository.com/artifact/org.postgresql/postgresql)已放置在目录`${SEATUNNEL_HOME}/lib/`中。 ## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列映射](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户自定义拆分](../../introduction/concepts/connector-v2-features.md) > 支持查询SQL,可以实现映射效果。 ## 描述 通过 JDBC 读取外部数据源的数据。Cloudberry 暂未提供原生 JDBC 的驱动,需使用 PostgreSQL的 驱动程序和实现。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动程序 | URL | Maven | | :--------- | :----------------------- | :---------------------- | :-------------------------------------- | :----------------------------------------------------------- | | Cloudberry | 使用 PostgreSQL 驱动实现 | `org.postgresql.Driver` | `jdbc:postgresql://localhost:5432/test` | [下载](https://mvnrepository.com/artifact/org.postgresql/postgresql) | ## 数据库相关性 > 请下载PostgreSQL驱动程序的jar包,并将其复制到`${SEATUNNEL_HOME}/plugins/jdbc/lib/`工作目录下。
    > 例如:`cp postgresql-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/` ## 数据类型映射 Cloudberry 使用 PostgreSQL 的数据类型实现。有关数据类型的兼容性和映射关系,请参考 PostgreSQL 文档。 ## 配置项 Cloudberry 连接器使用与 PostgreSQL 相同的配置项。有关详细的配置选项,请参考 PostgreSQL 连接器文档。 关键配置项包括: - url (必需): JDBC 连接 URL。 - driver (必需): 驱动程序类名 (org.postgresql.Driver)。 - user/password: 认证凭据。 - query or table_path: 要读取的数据。 - 用于并行读取的分区选项。 ## 并行读取 Cloudberry 支持与 PostgreSQL 连接器相同的并行读取规则。有关切片策略和并行读取选项的详细信息,请参考 PostgreSQL 连接器文档。 ## 任务示例 ### 简单 ```hocon env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" query = "select * from mytable limit 100" } } sink { Console {} } ``` ### 使用 table_path 进行并行读取 ```hocon env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" table_path = "public.mytable" split.size = 10000 } } sink { Console {} } ``` ### 读取多张表 ```hocon env { job.mode = "BATCH" parallelism = 4 } source { Jdbc { url = "jdbc:postgresql://localhost:5432/cloudberrydb" driver = "org.postgresql.Driver" user = "dbadmin" password = "password" "table_list" = [ { "table_path" = "public.table1" }, { "table_path" = "public.table2" } ] split.size = 10000 } } sink { Console {} } ``` 有关更详细的示例和配置,请参阅PostgreSQL连接器文档。 ## 变更日志 ================================================ FILE: docs/zh/connectors/source/CosFile.md ================================================ import ChangeLog from '../changelog/connector-file-cos.md'; # CosFile > CosFile source 连接器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 在pollNext调用中读取拆分的所有数据。读取的拆分内容将保存在快照中。 - [x] [列映射](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义拆分](../../introduction/concepts/connector-v2-features.md) - [x] 文件格式类型 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## 描述 从腾讯云Cos文件系统读取数据。 :::tip 如果你使用spark/flink,为了使用这个连接器,你必须确保你的spark/flink集群已经集成了hadoop。测试的hadoop版本是2.x 如果你使用SeaTunnel Engine,当你下载并安装SeaTunnel引擎时,它会自动集成hadoop jar。您可以在${SEATUNNEL_HOME}/lib下检查jar包以确认这一点. 要使用此连接器,您需要将hadoop-cos-{hadoop.version}-{version}.jar和cos_api-bundle-{version}.jar放置在${SEATUNNEL_HOME}/lib目录中,下载:[Hadoop-Cos-release](https://github.com/tencentyun/hadoop-cos/releases). 它只支持hadoop 2.6.5+版本和8.0.2+版本. 
::: ## 选项 | 名称 | 类型 | 必需 | 默认值 | |----------------------------|---------|----|---------------------| | path | string | 是 | - | | file_format_type | string | 是 | - | | bucket | string | 是 | - | | secret_id | string | 是 | - | | secret_key | string | 是 | - | | region | string | 是 | - | | read_columns | list | 是 | - | | delimiter/field_delimiter | string | 否 | \001 | | row_delimiter | string | 否 | \n | | parse_partition_from_path | boolean | 否 | true | | skip_header_row_number | long | 否 | 0 | | date_format | string | 否 | yyyy-MM-dd | | datetime_format | string | 否 | yyyy-MM-dd HH:mm:ss | | time_format | string | 否 | HH:mm:ss | | schema | config | 否 | - | | sheet_name | string | 否 | - | | xml_row_tag | string | 否 | - | | xml_use_attr_format | boolean | 否 | - | | csv_use_header_line | boolean | 否 | false | | file_filter_pattern | string | 否 | | | compress_codec | string | 否 | none | | archive_compress_codec | string | 否 | none | | encoding | string | 否 | UTF-8 | | binary_chunk_size | int | 否 | 1024 | | binary_complete_file_mode | boolean | 否 | false | | common-options | | 否 | - | | file_filter_modified_start | string | 否 | - | | file_filter_modified_end | string | 否 | - | | quote_char | string | 否 | " | | escape_char | string | 否 | - | ### path [string] 源文件路径。 ### file_format_type [string] 文件类型,支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` 如果您将文件类型设置为“json”,您还应该分配模式选项,告诉连接器如何将数据解析到所需的行。 例如: 上游数据如下: ```json {"code": 200, "data": "get success", "success": true} ``` 您还可以将多条数据保存在一个文件中,并按换行符拆分它们: ```json lines {"code": 200, "data": "get success", "success": true} {"code": 300, "data": "get failed", "success": false} ``` 您应该按如下方式设置schema架构: ```hocon schema { fields { code = int data = string success = boolean } } ``` 连接器将按如下方式生成数据: | code | data | success | |------|-------------|---------| | 200 | get success | true | 如果您将文件类型指定为“parquet” “orc”,则不需要模式选项,连接器可以自动找到上游数据的模式。 如果将文件类型指定为“text” “csv”,则可以选择是否指定schema架构信息。 例如,上游数据如下: ```text 
tyrantlucifer#26#male ``` 如果不指定数据schema模式,连接器将按如下方式处理上游数据: | content | |-----------------------| | tyrantlucifer#26#male | 如果指定数据模式,除了CSV文件类型外,还应指定“field_delimiter”选项 您应该按如下方式分配模式和分隔符: ```hocon field_delimiter = "#" schema { fields { name = string age = int gender = string } } ``` 连接器将按如下方式生成数据: | name | age | gender | |---------------|-----|--------| | tyrantlucifer | 26 | male | 如果将文件类型指定为“二进制”,SeaTunnel可以同步任何格式的文件, 例如压缩包、图片等。简而言之,任何文件都可以同步到目标位置。 如果您将文件类型指定为 `markdown`,SeaTunnel 可以解析 markdown 文件并提取结构化数据。 markdown 解析器提取各种元素,包括标题、段落、列表、代码块、表格等。 每个元素都转换为具有以下架构的行: - `element_id`:元素的唯一标识符 - `element_type`:元素类型(Heading、Paragraph、ListItem 等) - `heading_level`:标题级别(1-6,非标题元素为 null) - `text`:元素的文本内容 - `page_number`:页码(默认:1) - `position_index`:文档中的位置索引 - `parent_id`:父元素的 ID - `child_ids`:子元素 ID 的逗号分隔列表 注意:Markdown 格式仅支持读取,不支持写入。 根据此要求,您需要确保源端和目标端使用“二进制”格式进行文件同步同时。您可以在下面的示例中找到具体用法。 ### bucket [string] Cos文件系统的bucket地址,例如: `cos://tyrantlucifer-image-bed` ### secret_id [string] Cos文件系统的秘密id。 ### secret_key [string] Cos文件系统的密钥。 ### region [string] cos文件系统的region。 ### read_columns [list] 读取数据源的列的列表,用户可以使用它来实现字段映射。 ### delimiter/field_delimiter [string] **delimiter** 参数在2.3.5版本后将弃用,请改用**field_delimiter**。 仅当file_format为文本时才需要配置。 字段分隔符,用于告诉连接器如何对字段进行切片和切块 默认值“\001”,与配置单元的默认分隔符相同 ### row_delimiter [string] 仅在 file_format 为 text 时需要配置。 行分隔符,用于告诉连接器如何分割行。 默认 `\n`。 ### parse_partition_from_path [boolean] 控制是否从文件路径解析分区键和值 例如,如果从路径读取文件`cosn://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` 文件中的每个记录数据都将添加这两个字段: | name | age | |---------------|-----| | tyrantlucifer | 26 | 提示:**不要在schema选项中定义分区字段** ### skip_header_row_number [long] 跳过前几行,但仅限于txt和csv。 例如,设置如下: `skip_header_row_number = 2` 那么SeaTunnel将跳过源文件的前两行 ### date_format [string] 日期类型格式,用于告诉连接器如何将字符串转换为日期,支持以下格式: `yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` default `yyyy-MM-dd` ### datetime_format [string] Datetime类型格式,用于告诉连接器如何将字符串转换为日期时间,支持以下格式: `yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` 
`yyyyMMddHHmmss` default `yyyy-MM-dd HH:mm:ss` ### time_format [string] 时间类型格式,用于告诉连接器如何将字符串转换为时间,支持以下格式: `HH:mm:ss` `HH:mm:ss.SSS` default `HH:mm:ss` ### schema [config] 仅当file_format_type为文本、json、excel、xml或csv(或我们无法从元数据中读取模式的其他格式)时才需要配置。 #### fields [Config] 上游数据的schema。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 ### sheet_name [string] 仅当file_format为excel时才需要配置。 阅读工作簿的纸张。 ### xml_row_tag [string] 仅当file_format为xml时才需要配置。 指定XML文件中数据行的标记名称。 ### xml_use_attr_format [boolean] 仅当file_format为xml时才需要配置。 指定是否使用标记属性格式处理数据。 ### csv_use_header_line [boolean] 仅在文件格式为 csv 时可以选择配置。 是否使用标题行来解析文件, 标题行 与 RFC 4180 匹配 ### file_filter_pattern [string] 文件过滤模式,用于过滤文件。若只想根据文件名称筛选,则直接写文件名称的正则;若同时想根据文件目录进行过滤,则表达式以`path`起始。 该模式遵循标准正则表达式。详情请参阅https://en.wikipedia.org/wiki/Regular_expression. 有一些例子。 若`path`为`/data/seatunnel`,且文件结构示例: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv /data/seatunnel/20241012/logo.png ``` 匹配规则示例: **示例1**:*匹配所有.txt文件*,正则表达式: ``` .*.txt ``` 此示例匹配的结果为: ``` /data/seatunnel/20241001/report.txt ``` **示例2**:*匹配所有以abc*开头的文件,正则表达式: ``` abc.* ``` 此示例匹配的结果为: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **示例3**:*匹配20241007文件夹下所有以 abc 开头的文件,且第四个字符为 h 或 g*,正则表达式: ``` /data/seatunnel/20241007/abc[h,g].* ``` 此示例匹配的结果为: ``` /data/seatunnel/20241007/abch202410.csv ``` **示例4**:*匹配以202410开头的三级文件夹和以.csv*结尾的文件,正则表达式: ``` /data/seatunnel/202410\d*/.*.csv ``` 此示例匹配的结果为: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### compress_codec [string] 文件的压缩编解码器和支持的详细信息如下所示: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: 自动识别压缩类型,无需额外设置。 ### archive_compress_codec [string] 归档文件的压缩编解码器和支持的详细信息如下所示: | archive_compress_codec | file_format | archive_compress_suffix | 
|------------------------|--------------------|-------------------------| | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | | GZ | txt,json,excel,xml | .gz | | NONE | all | .* | 注意:gz压缩的excel文件需要压缩原始文件或指定文件后缀,如e2e.xls->e2e_test.xls.gz ### encoding [string] 仅当file_format_type为json、text、csv、xml时使用。 要读取的文件的编码。此参数将由`Charset.forName(encoding)`解析。 ### binary_chunk_size [int] 仅在 file_format_type 为 binary 时使用。 读取二进制文件的块大小(以字节为单位)。默认为 1024 字节。较大的值可能会提高大文件的性能,但会使用更多内存。 ### binary_complete_file_mode [boolean] 仅在 file_format_type 为 binary 时使用。 是否将完整文件作为单个块读取,而不是分割成块。启用时,整个文件内容将一次性读入内存。默认为 false。 ### file_filter_modified_start 按照最后修改时间过滤文件。 要过滤的开始时间(包括改时间),时间格式是:`yyyy-MM-dd HH:mm:ss`。 ### file_filter_modified_end 按照最后修改时间过滤文件。 要过滤的结束时间(不包括改时间),时间格式是:`yyyy-MM-dd HH:mm:ss`。 ### quote_char [string] 用于包裹 CSV 字段的单字符,可保证包含逗号、换行符或引号的字段被正确解析。 ### escape_char [string] 用于在 CSV 字段内转义引号或其他特殊字符,使其不会结束字段。 ### common options 源插件常用参数,详见[源端通用选项](../common-options/source-common-options.md)。 ## 例如 ```hocon CosFile { path = "/seatunnel/orc" bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" file_format_type = "orc" } ``` ```hocon CosFile { path = "/seatunnel/json" bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" file_format_type = "json" schema { fields { id = int name = string } } } ``` ### 传输二进制文件 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { CosFile { bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" path = "/seatunnel/read/binary/" file_format_type = "binary" binary_chunk_size = 2048 binary_complete_file_mode = false } } sink { // 您可以将本地文件传输到s3/hdfs/oss等。 CosFile { bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = 
"xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" path = "/seatunnel/read/binary2/" file_format_type = "binary" } } ``` ### Filter File ```hocon env { parallelism = 1 job.mode = "BATCH" } source { CosFile { bucket = "cosn://seatunnel-test-1259587829" secret_id = "xxxxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxxxx" region = "ap-chengdu" path = "/seatunnel/read/binary/" file_format_type = "binary" // file example abcD2024.csv file_filter_pattern = "abc[DX]*.*" } } sink { Console { } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/DB2.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # DB2 > JDBC DB2 Source连接器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 通过JDBC读取外部数据源数据。 ## 使用依赖关系 ### 适用于 Spark/Flink 引擎 > 1. 您需要确保[jdbc驱动程序jar包](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc)已放置在目录`${SEATUNNEL_HOME}/plugins/`中。 ### 适用于 SeaTunnel Zeta 引擎 > 1. 您需要确保[jdbc驱动程序jar包](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc)已放置在目录“${SEATUNNEL_HOME}/lib/”中。 ## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列映射](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户自定义拆分](../../introduction/concepts/connector-v2-features.md) > 支持查询SQL,可以实现映射效果。 ## 支持的数据源信息 | 数据源 | 支持版本 | 驱动 | Url | Maven | |------------|----------------------------------------------------------|--------------------------------|-----------------------------------|-----------------------------------------------------------------------| | DB2 | 不同的依赖版本有不同的驱动程序类。| com.ibm.db2.jdbc.app.DB2Driver | jdbc:db2://127.0.0.1:50000/dbname | [下载](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) | ## 数据库相关性 > 请下载“Maven”对应的支持列表,并将其复制到“$SEATUNNEL_HOME/plugins/jdbc/lib/”工作目录
    > 例如,DB2数据源:cp DB2-connector-java-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## 数据类型映射 | DB2数据类型 | SeaTunnel 数据类型 | |------------------------------------------------------------------------------------------------------|---------------------|---| | BOOLEAN | BOOLEAN | | SMALLINT | SHORT | | INT
    INTEGER
    | INTEGER | | BIGINT | LONG | | DECIMAL
    DEC
    NUMERIC
    NUM | DECIMAL(38,18) | | REAL | FLOAT | | FLOAT
    DOUBLE
    DOUBLE PRECISION
    DECFLOAT | DOUBLE | | CHAR
    VARCHAR
    LONG VARCHAR
    CLOB
    GRAPHIC
    VARGRAPHIC
    LONG VARGRAPHIC
    DBCLOB | STRING | | BLOB | BYTES | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | ROWID
    XML | Not supported yet | ## 源选项 | 名称 | 类型 | 必需 | 默认值 | 描述 | |------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC连接的URL。请参考案例:jdbc:db2://127.0.0.1:50000/dbname | | driver | String | 是 | - | 用于连接到远程数据源的jdbc类名,
    如果使用db2,则值为`com.ibm.db2.jdbc.app.DB2Driver`。 | | username | String | 否 | - | 连接实例用户名 | | password | String | 否 | - | 连接实例密码 | | query | String | 是 | - | 查询语句 | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(秒) | | partition_column | String | 否 | - | 并行分区的列名,只支持数值类型,只支持数字类型主键,只能配置一列。 | | partition_lower_bound | BigDecimal | 否 | - | 扫描的partition_column最小值,如果未设置,SeaTunnel将查询数据库获取最小值。 | | partition_upper_bound | BigDecimal | 否 | - | 扫描的partition_column最大值,如果没有设置,SeaTunnel将查询数据库获取最大值。 | | partition_num | Int | 否 | job parallelism | 分区计数的数量,只支持正整数。默认值是作业并行性 | | fetch_size | Int | 否 | 0 | 对于返回大量对象的查询,您可以配置查询中使用的行提取大小,通过减少满足选择条件所需的数据库请求次数来提高性能。0表示使用jdbc默认值。 | | properties | Map | 否 | - | 其他连接配置参数,当属性和URL具有相同的参数时,优先级由驱动程序的特定实现决定。例如,在MySQL中,属性优先于URL。 | | common-options | | 否 | - | source插件常用参数,详见[Source common Options](../common-options/source-common-options.md) | ### 小贴士 > 如果未设置partition_column,它将以单并发运行,如果设置了partition_column,它将根据任务的并发度并行执行。 ## 任务示例 ### 简单 > 此示例以单并行方式在您的测试“database”中查询类型容器(type_bin)'table'的16条数据。并查询其所有字段。您还可以指定要查询哪些字段以将最终输出到控制台。 ``` # 定义运行时环境 env { parallelism = 2 job.mode = "BATCH" } source{ Jdbc { url = "jdbc:db2://127.0.0.1:50000/dbname" driver = "com.ibm.db2.jdbc.app.DB2Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" query = "select * from table_xxx" } } transform { # 如果你想了解更多关于如何配置seatunnel的信息,并查看transform插件的完整列表, # 请前往 https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} } ``` ### 并行 > 并行读取您的查询表,利用您配置的分片字段以及分片数据。若您希望读取整个表,您可以采取此操作。 ``` source { Jdbc { url = "jdbc:db2://127.0.0.1:50000/dbname" driver = "com.ibm.db2.jdbc.app.DB2Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" # 根据需要定义查询逻辑 query = "select * from type_bin" # 并行分片读取字段 partition_column = "id" # 碎片数量 partition_num = 10 } } ``` ### 并行的同时指定边界 > 在查询的上下界范围内指定数据更为高效。根据您配置的上下边界读取数据源,效率更佳。 ``` source { Jdbc { url = "jdbc:db2://127.0.0.1:50000/dbname" driver = "com.ibm.db2.jdbc.app.DB2Driver" 
connection_check_timeout_sec = 100 username = "root" password = "123456" # 根据需求定义查询逻辑 query = "select * from type_bin" partition_column = "id" # 读取起始边界 partition_lower_bound = 1 # 读取结束边界 partition_upper_bound = 500 partition_num = 10 } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Databend.md ================================================ import ChangeLog from '../changelog/connector-databend.md'; # Databend > Databend 源连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要功能 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) - [ ] [支持多表读](../../introduction/concepts/connector-v2-features.md) ## 描述 用于从 Databend 读取数据的源连接器。 ## 依赖 ### 对于 Spark/Flink > 1. 你需要下载 [Databend JDBC driver jar package](https://github.com/databendlabs/databend-jdbc/) 并添加到目录 `${SEATUNNEL_HOME}/plugins/`. ### 对于 SeaTunnel Zeta > 1. 你需要下载 [Databend JDBC driver jar package](https://github.com/databendlabs/databend-jdbc/) 并添加到目录 `${SEATUNNEL_HOME}/lib/`. ## 支持的数据源信息 | 数据源 | 支持版本 | 驱动 | Url | Maven | |--------|----------|------|-----|-------| | Databend | 1.2.x 及以上版本 | - | - | - | ## 数据类型映射 | Databend 数据类型 | SeaTunnel 数据类型 | |-----------------|------------------| | BOOLEAN | BOOLEAN | | TINYINT | TINYINT | | SMALLINT | SMALLINT | | INT | INT | | BIGINT | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL | DECIMAL | | STRING | STRING | | VARCHAR | STRING | | CHAR | STRING | | TIMESTAMP | TIMESTAMP | | DATE | DATE | | TIME | TIME | | BINARY | BYTES | ## 源选项 基础配置: | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |------|------|----------|--------|------| | url | String | 是 | - | Databend JDBC 连接 URL | | username | String | 是 | - | Databend 数据库用户名 | | password | String | 是 | - | Databend 数据库密码 | | database | String | 否 | - | Databend 数据库名称,默认使用连接 URL 中指定的数据库名 | | table | String | 否 | - | Databend 表名称 | | query | String | 否 | - | Databend 查询语句,如果设置将覆盖 database 和 table 的设置 | | fetch_size | Integer | 否 | 0 | 一次从数据库中获取的记录数,设置为0使用JDBC驱动默认值 | | jdbc_config | Map | 否 | - | 额外的 JDBC 连接配置,如加载均衡策略等 | 表清单配置: | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |------|------|----------|--------|------| | database | String | 是 | - | 数据库名称 | | table | String | 是 | - | 表名称 | | query | String | 否 | - | 自定义查询语句 | | fetch_size | Integer | 否 | 0 | 一次从数据库中获取的记录数 | 注意: 当此配置对应于单个表时,您可以将 table_list 
中的配置项展平到外层。 ## 任务示例 ### 单表读取 ```hocon env { parallelism = 2 job.mode = "BATCH" } source { Databend { url = "jdbc:databend://localhost:8000" username = "root" password = "" database = "default" table = "users" } } sink { Console {} } ``` ### 使用自定义查询 ```hocon source { Databend { url = "jdbc:databend://localhost:8000" username = "root" password = "" query = "SELECT id, name, age FROM default.users WHERE age > 18" } } ``` ## 相关链接 - [Databend 官方网站](https://databend.rs/) - [Databend JDBC 驱动](https://github.com/databendlabs/databend-jdbc/) ## Changelog ================================================ FILE: docs/zh/connectors/source/Doris.md ================================================ import ChangeLog from '../changelog/connector-doris.md'; # Doris > Doris 源连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要功能 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) - [x] [支持多表读](../../introduction/concepts/connector-v2-features.md) ## 描述 用于 Apache Doris 的源连接器。 ## 依赖 ### 对于 Spark/Flink > 1. 你需要下载 [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 并添加到目录 `${SEATUNNEL_HOME}/plugins/`. ### 对于 SeaTunnel Zeta > 1. 你需要下载 [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 并添加到目录 `${SEATUNNEL_HOME}/lib/`. ## 支持的数据源信息 | 数据源 | 支持版本 | 驱动 | Url | Maven | |------------|--------------------------------------|--------|-----|-------| | Doris | 仅支持Doris2.0及以上版本. | - | - | - | ## 数据类型映射 | Doris 数据类型 | SeaTunnel 数据类型 | |--------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| | INT | INT | | TINYINT | TINYINT | | SMALLINT | SMALLINT | | BIGINT | BIGINT | | LARGEINT | STRING | | BOOLEAN | BOOLEAN | | DECIMAL | DECIMAL((Get the designated column's specified column size)+1,
    (Gets the designated column's number of digits to right of the decimal point.)) | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | CHAR
    VARCHAR
    STRING
    TEXT | STRING | | DATE | DATE | | DATETIME
    DATETIME(p) | TIMESTAMP | | ARRAY | ARRAY | ## 源选项 基础配置: | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |----------------------------------|--------|----------|------------|-----------------------------------------------------------------------------------------------------| | fenodes | string | yes | - | FE 地址, 格式:`"fe_host:fe_http_port"` | | username | string | yes | - | 用户名 | | password | string | yes | - | 密码 | | doris.request.retries | int | no | 3 | 请求Doris FE的重试次数 | | doris.request.read.timeout.ms | int | no | 30000 | | | doris.request.connect.timeout.ms | int | no | 30000 | | | query-port | string | no | 9030 | Doris查询端口 | | doris.request.query.timeout.s | int | no | 3600 | Doris扫描数据的超时时间,单位秒 | | table_list | string | 否 | - | 表清单 | 表清单配置: | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |----------------------------------|--------|----------|------------|-----------------------------------------------------------------------------------------------------| | database | string | yes | - | 数据库 | | table | string | yes | - | 表名 | | doris.read.field | string | no | - | 选择要读取的Doris表字段 | | doris.filter.query | string | no | - | 数据过滤. 
格式:"字段 = 值", 例如:doris.filter.query = "F_ID > 2" | | doris.batch.size | int | no | 1024 | 每次能够从BE中读取到的最大行数 | | doris.exec.mem.limit | long | no | 2147483648 | 单个be扫描请求可以使用的最大内存。默认内存为2G(2147483648) | 注意: 当此配置对应于单个表时,您可以将table_list中的配置项展平到外层。 ### 提示 > 不建议随意修改高级参数 ## 例子 ### 单表 > 这是一个从doris读取数据后,输出到控制台的例子: ``` env { parallelism = 2 job.mode = "BATCH" } source{ Doris { fenodes = "doris_e2e:8030" username = root password = "" database = "e2e_source" table = "doris_e2e_table" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform/sql } sink { Console {} } ``` 使用`doris.read.field`参数来选择需要读取的Doris表字段: ``` env { parallelism = 2 job.mode = "BATCH" } source{ Doris { fenodes = "doris_e2e:8030" username = root password = "" database = "e2e_source" table = "doris_e2e_table" doris.read.field = "F_ID,F_INT,F_BIGINT,F_TINYINT,F_SMALLINT" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform/sql } sink { Console {} } ``` 使用`doris.filter.query`来过滤数据,参数值将作为过滤条件直接传递到doris: ``` env { parallelism = 2 job.mode = "BATCH" } source{ Doris { fenodes = "doris_e2e:8030" username = root password = "" database = "e2e_source" table = "doris_e2e_table" doris.filter.query = "F_ID > 2" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transform/sql } sink { Console {} } ``` ### 多表 ``` env{ parallelism = 1 job.mode = "BATCH" } source{ Doris { fenodes = "xxxx:8030" username = root password = "" table_list = [ { database = "st_source_0" table = "doris_table_0" doris.read.field = "F_ID,F_INT,F_BIGINT,F_TINYINT" doris.filter.query = "F_ID >= 50" }, { database = "st_source_1" table = "doris_table_1" } ] } } 
transform {} sink{ Doris { fenodes = "xxxx:8030" schema_save_mode = "RECREATE_SCHEMA" username = root password = "" database = "st_sink" table = "${table_name}" sink.enable-2pc = "true" sink.label-prefix = "test_json" doris.config = { format="json" read_json_by_line="true" } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/DuckDB.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # DuckDB > JDBC DuckDB 源连接器 ## 描述 通过 JDBC 读取外部数据源数据。 ## 支持 DuckDB 版本 - 0.8.x/0.9.x/0.10.x/1.x ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 需要的依赖项 ### 对于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/org.duckdb/duckdb_jdbc) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 ### 对于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/org.duckdb/duckdb_jdbc) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 ## 主要功能 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户定义的拆分](../../introduction/concepts/connector-v2-features.md) > 支持 SQL 查询,并能实现列投影效果 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动器 | 网址 | Maven下载链接 | |--------|--------------------|-------------------------|----------------------------------|-----------------------------------------------------------------| | DuckDB | 不同的依赖版本具有不同的驱动程序类。 | org.duckdb.DuckDBDriver | jdbc:duckdb:/path/to/database.db | [下载](https://mvnrepository.com/artifact/org.duckdb/duckdb_jdbc) | ## 数据类型映射 | DuckDB 数据类型 | SeaTunnel 数据类型 | |----------------------------------------------------------|----------------| | BOOLEAN | BOOLEAN | | TINYINT | TINYINT | | UTINYINT
    SMALLINT | SMALLINT | | USMALLINT
    INTEGER | INT | | UINTEGER
    BIGINT | BIGINT | | UBIGINT | DECIMAL(20,0) | | HUGEINT | DECIMAL(38,0) | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DECIMAL(x,y)(获取指定列的指定列大小.<38) | DECIMAL(x,y) | | DECIMAL(x,y)(获取指定列的指定列大小.>38) | DECIMAL(38,18) | | VARCHAR
    CHAR
    TEXT
    JSON
    UUID
    INTERVAL | STRING | | DATE | DATE | | TIME | TIME | | TIMESTAMP
    TIMESTAMP WITH TIME ZONE | TIMESTAMP | | BLOB
    ARRAY
    STRUCT
    MAP | BYTES | ## 源选项 | 名称 | 类型 | 是否必需 | 默认值 | 描述 | |------------------------------|------------|------|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC 连接的 URL。参考案例:jdbc:duckdb:/path/to/database.db | | driver | String | 是 | - | 用于连接到远程数据源的 jdbc 类名,
    如果您使用 DuckDB,值为 `org.duckdb.DuckDBDriver`。 | | username | String | 否 | - | 连接实例用户名 | | password | String | 否 | - | 连接实例密码 | | query | String | 是 | - | 查询语句 | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(以秒为单位) | | partition_column | String | 否 | - | 并行度分区的列名,仅支持数字类型主键,并且只能配置一列。 | | partition_lower_bound | BigDecimal | 否 | - | 扫描的 partition_column 最小值,如果未设置,SeaTunnel 将查询数据库获取最小值。 | | partition_upper_bound | BigDecimal | 否 | - | 扫描的 partition_column 最大值,如果未设置,SeaTunnel 将查询数据库获取最大值。 | | partition_num | Int | 否 | job parallelism | 分区计数的数量,仅支持正整数。默认值为作业并行度 | | fetch_size | Int | 否 | 0 | 对于返回大量对象的查询,您可以配置
    查询中使用的行获取大小,通过
    减少满足选择条件所需的数据库命中次数来提高性能。
    零表示使用 jdbc 默认值。 | | properties | Map | 否 | - | 附加连接配置参数,当 properties 和 URL 具有相同参数时,优先级由
    驱动程序的具体实现确定。例如,在 DuckDB 中,properties 优先于 URL。 | | table_path | String | 否 | - | 表的完整路径,您可以使用此配置代替 `query`。
    示例:
    duckdb: "main.table1"
    | | table_list | Array | 否 | - | 要读取的表列表,您可以使用此配置代替 `table_path` 示例:```[{ table_path = "main.table1"}, {table_path = "main.table2", query = "select id, name from main.table2"}]``` | | where_condition | String | 否 | - | 所有表/查询的通用行过滤条件,必须以 `where` 开头。例如 `where id > 100` | | split.size | Int | 否 | 8096 | 表的拆分大小(行数),读取表时捕获的表被拆分为多个拆分。 | | common-options | | 否 | - | 源插件通用参数,详情请参考 [Source Common Options](../common-options/source-common-options.md) | ## 并行读取器 JDBC 源连接器支持从表中并行读取数据。SeaTunnel 将使用某些规则来拆分表中的数据,这些数据将交给读取器进行读取。读取器的数量由 `parallelism` 选项确定。 **拆分键规则:** 1. 如果 `partition_column` 不为空,它将用于计算拆分。该列必须在 **支持的拆分数据类型** 中。 2. 如果 `partition_column` 为空,seatunnel 将从表中读取模式并获取主键和唯一索引。如果主键和唯一索引中有多个列,将使用 **支持的拆分数据类型** 中的第一列来拆分数据。例如,表有主键(nn guid, name varchar),因为 `guid` 不在 **支持的拆分数据类型** 中,所以列 `name` 将用于拆分数据。 **支持的拆分数据类型:** * String * Number(int, bigint, decimal, ...) * Date ### 与拆分相关的选项 #### split.size 一个拆分中有多少行,读取表时捕获的表被拆分为多个拆分。 #### partition_column [string] 用于拆分数据的列名。 #### partition_upper_bound [BigDecimal] 扫描的 partition_column 最大值,如果未设置,SeaTunnel 将查询数据库获取最大值。 #### partition_lower_bound [BigDecimal] 扫描的 partition_column 最小值,如果未设置,SeaTunnel 将查询数据库获取最小值。 #### partition_num [int] > 不建议使用,正确的方法是通过 `split.size` 控制拆分数量 我们需要拆分成多少个拆分,仅支持正整数。默认值为作业并行度。 ## 提示 > 如果表无法拆分(例如,表没有主键或唯一索引,并且未设置 `partition_column`),它将以单一并发运行。 > > 使用 `table_path` 替换 `query` 进行单表读取。如果您需要读取多个表,请使用 `table_list`。 ## 任务示例 ### 简单 > 此示例在单个并行中查询测试数据库中的 'user_events' 表并查询其所有字段。您还可以指定要查询的字段以最终输出到控制台。 ``` # 定义运行时环境 env { parallelism = 4 job.mode = "BATCH" } source{ Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" connection_check_timeout_sec = 100 username = "duckdb" password = "" query = "select * from user_events limit 16" } } transform { # 如果您想了解更多关于如何配置 seatunnel 和查看转换插件的完整列表, # 请访问 https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} } ``` ### 通过 partition_column 并行 ``` env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver =
"org.duckdb.DuckDBDriver" connection_check_timeout_sec = 100 username = "duckdb" password = "" query = "select * from user_events" partition_column = "id" split.size = 10000 # 读取开始边界 #partition_lower_bound = ... # 读取结束边界 #partition_upper_bound = ... } } sink { Console {} } ``` ### 通过主键或唯一索引并行 > 配置 `table_path` 将开启自动拆分,您可以配置 `split.*` 来调整拆分策略 ``` env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" connection_check_timeout_sec = 100 username = "duckdb" password = "" table_path = "main.user_events" query = "select * from main.user_events" split.size = 10000 } } sink { Console {} } ``` ### 并行边界 > 指定查询的上下边界内的数据更高效,根据您配置的上下边界读取数据源更高效 ``` source { Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" connection_check_timeout_sec = 100 username = "duckdb" password = "" # 根据需要定义查询逻辑 query = "select * from user_events" partition_column = "id" # 读取开始边界 partition_lower_bound = 1 # 读取结束边界 partition_upper_bound = 500 partition_num = 10 properties { threads=4 memory_limit="4GB" } } } ``` ### 多表读取 ***配置 `table_list` 将开启自动拆分,您可以配置 `split.*` 来调整拆分策略*** ```hocon env { job.mode = "BATCH" parallelism = 4 } source { Jdbc { url = "jdbc:duckdb:/tmp/test.db" driver = "org.duckdb.DuckDBDriver" connection_check_timeout_sec = 100 username = "duckdb" password = "" table_list = [ { table_path = "main.table1" }, { table_path = "main.table2" # 使用查询过滤行和列 query = "select id, name from main.table2 where id > 100" } ] #where_condition= "where id > 100" #split.size = 8096 } } sink { Console {} } ``` ## Changelog ================================================ FILE: docs/zh/connectors/source/Easysearch.md ================================================ import ChangeLog from '../changelog/connector-easysearch.md'; # Easysearch > Easysearch 源连接器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 用于从INFINI Easysearch读取数据。 ## 使用依赖 > 依赖 [easysearch-client](https://central.sonatype.com/artifact/com.infinilabs/easysearch-client) ## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列映射](../../introduction/concepts/connector-v2-features.md) - [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义拆分](../../introduction/concepts/connector-v2-features.md) :::tip 支持的引擎 * 支持 [INFINI Easysearch](https://www.infini.com/download/?product=easysearch) 已发布的所有版本。 ::: ## 数据类型映射 | Easysearch 数据类型 | SeaTunnel 数据类型 | |-----------------------------|----------------------| | STRING
    KEYWORD
    TEXT | STRING | | BOOLEAN | BOOLEAN | | BYTE | BYTE | | SHORT | SHORT | | INTEGER | INT | | LONG | LONG | | FLOAT
    HALF_FLOAT | FLOAT | | DOUBLE | DOUBLE | | Date | LOCAL_DATE_TIME_TYPE | ### hosts [array] Easysearch集群http地址,格式为`host:port`,允许指定多个主机。例如`["host1:9200", "host2:9200"]`。 ### username [string] 安全用户名。 ### password [string] 安全密码。 ### index [string] Easysearch搜索索引名称,支持*模糊匹配。 ### source [array] 索引字段。 您可以通过指定字段`_id`来获取文档id。如果要将`_id`写入(sink)其他索引,由于Easysearch的限制,您需要为`_id`指定一个别名。 若不配置`source`,则必须配置`schema`。 ### query [json] Easysearch DSL. 您可以控制读取数据的范围。 ### scroll_time [String] Easysearch将为滚动请求保持搜索上下文活动的时间量。 ### scroll_size [int] 每次Easysearch滚动请求返回的最大命中数。 ### schema 数据的结构,包括字段名和字段类型。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 如果不配置schema,则必须配置`source`。 ### tls_verify_certificate [boolean] 为HTTPS端点启用证书验证 ### tls_verify_hostname [boolean] 为HTTPS端点启用主机名验证 ### tls_keystore_path [string] PEM或JKS密钥存储的路径。运行SeaTunnel的操作系统用户必须能够读取此文件。 ### tls_keystore_password [string] 指定密钥存储的密钥密码 ### tls_truststore_path [string] PEM或JKS信任存储的路径。运行SeaTunnel的操作系统用户必须能够读取此文件. ### tls_truststore_password [string] 指定的信任存储的密钥密码 ### common options Source插件常用参数,详见[Source common Options](../common-options/source-common-options.md) ## 示例 简单的例子 ```hocon Easysearch { hosts = ["localhost:9200"] index = "seatunnel-*" source = ["_id","name","age"] query = {"range":{"firstPacket":{"gte":1700407367588,"lte":1700407367588}}} } ``` 复杂的例子 ```hocon Easysearch { hosts = ["Easysearch:9200"] index = "st_index" schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(2, 1)" c_bytes = bytes c_date = date c_timestamp = timestamp } } query = {"range":{"firstPacket":{"gte":1700407367588,"lte":1700407367588}}} } ``` SSL (禁用证书验证) ```hocon source { Easysearch { hosts = ["https://localhost:9200"] username = "admin" password = "admin" tls_verify_certificate = false } } ``` SSL (禁用主机名验证) ```hocon source { Easysearch { hosts = ["https://localhost:9200"] username =
"admin" password = "admin" tls_verify_hostname = false } } ``` SSL (启用证书验证) ```hocon source { Easysearch { hosts = ["https://localhost:9200"] username = "admin" password = "admin" tls_keystore_path = "${your Easysearch home}/config/certs/http.p12" tls_keystore_password = "${your password}" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Elasticsearch.md ================================================ import ChangeLog from '../changelog/connector-elasticsearch.md'; # Elasticsearch > Elasticsearch source 连接器 ## 简介 支持读取 Elasticsearch2.x 版本和 8.x 版本之间的数据 ## Key features - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精准一次](../../introduction/concepts/connector-v2-features.md) - [x] [column projection](../../introduction/concepts/connector-v2-features.md) - [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义的分片](../../introduction/concepts/connector-v2-features.md) ## 配置参数选项 | 参数名称 | 类型 | 是否必须 | 默认值或者描述 | | ----------------------- | ------- | -------- |-------------------------------------| | hosts | 数组 | yes | - | | auth_type | string | no | basic | | username | string | no | - | | password | string | no | - | | auth.api_key_id | string | no | - | | auth.api_key | string | no | - | | auth.api_key_encoded | string | no | - | | index | string | No | 单索引同步配置,如果index_list没有配置,则必须配置index | | index_list | array | no | 用来定义多索引同步任务 | | source | array | no | - | | query | json | no | {"match_all": {}} | | search_type | enum | no | 查询类型,SQL 或 DSL,默认 DSL | | search_api_type | enum | no | 分页 API 类型,SCROLL 或 PIT,默认 SCROLL | | sql_query | json | no | SQL 查询语句,当 search_type 为 SQL 时必须 | | scroll_time | string | no | 1m | | scroll_size | int | no | 100 | | tls_verify_certificate | boolean | no | true | | tls_verify_hostname | boolean | no | true | | array_column | map | no | | | tls_keystore_path | string | no | - | | 
tls_keystore_password | string | no | - | | tls_truststore_path | string | no | - | | tls_truststore_password | string | no | - | | pit_keep_alive | long | no | 60000 (1 minute) | | pit_batch_size | int | no | 100 | | runtime_fields | array | no | - | | common-options | | no | - | ### hosts [array] Elasticsearch 集群的 HTTP 地址,格式为 `host:port`,允许指定多个主机。例如:`["host1:9200", "host2:9200"]`。 ## 认证 Elasticsearch 连接器支持多种认证方式,可根据集群的安全配置进行选择。 ### auth_type [enum] 指定认证方式,支持: - `basic`(默认):使用用户名 + 密码的 HTTP 基本认证 - `api_key`:使用 API Key 的 ID + key 认证 - `api_key_encoded`:使用 Base64 编码后的 API Key 认证 如果未指定,默认使用 `basic` 以兼容旧版本。 ### 基本认证 #### username [string] 基本认证的用户名(x-pack 用户名)。 #### password [string] 基本认证的密码(x-pack 密码)。 **示例:** ```hocon source { Elasticsearch { hosts = ["https://localhost:9200"] auth_type = "basic" username = "elastic" password = "your_password" index = "my_index" } } ``` ### API Key 认证 #### auth.api_key_id [string] Elasticsearch 生成的 API Key ID。 #### auth.api_key [string] Elasticsearch 生成的 API Key 密钥。 #### auth.api_key_encoded [string] `base64(id:api_key)` 形式的 Base64 编码 API Key,可替代单独提供 ID 与 key。 **注意:** `auth.api_key_id` + `auth.api_key` 与 `auth.api_key_encoded` 只能二选一。 **示例(分开配置 ID 和 key):** ```hocon source { Elasticsearch { hosts = ["https://localhost:9200"] auth_type = "api_key" auth.api_key_id = "your_api_key_id" auth.api_key = "your_api_key_secret" index = "my_index" } } ``` **示例(使用编码 key):** ```hocon source { Elasticsearch { hosts = ["https://localhost:9200"] auth_type = "api_key_encoded" auth.api_key_encoded = "eW91cl9hcGlfa2V5X2lkOnlvdXJfYXBpX2tleV9zZWNyZXQ=" index = "my_index" } } ``` ### index [string] Elasticsearch 索引名称,支持 * 模糊匹配。比如存在索引index1,index2,可以指定index*同时读取两个索引的数据。 ### source [array] 索引的字段 你可以通过指定字段 `_id` 来获取文档 ID。如果将 `_id` 写入到其他索引,由于 Elasticsearch 的限制,你需要为 `_id` 指定一个别名。 如果你没有配置 `source`,它将自动从索引的映射中获取。 ### array_column [map] 由于 Elasticsearch 中没有数组索引,因此需要指定数组类型。 假设tags和phones是数组类型: ```hocon array_column = {tags = "array",phones = "array"} ``` ### query 
[json] Elasticsearch 的原生查询语句,用于控制读取哪些数据写入到其他数据源。 ### scroll_time [String] `Seatunnel`底层会使用滚动查询来查询数据,所以需要使用这个参数控制搜索上下文的时间长度。 ### scroll_size [int] 滚动查询的最大文档数量。 ### index_list [array] `index_list` 用于定义多索引同步任务。它是一个数组,包含单表同步所需的参数,如 `query`、`source/schema`、`scroll_size` 和 `scroll_time`。建议不要将 `index_list` 和 `query` 配置在同一层级。有关更多详细信息,请参考后面的多表同步示例。 ### tls_verify_certificate [boolean] 启用 HTTPS 端点的证书验证 ### tls_verify_hostname [boolean] 启用 HTTPS 端点的主机名验证 ### tls_keystore_path [string] PEM 或 JKS 密钥库的路径。该文件必须对运行 SeaTunnel 的操作系统用户可读。 ### tls_keystore_password [string] 指定密钥库的密钥密码。 ### tls_truststore_path [string] PEM 或 JKS 信任库的路径。该文件必须对运行 SeaTunnel 的操作系统用户可读。 ### tls_truststore_password [string] 指定信任库的密钥密码。 ### search_type 查询类型,可选值: - DSL: 使用 Domain Specific Language 查询(默认) - SQL: 使用 SQL 查询 ### search_api_type 分页 API 类型,可选值: - SCROLL: 使用 Scroll API 进行分页(默认) - PIT: 使用 Point in Time (PIT) API 进行分页 ### pit_keep_alive [long] PIT 应保持活动的时间量(以毫秒为单位) ### pit_batch_size [int] 每次 PIT 搜索请求返回的最大数量 ### runtime_fields [array] 在查询时动态计算字段(Elasticsearch 7.11+)。每个 runtime field 需要包含: - **name**: 字段名 - **type**: 数据类型(boolean, date, double, geo_point, ip, keyword, long) - **script**: Painless 脚本,用于计算字段值 - **script_lang** (可选): 脚本语言(默认:painless) - **script_params** (可选): 脚本参数 示例: ```hocon runtime_fields = [ { name = "day_of_week" type = "keyword" script = "emit(doc['timestamp'].value.dayOfWeekEnum.toString())" }, { name = "total_price" type = "double" script = "emit(doc['quantity'].value * doc['price'].value)" } ] ``` **性能与限制:** - 运行时字段在查询阶段计算,数据量大时会影响性能 - 适合临时分析、字段试验与低频查询 - 需要 Elasticsearch 7.11 及以上版本 ### common options Source 插件常用参数,具体请参考 [Source 常用选项](../common-options/source-common-options.md) ## 使用案例 案例一 > 案例一会从满足seatunnel-*匹配的索引中按照query读取数据,查询只会返回文档`id`,`name`,`age`,`tags`,`phones` 三个字段。在这个例子中,使用了source字段配置应该读取哪些字段,使用`array_column`指定了`tags`,`phones`应该被当做数组处理。 ```hocon Elasticsearch { hosts = ["localhost:9200"] index = "seatunnel-*" array_column = {tags = "array",phones = "array"} source = 
["_id","name","age","tags","phones"] query = {"range":{"firstPacket":{"gte":1669225429990,"lte":1669225429990}}} } ``` 案例二:多索引同步 > 此示例演示了如何从 `read_index1` 和 `read_index2` 中读取不同的数据,并将其分别写入 `read_index1_copy`,`read_index2_copy` 索引。 > 在 `read_index1` 中,我使用 `source` 来指定要读取的字段,并使用`array_column`指明哪些字段是数组字段。 ```hocon source { Elasticsearch { hosts = ["https://elasticsearch:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false tls_verify_hostname = false index_list = [ { index = "read_index1" query = {"range": {"c_int": {"gte": 10, "lte": 20}}} source = [ c_map, c_array, c_string, c_boolean, c_tinyint, c_smallint, c_bigint, c_float, c_double, c_decimal, c_bytes, c_int, c_date, c_timestamp ] array_column = { c_array = "array" } } { index = "read_index2" query = {"match_all": {}} source = [ c_int2, c_date2, c_null ] } ] } } transform { } sink { Elasticsearch { hosts = ["https://elasticsearch:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false tls_verify_hostname = false index = "multi_source_write_test_index" index_type = "st" "schema_save_mode"="CREATE_SCHEMA_WHEN_NOT_EXIST" "data_save_mode"="APPEND_DATA" } } ``` 案例三:SSL(禁用证书验证) ```hocon source { Elasticsearch { hosts = ["https://localhost:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false } } ``` 案例四:SSL(禁用主机名验证) ```hocon source { Elasticsearch { hosts = ["https://localhost:9200"] username = "elastic" password = "elasticsearch" tls_verify_hostname = false } } ``` 案例五:SSL(启用证书验证) ```hocon source { Elasticsearch { hosts = ["https://localhost:9200"] username = "elastic" password = "elasticsearch" tls_keystore_path = "${your elasticsearch home}/config/certs/http.p12" tls_keystore_password = "${your password}" } } ``` 案例六 : sql 方式查询 注意: sql查询不支持map和数组类型 ```hocon source { Elasticsearch { hosts = ["https://elasticsearch:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false tls_verify_hostname = 
false index = "st_index_sql" sql_query = "select * from st_index_sql where c_int>=10 and c_int<=20" search_type = "sql" } } ``` Demo7: PIT方式滚动查询 ```hocon source { Elasticsearch { hosts = ["https://elasticsearch:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false tls_verify_hostname = false index = "st_index" query = {"range": {"c_int": {"gte": 10, "lte": 20}}} # 使用 DSL 查询和 PIT API search_type = DSL search_api_type = PIT pit_keep_alive = 60000 # 1 minute in milliseconds pit_batch_size = 100 } } ``` Demo8: Runtime Fields(Elasticsearch 7.11+) > 该示例演示如何在查询时计算字段值,而无需重建索引。 ```hocon source { Elasticsearch { hosts = ["https://elasticsearch:9200"] username = "elastic" password = "elasticsearch" tls_verify_certificate = false tls_verify_hostname = false index = "sales_data" # 定义运行时字段 runtime_fields = [ { name = "total_amount" type = "double" script = "emit(doc['quantity'].value * doc['price'].value)" }, { name = "day_of_week" type = "keyword" script = "emit(doc['order_date'].value.dayOfWeekEnum.getDisplayName(TextStyle.FULL, Locale.ROOT))" }, { name = "order_category" type = "keyword" script = """ double amount = doc['quantity'].value * doc['price'].value; if (amount > 1000) { emit('high_value'); } else if (amount > 100) { emit('medium_value'); } else { emit('low_value'); } """ }, { name = "price_with_tax" type = "double" script = "emit(doc['price'].value * (1 + params.tax_rate))" script_params = { tax_rate = 0.13 } } ] source = [ "product_id", "quantity", "price", "order_date", "total_amount", "day_of_week", "order_category", "price_with_tax" ] schema = { fields { product_id = string quantity = int price = double order_date = timestamp total_amount = double day_of_week = string order_category = string price_with_tax = double } } } } sink { Console { } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/FakeSource.md ================================================ import ChangeLog from 
'../changelog/connector-fake.md'; # FakeSource > FakeSource 连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 FakeSource 是一个虚拟数据源,它根据用户定义的 schema 数据结构随机生成指定数量的行数据,主要用于类型转换或连接器新功能测试等测试场景。 ## 主要特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) ## 数据源选项 | 名称 | 类型 | 必填 | 默认值 | 描述 | |---------------------------|---------|------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | tables_configs | list | 否 | - | 定义多个 FakeSource,每个项可以包含完整的 FakeSource 配置描述 | | schema | config | 是 | - | 定义 Schema 信息。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | auto.increment.enabled | boolean | 否 | false | 启用自动递增ID | | auto.increment.start | int | 否 | | 自动递增ID的起始值 | | row.num | int | 否 | 5 | 每个并行度生成的数据总行数 | | split.num | int | 否 | 1 | 枚举器为每个并行度生成的分片数量 | | split.read-interval | long | 否 | 1 | 读取器在两个分片读取之间的间隔时间(毫秒) | | map.size | int | 否 | 5 | 连接器生成的 `map` 类型的大小 | | array.size | int | 否 | 5 | 连接器生成的 `array` 类型的大小 | | bytes.length | int | 否 | 5 | 连接器生成的 `bytes` 类型的长度 | | string.length | int | 否 | 5 | 连接器生成的 `string` 类型的长度 | | string.fake.mode | string | 否 | range | 生成字符串数据的伪数据模式,支持 `range` 和 `template`,默认为 `range`,如果配置为 `template`,用户还需配置 `string.template` 选项 | | string.template | list | 否 | - | 连接器生成的字符串类型的模板列表,如果用户配置了此选项,连接器将从模板列表中随机选择一个项 | | tinyint.fake.mode | string | 否 | range | 生成 tinyint 数据的伪数据模式,支持 `range` 和 `template`,默认为 `range`,如果配置为 `template`,用户还需配置 `tinyint.template` 选项 | | tinyint.min | tinyint | 否 | 0 | 连接器生成的 tinyint 数据的最小值 | | tinyint.max | tinyint | 否 | 127 | 连接器生成的 tinyint 数据的最大值 | | tinyint.template | list | 否 | - | 连接器生成的 tinyint 
类型的模板列表,如果用户配置了此选项,连接器将从模板列表中随机选择一个项 | | smallint.fake.mode | string | 否 | range | 生成 smallint 数据的伪数据模式,支持 `range` 和 `template`,默认为 `range`,如果配置为 `template`,用户还需配置 `smallint.template` 选项 | | smallint.min | smallint | 否 | 0 | 连接器生成的 smallint 数据的最小值 | | smallint.max | smallint | 否 | 32767 | 连接器生成的 smallint 数据的最大值 | | smallint.template | list | 否 | - | 连接器生成的 smallint 类型的模板列表,如果用户配置了此选项,连接器将从模板列表中随机选择一个项 | | int.fake.mode | string | 否 | range | 生成 int 数据的伪数据模式,支持 `range` 和 `template`,默认为 `range`,如果配置为 `template`,用户还需配置 `int.template` 选项 | | int.min | int | 否 | 0 | 连接器生成的 int 数据的最小值 | | int.max | int | 否 | 0x7fffffff | 连接器生成的 int 数据的最大值 | | int.template | list | 否 | - | 连接器生成的 int 类型的模板列表,如果用户配置了此选项,连接器将从模板列表中随机选择一个项 | | bigint.fake.mode | string | 否 | range | 生成 bigint 数据的伪数据模式,支持 `range` 和 `template`,默认为 `range`,如果配置为 `template`,用户还需配置 `bigint.template` 选项 | | bigint.min | bigint | 否 | 0 | 连接器生成的 bigint 数据的最小值 | | bigint.max | bigint | 否 | 0x7fffffffffffffff | 连接器生成的 bigint 数据的最大值 | | bigint.template | list | 否 | - | 连接器生成的 bigint 类型的模板列表,如果用户配置了此选项,连接器将从模板列表中随机选择一个项 | | float.fake.mode | string | 否 | range | 生成 float 数据的伪数据模式,支持 `range` 和 `template`,默认为 `range`,如果配置为 `template`,用户还需配置 `float.template` 选项 | | float.min | float | 否 | 0 | 连接器生成的 float 数据的最小值 | | float.max | float | 否 | 0x1.fffffeP+127 | 连接器生成的 float 数据的最大值 | | float.template | list | 否 | - | 连接器生成的 float 类型的模板列表,如果用户配置了此选项,连接器将从模板列表中随机选择一个项 | | double.fake.mode | string | 否 | range | 生成 double 数据的伪数据模式,支持 `range` 和 `template`,默认为 `range`,如果配置为 `template`,用户还需配置 `double.template` 选项 | | double.min | double | 否 | 0 | 连接器生成的 double 数据的最小值 | | double.max | double | 否 | 0x1.fffffffffffffP+1023 | 连接器生成的 double 数据的最大值 | | double.template | list | 否 | - | 连接器生成的 double 类型的模板列表,如果用户配置了此选项,连接器将从模板列表中随机选择一个项 | | vector.dimension | int | 否 | 4 | 生成的向量的维度,不包括二进制向量 | | binary.vector.dimension | int | 否 | 8 | 生成的二进制向量的维度 | | vector.float.min | float | 否 | 0 | 连接器生成的向量中 float 数据的最小值 | | vector.float.max
| float | 否 | 0x1.fffffeP+127 | 连接器生成的向量中 float 数据的最大值 | | common-options | | 否 | - | 数据源插件通用参数,详情请参考 [Source Common Options](../common-options/source-common-options.md) | ## 任务示例 ### 简单示例 > 此示例随机生成指定类型的数据。如果您想了解如何声明字段类型,请点击 [这里](../../introduction/concepts/schema-feature.md#how-to-declare-type-supported)。 ```hocon schema = { fields { c_map = "map<string, array<int>>" c_map_nest = "map<string, {c_int = int, c_string = string}>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp c_row = { c_map = "map<string, array<int>>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } ``` ### 随机生成 > 随机生成 16 条符合类型的数据 ```hocon source { # 这是一个示例输入插件,**仅用于测试和演示功能输入插件** FakeSource { row.num = 16 schema = { fields { c_map = "map<string, string>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } plugin_output = "fake" } } ``` ### 自定义数据内容简单示例 > 这是一个自定义数据源信息的示例,定义每条数据是添加还是删除修改操作,并定义每个字段存储的内容 ```hocon source { FakeSource { schema = { fields { c_map = "map<string, string>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } rows = [ { kind = INSERT fields = [{"a": "b"}, [101], "c_string", true, 117, 15987, 56387395, 7084913402530365000, 1.23, 1.23, "2924137191386439303744.39292216", null, "bWlJWmo=", "2023-04-22", "2023-04-22T23:20:58"] } { kind = UPDATE_BEFORE
fields = [{"a": "c"}, [102], "c_string", true, 117, 15987, 56387395, 7084913402530365000, 1.23, 1.23, "2924137191386439303744.39292216", null, "bWlJWmo=", "2023-04-22", "2023-04-22T23:20:58"] } { kind = UPDATE_AFTER fields = [{"a": "e"}, [103], "c_string", true, 117, 15987, 56387395, 7084913402530365000, 1.23, 1.23, "2924137191386439303744.39292216", null, "bWlJWmo=", "2023-04-22", "2023-04-22T23:20:58"] } { kind = DELETE fields = [{"a": "f"}, [104], "c_string", true, 117, 15987, 56387395, 7084913402530365000, 1.23, 1.23, "2924137191386439303744.39292216", null, "bWlJWmo=", "2023-04-22", "2023-04-22T23:20:58"] } ] } } ``` > 由于 [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md) 规范的限制,用户无法直接创建字节序列对象。FakeSource 使用字符串来分配 `bytes` 类型的值。在上面的示例中,`bytes` 类型字段被分配了 `"bWlJWmo="`,这是通过 **base64** 编码的 "miIZj"。因此,在为 `bytes` 类型字段赋值时,请使用 **base64** 编码的字符串。 ### 指定数据数量简单示例 > 此案例指定生成数据的数量以及生成值的长度 ```hocon FakeSource { row.num = 10 map.size = 10 array.size = 10 bytes.length = 10 string.length = 10 schema = { fields { c_map = "map<string, array<int>>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp c_row = { c_map = "map<string, array<int>>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } ``` ### 模板数据简单示例 > 根据指定模板随机生成 使用模板 ```hocon FakeSource { row.num = 5 string.fake.mode = "template" string.template = ["tyrantlucifer", "hailin", "kris", "fanjia", "zongwen", "gaojun"] tinyint.fake.mode = "template" tinyint.template = [1, 2, 3, 4, 5, 6, 7, 8, 9] smallint.fake.mode = "template" smallint.template = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19] int.fake.mode = "template" int.template = [20, 21,
22, 23, 24, 25, 26, 27, 28, 29] bigint.fake.mode = "template" bigint.template = [30, 31, 32, 33, 34, 35, 36, 37, 38, 39] float.fake.mode = "template" float.template = [40.0, 41.0, 42.0, 43.0] double.fake.mode = "template" double.template = [44.0, 45.0, 46.0, 47.0] schema { fields { c_string = string c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double } } } ``` ### 范围数据简单示例 > 在指定的数据生成范围内随机生成 ```hocon FakeSource { row.num = 5 string.template = ["tyrantlucifer", "hailin", "kris", "fanjia", "zongwen", "gaojun"] tinyint.min = 1 tinyint.max = 9 smallint.min = 10 smallint.max = 19 int.min = 20 int.max = 29 bigint.min = 30 bigint.max = 39 float.min = 40.0 float.max = 43.0 double.min = 44.0 double.max = 47.0 schema { fields { c_string = string c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double } } } ``` ### 生成多张表 > 这是一个生成多数据源测试表 `test.table1` 和 `test.table2` 的示例 ```hocon FakeSource { tables_configs = [ { row.num = 16 schema { table = "test.table1" fields { c_string = string c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double } } }, { row.num = 17 schema { table = "test.table2" fields { c_string = string c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double } } } ] } ``` ### `rows` 选项示例 ```hocon rows = [ { kind = INSERT fields = [1, "A", 100] }, { kind = UPDATE_BEFORE fields = [1, "A", 100] }, { kind = UPDATE_AFTER fields = [1, "A_1", 100] }, { kind = DELETE fields = [1, "A_1", 100] } ] ``` ### `table-names` 选项示例 ```hocon source { # 这是一个示例源插件,**仅用于测试和演示源插件功能** FakeSource { table-names = ["test.table1", "test.table2", "test.table3"] parallelism = 1 schema = { fields { name = "string" age = "int" } } } } ``` ### `defaultValue` 选项示例 可以通过 `row` 和 `columns` 生成自定义数据。对于时间类型,可以通过 `CURRENT_TIMESTAMP`、`CURRENT_TIME`、`CURRENT_DATE` 获取当前时间。 ```hocon schema = { fields { pk_id 
= bigint name = string score = int time1 = timestamp time2 = time time3 = date } } # 使用 rows rows = [ { kind = INSERT fields = [1, "A", 100, CURRENT_TIMESTAMP, CURRENT_TIME, CURRENT_DATE] } ] ``` ```hocon schema = { # 使用 columns columns = [ { name = book_publication_time type = timestamp defaultValue = "2024-09-12 15:45:30" comment = "书籍出版时间" }, { name = book_publication_time2 type = timestamp defaultValue = CURRENT_TIMESTAMP comment = "书籍出版时间2" }, { name = book_publication_time3 type = time defaultValue = "15:45:30" comment = "书籍出版时间3" }, { name = book_publication_time4 type = time defaultValue = CURRENT_TIME comment = "书籍出版时间4" }, { name = book_publication_time5 type = date defaultValue = "2024-09-12" comment = "书籍出版时间5" }, { name = book_publication_time6 type = date defaultValue = CURRENT_DATE comment = "书籍出版时间6" } ] } ``` ### 使用向量示例 ```hocon source { FakeSource { row.num = 10 # 低优先级 vector.dimension= 4 binary.vector.dimension = 8 # 低优先级 schema = { table = "simple_example" columns = [ { name = book_id type = bigint nullable = false defaultValue = 0 comment = "主键 ID" }, { name = book_intro_1 type = binary_vector columnScale =8 comment = "向量" }, { name = book_intro_2 type = float16_vector columnScale =4 comment = "向量" }, { name = book_intro_3 type = bfloat16_vector columnScale =4 comment = "向量" }, { name = book_intro_4 type = sparse_float_vector columnScale =4 comment = "向量" } ] } } } ``` ### 自增主键示例 ```hocon source { # This is a example source plugin **only for test and demonstrate the feature source plugin** FakeSource { plugin_output = "fake" auto.increment.enabled = true auto.increment.start = 1000 row.num = 50000 schema = { fields { id = "int" name = "string" age = "int" } primaryKey { name = "pk" columnNames = [id] } } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/FtpFile.md ================================================ import ChangeLog from '../changelog/connector-file-ftp.md'; # FtpFile > Ftp 文件 Source 
连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次处理](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) - [x] 文件格式类型 - [x] 文本 - [x] CSV - [x] JSON - [x] Excel - [x] XML - [x] 二进制 ## 描述 从 FTP 文件服务器读取数据。 :::tip 如果您使用 Spark/Flink,为了使用此连接器,您必须确保您的 Spark/Flink 集群已经集成了 Hadoop。测试的 Hadoop 版本为 2.x。 如果您使用 SeaTunnel Engine,当您下载并安装 SeaTunnel Engine 时,它会自动集成 Hadoop 的 jar 包。您可以在 `${SEATUNNEL_HOME}/lib` 目录下检查 jar 包以确认这一点。 ::: ## 配置项 | 名称 | 类型 | 是否必填 | 默认值 | |-----------------------------|---------|------|---------------------| | host | string | 是 | - | | port | int | 是 | - | | user | string | 是 | - | | password | string | 是 | - | | path | string | 是 | - | | file_format_type | string | 是 | - | | connection_mode | string | 否 | active_local | | remote_verification_enabled | boolean | 否 | true | | delimiter/field_delimiter | string | 否 | \001 | | read_columns | list | 否 | - | | parse_partition_from_path | boolean | 否 | true | | date_format | string | 否 | yyyy-MM-dd | | datetime_format | string | 否 | yyyy-MM-dd HH:mm:ss | | time_format | string | 否 | HH:mm:ss | | skip_header_row_number | long | 否 | 0 | | schema | config | 否 | - | | sheet_name | string | 否 | - | | xml_row_tag | string | 否 | - | | xml_use_attr_format | boolean | 否 | - | | csv_use_header_line | boolean | 否 | false | | file_filter_pattern | string | 否 | - | | compress_codec | string | 否 | none | | archive_compress_codec | string | 否 | none | | encoding | string | 否 | UTF-8 | | null_format | string | 否 | - | | binary_chunk_size | int | 否 | 1024 | | binary_complete_file_mode | boolean |
否 | false | | sync_mode | string | 否 | full | | target_path | string | 否 | - | | target_hadoop_conf | map | 否 | - | | update_strategy | string | 否 | distcp | | compare_mode | string | 否 | len_mtime | | common-options | | 否 | - | | file_filter_modified_start | string | 否 | - | | file_filter_modified_end | string | 否 | - | | quote_char | string | 否 | " | | escape_char | string | 否 | - | | metalake_type | string | 否 | gravitino | ### host [string] 目标 FTP 主机地址,必填项。 ### port [int] 目标 FTP 端口,必填项。 ### user [string] 目标 FTP 用户名,必填项。 ### password [string] 目标 FTP 密码,必填项。 ### path [string] 源文件路径。 ### remote_verification_enabled [boolean] 是否启用FTP数据通道的远程主机验证。默认值为 `true`。 ### file_filter_pattern [string] 文件过滤模式,用于过滤文件。若只想根据文件名称筛选,则直接写文件名称的正则;若同时想根据文件目录进行过滤,则表达式以`path`起始。 该模式遵循标准正则表达式。详情请参考:https://en.wikipedia.org/wiki/Regular_expression. 以下是一些示例。 若`path`为`/data/seatunnel`,且文件结构示例: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv /data/seatunnel/20241012/logo.png ``` 匹配规则示例: **示例 1**:*匹配所有 .txt 文件*,正则表达式: ``` .*.txt ``` 该示例匹配结果为: ``` /data/seatunnel/20241001/report.txt ``` **示例 2**:*匹配所有以 abc 开头的文件*,正则表达式: ``` abc.* ``` 该示例匹配结果为: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **示例 3**:*匹配20241007文件夹下所有以 abc 开头的文件,且第四个字符为 h 或 g*,正则表达式: ``` /data/seatunnel/20241007/abc[h,g].* ``` 该示例匹配结果为: ``` /data/seatunnel/20241007/abch202410.csv ``` **示例 4**:*匹配第三级文件夹以 202410 开头且文件以 .csv 结尾的文件*,正则表达式: ``` /data/seatunnel/202410\d*/.*.csv ``` 该示例匹配结果为: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### file_format_type [string] 文件类型,支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` 如果您将文件类型指定为 `json`,您还需要指定 schema 选项以告诉连接器如何将数据解析为您所需的行。 例如: 上游数据如下: ```json {"code": 200, "data": "get success", "success": true} ``` 您应按如下方式指定 schema: ```hocon 
schema { fields { code = int data = string success = boolean } } ``` 连接器将生成如下数据: | code | data | success | |------|-------------|---------| | 200 | get success | true | 如果您将文件类型指定为 `text` 或 `csv`,您可以选择是否指定 schema 信息。 例如,上游数据如下: ```text tyrantlucifer#26#male ``` 如果您不指定数据 schema,连接器将按如下方式处理上游数据: | content | |-----------------------| | tyrantlucifer#26#male | 如果您指定数据 schema,您还需要指定 `field_delimiter` 选项(CSV 文件类型除外)。 您应按如下方式指定 schema 和分隔符: ```hocon field_delimiter = "#" schema { fields { name = string age = int gender = string } } ``` 连接器将生成如下数据: | name | age | gender | |---------------|-----|--------| | tyrantlucifer | 26 | male | 如果您将文件类型指定为 `binary`,SeaTunnel 可以同步任何格式的文件, 例如压缩包、图片等。简而言之,任何文件都可以同步到目标位置。 在这种情况下,您需要确保源和接收端同时使用 `binary` 格式进行文件同步。 您可以在下面的示例中找到具体用法。 如果您将文件类型指定为 `markdown`,SeaTunnel 可以解析 markdown 文件并提取结构化数据。 markdown 解析器提取各种元素,包括标题、段落、列表、代码块、表格等。 每个元素都转换为具有以下架构的行: - `element_id`:元素的唯一标识符 - `element_type`:元素类型(Heading、Paragraph、ListItem 等) - `heading_level`:标题级别(1-6,非标题元素为 null) - `text`:元素的文本内容 - `page_number`:页码(默认:1) - `position_index`:文档中的位置索引 - `parent_id`:父元素的 ID - `child_ids`:子元素 ID 的逗号分隔列表 注意:Markdown 格式仅支持读取,不支持写入。 ### connection_mode [string] 目标 FTP 连接模式,默认为主动模式,支持以下模式: `active_local` `passive_local` ### control_encoding [string] FTP 控制连接的字符编码。默认为 `UTF-8`。 当文件路径包含特殊字符(如 `$`、空格、中文字符等)时,需要设置为 `UTF-8` 以确保路径能够正确解析。 例如:`/data/whale_ops/share/$Fund-Product/DA - SANY (三一)/Daily/2025.08.18/file.xlsx` ### delimiter/field_delimiter [string] **delimiter** 参数将在 2.3.5 版本后弃用,请使用 **field_delimiter** 代替。 仅在文件格式为 text 时需要配置。 字段分隔符,用于告诉连接器如何切分字段。 默认值为 `\001`,与 Hive 的默认分隔符相同。 ### parse_partition_from_path [boolean] 控制是否从文件路径中解析分区键和值。 例如,如果您从路径 `ftp://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` 读取文件, 文件中的每条记录数据将添加以下两个字段: | name | age | |---------------|-----| | tyrantlucifer | 26 | 提示:**不要在 schema 选项中定义分区字段** ### date_format [string] 日期类型格式,用于告诉连接器如何将字符串转换为日期,支持以下格式: `yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` 默认值为 `yyyy-MM-dd` ### datetime_format 
[string] 日期时间类型格式,用于告诉连接器如何将字符串转换为日期时间,支持以下格式: `yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` 默认值为 `yyyy-MM-dd HH:mm:ss` ### time_format [string] 时间类型格式,用于告诉连接器如何将字符串转换为时间,支持以下格式: `HH:mm:ss` `HH:mm:ss.SSS` 默认值为 `HH:mm:ss` ### skip_header_row_number [long] 跳过前几行,仅适用于 txt 和 csv 文件。 例如,设置如下: `skip_header_row_number = 2` SeaTunnel 将从源文件中跳过前 2 行。 ### schema [config] 仅在文件格式类型为 text、json、excel、xml 或 csv(或其他无法从元数据中读取 schema 的格式)时需要配置。 上游数据的 schema 信息。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 #### schema_url [string] 通过 restApi 获取元数据信息的 http url,例如:`http://localhost:8090/api/metalakes/laowang_test/catalogs/221-pgsql/schemas/ykw/tables/all_type` > 当使用 Gravitino 作为元数据源时,Gravitino 的列类型会自动转换为 SeaTunnel 数据类型。详细的类型映射信息请参考 [Gravitino 类型映射](../../introduction/concepts/gravitino-type-mapping.md)。 ### metalake_type [string] Metalake 服务类型,目前仅支持 `gravitino`。当使用 `schema_url` 从 Gravitino 获取元数据时,可以指定此参数(默认为 `gravitino`)。 有关 Metalake 的更多信息,请参考 [Metalake](../../introduction/concepts/metalake.md)。 ### read_columns [list] 数据源的读取列列表,用户可以使用它来实现字段投影。 ### sheet_name [string] 读取工作簿中的工作表,仅在文件格式类型为 excel 时使用。 ### xml_row_tag [string] 仅在文件格式为 xml 时需要配置。 指定 XML 文件中数据行的标签名称。 ### xml_use_attr_format [boolean] 仅在文件格式为 xml 时需要配置。 指定是否使用标签属性格式处理数据。 ### csv_use_header_line [boolean] 仅在文件格式为 csv 时可以选择配置。 是否使用标题行来解析文件, 标题行 与 RFC 4180 匹配 ### compress_codec [string] 文件的压缩编解码器,支持的详细信息如下: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: 自动识别压缩类型,无需额外设置。 ### archive_compress_codec [string] 归档文件的压缩编解码器,支持的详细信息如下: | archive_compress_codec | 文件格式 | 归档压缩后缀 | |------------------------|--------------------|-------------------------| | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | | GZ | txt,json,excel,xml | .gz | | NONE | all | .* | 注意:gz 压缩的 excel 文件需要压缩原始文件或指定文件后缀,例如 e2e.xls ->e2e_test.xls.gz ### encoding [string] 仅在文件格式类型为 json、text、csv、xml 时使用。 读取文件的编码。此参数将通过 
`Charset.forName(encoding)` 解析。 ### null_format [string] 仅在文件格式类型为 text 时使用。 用于定义哪些字符串可以表示为 null。 例如:`\N` ### binary_chunk_size [int] 仅在 file_format_type 为 binary 时使用。 读取二进制文件的块大小(以字节为单位)。默认为 1024 字节。较大的值可能会提高大文件的性能,但会使用更多内存。 ### binary_complete_file_mode [boolean] 仅在 file_format_type 为 binary 时使用。 是否将完整文件作为单个块读取,而不是分割成块。启用时,整个文件内容将一次性读入内存。默认为 false。 ### sync_mode [string] 文件同步模式,支持:`full`(默认)、`update`。 当 `update` 时,对源/目标进行对比,只读取新增/变更文件(目前仅支持 `file_format_type=binary`)。 **性能注意事项** - Update 模式会对每个源文件额外发起一次到目标端的 `getFileStatus` 用于对比。 - 对于远程文件系统(FTP/SFTP),会带来按文件的网络开销,不建议用于海量小文件场景。 **要求 / 限制** - `target_path` 通常应与 sink 的 `path` 一致(同一文件系统且相对路径结构一致)。 - 使用 `update_strategy=distcp` 时,依赖源/目标端时钟同步,否则可能误判。 - 使用 `compare_mode=checksum` 时,需要文件系统支持 checksum;若无法获取 checksum,SeaTunnel 会降级为内容比较(开销更大)并打印告警日志。 示例: ```hocon sync_mode = "update" file_format_type = "binary" target_path = "/path/to/your/sink/path" update_strategy = "distcp" compare_mode = "len_mtime" ``` ### target_path [string] 仅在 `sync_mode=update` 时使用。目标端基础路径(通常应与 sink 的 `path` 一致),用于对比同相对路径文件。 ### target_hadoop_conf [map] 仅在 `sync_mode=update` 时使用。目标端 Hadoop 配置(可选),可在其中设置 `fs.defaultFS` 覆盖目标 defaultFS。 ### update_strategy [string] 仅在 `sync_mode=update` 时使用。支持:`distcp`(默认)、`strict`。 ### compare_mode [string] 仅在 `sync_mode=update` 时使用。支持:`len_mtime`(默认)、`checksum`(仅在 `update_strategy=strict` 时可用)。 ### file_filter_modified_start 按照最后修改时间过滤文件。 要过滤的开始时间(包括该时间),时间格式是:`yyyy-MM-dd HH:mm:ss`。 ### file_filter_modified_end 按照最后修改时间过滤文件。 要过滤的结束时间(不包括该时间),时间格式是:`yyyy-MM-dd HH:mm:ss`。 ### quote_char [string] 用于包裹 CSV 字段的单字符,可保证包含逗号、换行符或引号的字段被正确解析。 ### escape_char [string] 用于在 CSV 字段内转义引号或其他特殊字符,使其不会结束字段。 ### 通用选项 源插件的通用参数,详情请参考 [源通用选项](../common-options/source-common-options.md)。 ## 示例 ```hocon FtpFile { path = "/tmp/seatunnel/sink/text" host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao file_format_type = "text" schema = { name = string age = int } field_delimiter = "#" } ``` ### 多表配置 ```hocon FtpFile {
tables_configs = [ { schema { table = "student" } path = "/tmp/seatunnel/sink/text" host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao file_format_type = "parquet" }, { schema { table = "teacher" } path = "/tmp/seatunnel/sink/text" host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao file_format_type = "parquet" } ] } ``` ```hocon FtpFile { tables_configs = [ { schema { fields { name = string age = int } } path = "/apps/hive/demo/student" file_format_type = "json" }, { schema { fields { name = string age = int } } path = "/apps/hive/demo/teacher" file_format_type = "json" } ] } ``` ### 传输二进制文件 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FtpFile { host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao path = "/seatunnel/read/binary/" file_format_type = "binary" binary_chunk_size = 2048 binary_complete_file_mode = false } } sink { // 您可以将本地文件传输到 s3/hdfs/oss 等。 FtpFile { host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao path = "/seatunnel/read/binary2/" file_format_type = "binary" } } ``` ### 增量同步(sync_mode=update,仅 binary) `sync_mode=update` 会对比 source 与 `target_path`,仅读取新增/变更文件。 多数情况下,`target_path` 需要与 sink 的 `path` 对齐(同一文件系统、相同相对路径)。 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FtpFile { host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao path = "/seatunnel/read/binary/" file_format_type = "binary" sync_mode = "update" target_path = "/seatunnel/read/binary2/" update_strategy = "distcp" compare_mode = "len_mtime" } } sink { FtpFile { host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao path = "/seatunnel/read/binary2/" tmp_path = "/seatunnel/read/binary2-tmp/" file_format_type = "binary" } } ``` ### 过滤文件 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FtpFile { host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao path = "/seatunnel/read/binary/" file_format_type = "binary"
// 文件示例 abcD2024.csv file_filter_pattern = "abc[DX]*.*" } } sink { Console { } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Github.md ================================================ import ChangeLog from '../changelog/connector-http-github.md'; # Github > Github 源连接器 ## 描述 用于从 Github 读取数据。 ## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必填 | 默认值 | |---------------------------|----------|------|--------| | url | String | 是 | - | | access_token | String | 否 | - | | method | String | 否 | get | | schema.fields | Config | 否 | - | | format | String | 否 | json | | params | Map | 否 | - | | body | String | 否 | - | | json_field | Config | 否 | - | | content_json | String | 否 | - | | poll_interval_millis | int | 否 | - | | retry | int | 否 | - | | retry_backoff_multiplier_ms | int | 否 | 100 | | retry_backoff_max_ms | int | 否 | 10000 | | enable_multi_lines | boolean | 否 | false | | common-options | config | 否 | - | ### url [String] HTTP 请求 URL。 ### access_token [String] GitHub个人访问令牌,请参阅:[创建个人访问令牌 - Github文档](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token) ### method [String] HTTP 请求方法。目前支持 `GET` 和 `POST`。 ### params [Map] http 参数 ### body [String] HTTP 请求体 ### poll_interval_millis [int] 流模式下请求 API 的间隔时间(毫秒)。 ### retry [int] 请求失败(`IOException`)时最大重试次数。 ### retry_backoff_multiplier_ms [int] 请求失败时的退避时间(毫秒)乘数。 ### retry_backoff_max_ms [int] 请求失败时的最大退避时间(毫秒)。 ### format [String] 上游数据的格式,现在仅支持`json` `text`,默认是`json`。 若你的数据格式为 `json`,需同时配置 schema 选项,例如: 上游数据如下: ```json { "code": 200, "data": "get 
success", "success": true } ``` 您应该配置 schema 为以下内容: ```hocon schema { fields { code = int data = string success = boolean } } ``` 连接器将生成如下数据: | code | data | success | |------|-------------|---------| | 200 | get success | true | 若你设置格式为 `text`,连接器不会对上游数据做出任何改变,示例: 上游数据如下: ```json { "code": 200, "data": "get success", "success": true } ``` 连接器将生成如下数据: | content | |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | ### schema [Config] #### fields [Config] 上游数据的字段定义。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 ### content_json [String] 该参数可用于提取一些 json 数据。如果你只需要 “book” 部分的数据,可以配置 `content_field = "$.store.book.*"`. 如果你的返回数据如下所示: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` 你可以配置 `content_field = "$.store.book.*"` 并且结果返回如下: ```json [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ] ``` 然后你可以通过更简单的 schema 配置获取所需的结果,例如: ```hocon Http { url = "http://mockserver:1080/contentjson/mock" method = "GET" format = "json" content_field = "$.store.book.*" schema = { fields { category = string author = string title = string price = string } } } ``` 这是一个例子: - 测试数据可参考此链接:[mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - 任务配置示例可参考此链接:[http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). 
### json_field [Config] 该参数用于帮助你配置 schema,因此必须与 schema 一起使用。 如果你的数据如下所示: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` 你可以通过如下任务配置获取 “book” 部分的内容: ```hocon source { Http { url = "http://mockserver:1080/jsonpath/mock" method = "GET" format = "json" json_field = { category = "$.store.book[*].category" author = "$.store.book[*].author" title = "$.store.book[*].title" price = "$.store.book[*].price" } schema = { fields { category = string author = string title = string price = string } } } } ``` - 测试数据可参考此链接:[mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - 任务配置示例可参考此链接:[http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). 
### common options 源插件通用参数,请参考 [常用选项](../common-options/source-common-options.md)获取详细说明。 ## 示例 ```hocon Github { url = "https://api.github.com/orgs/apache/repos" access_token = "xxxx" method = "GET" format = "json" schema = { fields { id = int name = string description = string html_url = string stargazers_count = int forks = int } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Gitlab.md ================================================ import ChangeLog from '../changelog/connector-http-gitlab.md'; # Gitlab > Gitlab 源连接器 ## 描述 用于从 Gitlab 读取数据。 ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 选项 | 参数名 | 类型 | 必须 | 默认值 | |-----------------------------|---------|------|--------| | url | String | 是 | - | | access_token | String | 是 | - | | method | String | 否 | get | | schema.fields | Config | 否 | - | | format | String | 否 | json | | params | Map | 否 | - | | body | String | 否 | - | | json_field | Config | 否 | - | | content_json | String | 否 | - | | poll_interval_millis | int | 否 | - | | retry | int | 否 | - | | retry_backoff_multiplier_ms | int | 否 | 100 | | retry_backoff_max_ms | int | 否 | 10000 | | enable_multi_lines | boolean | 否 | false | | common-options | config | 否 | - | ### url [String] http 请求 url ### access_token [String] 个人访问令牌 ### method [String] http 请求方法,仅支持 GET、POST 方法 ### params [Map] http 参数 ### body [String] http 请求体 ### poll_interval_millis [int] 在流模式下请求 http api 的间隔(毫秒) ### retry [int] 如果 http 请求返回 `IOException` 的最大重试次数 ### retry_backoff_multiplier_ms [int] 如果 http 请求失败,重试退避时间(毫秒)乘数 ### retry_backoff_max_ms [int] 如果 http 请求失败,最大重试退避时间(毫秒) ### 
format [String] 上游数据的格式,现在仅支持 `json` `text`,默认 `json`。 当您指定格式为 `json` 时,您还应该指定 schema 选项,例如: 上游数据如下: ```json { "code": 200, "data": "get success", "success": true } ``` 您应该指定 schema 如下: ```hocon schema { fields { code = int data = string success = boolean } } ``` 连接器将生成如下数据: | code | data | success | |------|-------------|---------| | 200 | get success | true | 当您指定格式为 `text` 时,连接器将对上游数据不做任何处理,例如: 上游数据如下: ```json { "code": 200, "data": "get success", "success": true } ``` 连接器将生成如下数据: | content | |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | ### schema [Config] #### fields [Config] 上游数据的模式字段。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 ### content_json [String] 此参数可以获取一些 json 数据。如果您只需要 'book' 部分中的数据,请配置 `content_field = "$.store.book.*"`。 如果您的返回数据看起来像这样。 ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` 您可以配置 `content_field = "$.store.book.*"`,返回的结果看起来像这样: ```json [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ] ``` 然后您可以使用更简单的 schema 获得所需的结果,如 ```hocon Http { url = "http://mockserver:1080/contentjson/mock" method = "GET" format = "json" content_field = "$.store.book.*" schema = { fields { category = string author = string title = string price = string } } } ``` 这是一个示例: - 测试数据可以在此链接找到 [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - 查看此链接了解任务配置 
[http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf)。 ### json_field [Config] 此参数可帮助您配置 schema,因此此参数必须与 schema 一起使用。 如果您的数据看起来像这样: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` 您可以通过配置任务如下来获取 'book' 的内容: ```hocon source { Http { url = "http://mockserver:1080/jsonpath/mock" method = "GET" format = "json" json_field = { category = "$.store.book[*].category" author = "$.store.book[*].author" title = "$.store.book[*].title" price = "$.store.book[*].price" } schema = { fields { category = string author = string title = string price = string } } } } ``` - 测试数据可以在此链接找到 [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - 查看此链接了解任务配置 [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf)。 ### 通用选项 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见 ## 示例 ```hocon Gitlab{ url = "https://gitlab.com/api/v4/projects" access_token = "xxxxx" schema { fields { id = int description = string name = string name_with_namespace = string path = string http_url_to_repo = string } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/GoogleSheets.md ================================================ import ChangeLog from '../changelog/connector-google-sheets.md'; # GoogleSheets > GoogleSheets 源连接器 ## 描述 用于从GoogleSheets读取数据. 
## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) - [ ] 文件格式 - [ ] text - [ ] csv - [ ] json ## 选项 | 名称 | 类型 | 必需 | 默认值 | |---------------------|--------|----------|---------------| | service_account_key | string | 是 | - | | sheet_id | string | 是 | - | | sheet_name | string | 是 | - | | range | string | 是 | - | | schema | config | 否 | - | ### service_account_key [string] 谷歌云服务帐户,需要base64编码 ### sheet_id [string] Google表格URL中的表格id ### sheet_name [string] 要导入的工作表的名称 ### range [string] 要导入的 sheet 页的范围 ### schema [config] #### fields [config] 上游数据的字段。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 ## 示例 简单示例: ```hocon GoogleSheets { service_account_key = "seatunnel-test" sheet_id = "1VI0DvyZK-NIdssSdsDSsSSSC-_-rYMi7ppJiI_jhE" sheet_name = "sheets01" range = "A1:C3" schema = { fields { a = int b = string c = string } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/GraphQL.md ================================================ import ChangeLog from '../changelog/connector-graphql.md'; # GraphQL > GraphQL Source 连接器 ## 描述 用于读取GraphQL数据。 ## 主要特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [并行](../../introduction/concepts/connector-v2-features.md) ## 源选项 | 名称 | 类型 | 是否必填 | 默认值 | | --------------------------- | ------- | -------- | ----------------------- | | url | String | Yes | - | | query | String | Yes | - | | variables | Config | No | - | | enable_subscription | boolean | No | false | | timeout | Long | No | - | | content_field | String | Yes | 
$.data.{query_object}.* | | schema.fields | Config | Yes | - | | params | Map | Yes | - | | poll_interval_millis | int | No | - | | retry | int | No | - | | retry_backoff_multiplier_ms | int | No | 100 | | retry_backoff_max_ms | int | No | 10000 | | enable_multi_lines | boolean | No | false | | common-options | config | No | - | ### url [String] http 请求路径。 ### query [String] GraphQL 表达式查询字符串 ### variables [String] GraphQL 变量 比如 ``` variables = { limit = 2 } ``` ### enable_subscription [boolean] 1. true : 开启流式订阅模式(WebSocket) 2. false : 开启批处理查询模式(HTTP) ### timeout [Long] 超时时间 ### content_field [String] SONPath通配符 ### params [Map] HTTP请求参数 ### poll_interval_millis [int] 流模式下请求HTTP API间隔(毫秒) ### retry [int] 如果请求http返回到‘ IOException ’的最大重试次数 ### retry_backoff_multiplier_ms [int] 如果请求http失败,则重试回退时间(毫秒)倍率 ### retry_backoff_max_ms [int] 如果http请求失败,最大重试回退时间(毫秒) ### schema [Config] 填写一个固定值 ```hocon schema = { fields { metric = "map" value = double time = long } } ``` #### fields [Config] 上游数据的模式字段 ### common options 源插件常用参数,请参考 [Source Common Options](../source-common-options.md) 获取详细信息 ## 示例 ### Query ```hocon source { GraphQL { url = "http://192.168.1.103:9081/v1/graphql" content_field = "$.data.source" query = """ query MyQuery($limit: Int) { source(limit: $limit) { id val_bool val_double val_float } } """ variables = { limit = 2 } schema = { fields { id = "int" val_bool = "boolean" val_double = "double" val_float = "float" } } } } ``` ### Subscription ```hocon source { GraphQL { url = "http://192.168.1.103:9081/v1/graphql" content_field = "$.data.source" query = """ query MyQuery($limit: Int) { source(limit: $limit) { id val_bool val_double val_float } } """ variables = { limit = 2 } enable_subscription = true schema = { fields { id = "int" val_bool = "boolean" val_double = "double" val_float = "float" } } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Greenplum.md ================================================ import 
ChangeLog from '../changelog/connector-jdbc.md'; # Greenplum > Greenplum 源连接器 ## 描述 通过 [Jdbc 连接器](Jdbc.md) 读取 Greenplum 数据。 ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) 支持查询 SQL 并可以实现投影效果。 - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) :::tip 可选的 jdbc 驱动程序: - `org.postgresql.Driver` - `com.pivotal.jdbc.GreenplumDriver` 警告:为了符合许可证要求,如果您使用 `GreenplumDriver`,必须自己提供 Greenplum JDBC 驱动程序,例如将 greenplum-xxx.jar 复制到 $SEATUNNEL_HOME/lib(用于独立模式)。 ::: ## 选项 ### 通用选项 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Hbase.md ================================================ import ChangeLog from '../changelog/connector-hbase.md'; # Hbase > Hbase 源连接器 ## 描述 从 Apache Hbase 读取数据。 ## 主要功能 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [Schema](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户定义的拆分](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必填 | 默认值 | |----------------------|----------|----|-------| | zookeeper_quorum | string | 是 | - | | table | string | 是 | - | | schema | config | 是 | - | | hbase_extra_config | config | 否 | - | | caching | int | 否 | -1 | | batch | int | 否 | -1 | | cache_blocks | boolean | 否 | false | | is_binary_rowkey | boolean | 否 | false | | start_rowkey | string | 否 | - | | end_rowkey | string | 否 | - | | start_row_inclusive | boolean | 否 | true | | end_row_inclusive | boolean | 
否 | false | | start_timestamp | long | 否 | - | | end_timestamp | long | 否 | - | | common-options | | 否 | - | ### zookeeper_quorum [string] hbase的zookeeper集群主机,例如:“hadoop001:2181,hadoop002:2181,hadoop003:2181” ### table [string] 要写入的表名,例如:“seatunnel” 如果表在自定义 namespace 下,请使用 `namespace:table` 形式(如 `ns1:seatunnel_test`);未填写 namespace 时,SeaTunnel 会使用 HBase 的默认命名空间 `default`。 ### schema [config] Hbase 使用字节数组进行存储。因此,您需要为表中的每一列配置数据类型。有关更多信息,请参阅:[guide](../../introduction/concepts/schema-feature.md#how-to-declare-type-supported)。 ### hbase_extra_config [config] hbase 的额外配置 ### caching caching 参数用于设置在扫描过程中一次从服务器端获取的行数。这可以减少客户端与服务器之间的往返次数,从而提高扫描效率。默认值:-1 ### batch batch 参数用于设置在扫描过程中每次返回的最大列数。这对于处理有很多列的行特别有用,可以避免一次性返回过多数据,从而节省内存并提高性能。 ### cache_blocks cache_blocks 参数用于设置在扫描过程中是否缓存数据块。默认情况下,HBase 会在扫描时将数据块缓存到块缓存中。如果设置为 false,则在扫描过程中不会缓存数据块,从而减少内存的使用。在SeaTunnel中默认值为: false ### is_binary_rowkey HBase 的行键既可以是文本字符串,也可以是二进制数据。在 SeaTunnel 中,行键默认设置为文本字符串(即 is_binary_rowkey 默认值为 false) ### start_rowkey 扫描起始行 ### end_rowkey 扫描结束行 ### start_row_inclusive 设置扫描范围是否包含起始行。当设置为 true 时,扫描结果将包含起始行。默认值: true (包含)。 **注意:** 在大多数情况下,应保持默认值 (true)。仅当您有特定需求需要排除起始行时才修改此参数。 ### end_row_inclusive 设置扫描范围是否包含结束行。当设置为 false 时,扫描结果将不包含结束行,遵循左闭右开的区间约定 [start, end)。默认值: false (不包含)。 **注意:** 在大多数情况下,应保持默认值 (false),这遵循 HBase 标准的左闭右开区间约定。仅当您需要在扫描结果中包含结束行时才修改此参数。 **重要提示:** 在使用多个 split 并行读取时,这两个参数的组合对数据完整性至关重要: - **默认配置 (start_row_inclusive=true, end_row_inclusive=false)**: 这是推荐的配置,可以确保跨 split 时不会丢失数据或产生重复数据。每个 split 遵循 [start, end) 左闭右开区间约定。 - **都设置为 false (start_row_inclusive=false, end_row_inclusive=false)**: 这可能会导致**数据丢失**,因为边界行会被所有 split 排除在外。 - **都设置为 true (start_row_inclusive=true, end_row_inclusive=true)**: 这可能会导致**数据重复**,因为边界行会被相邻的多个 split 重复包含。 ### start_timestamp 时间范围扫描的起始时间戳(包含)。单位为毫秒(epoch)。时间范围遵循 [start, end) 左闭右开约定。如果只设置 start_timestamp,则最大值视为无限上界。 ### end_timestamp 时间范围扫描的结束时间戳(不包含)。单位为毫秒(epoch)。时间范围遵循 [start, end) 左闭右开约定。如果只设置 end_timestamp,则最小值视为无限下界。 **说明:** - `start_timestamp` / 
`end_timestamp` 必须大于等于 0;若两者同时配置,需要满足 `start_timestamp < end_timestamp`(遵循 [start, end) 约定,`start_timestamp == end_timestamp` 将导致空扫描)。 - 当 `start_rowkey` / `end_rowkey` 与 `start_timestamp` / `end_timestamp` 同时配置时,会同时应用行键范围与时间范围限制,最终返回两者的交集。 ### 常用选项 Source 插件常用参数,具体请参考 [Source 常用选项](../common-options/source-common-options.md) ## 示例 ```bash source { Hbase { zookeeper_quorum = "hadoop001:2181,hadoop002:2181,hadoop003:2181" table = "seatunnel_test" caching = 1000 batch = 100 cache_blocks = false is_binary_rowkey = false start_rowkey = "B" end_rowkey = "C" start_timestamp = 1700000000000 end_timestamp = 1700003600000 schema = { columns = [ { name = "rowkey" type = string }, { name = "columnFamily1:column1" type = boolean }, { name = "columnFamily1:column2" type = double }, { name = "columnFamily2:column1" type = bigint } ] } } } ``` ## Kerberos 示例 备注: - `connector-hbase` 不会解析 `krb5_path` / `kerberos_principal` / `kerberos_keytab_path`。 - 需要在运行环境中提前完成 Kerberos 登录并保证 `krb5.conf` 可被 JVM 访问(例如 `kinit -kt ...` 或 JVM `-Djava.security.krb5.conf=...`),同时将 HBase/Hadoop 的安全配置写入 `hbase_extra_config`。 ```hocon source { Hbase { zookeeper_quorum = "zk1:2181,zk2:2181,zk3:2181" table = "source_table" caching = 1000 batch = 200 cache_blocks = false is_binary_rowkey = false # HBase安全配置 hbase_extra_config = { "hbase.security.authentication" = "kerberos" "hadoop.security.authentication" = "kerberos" "hbase.master.kerberos.principal" = "hbase/_HOST@REALM" "hbase.regionserver.kerberos.principal" = "hbase/_HOST@REALM" "hbase.rpc.protection" = "authentication" "hbase.zookeeper.useSasl" = "false" } schema = { columns = [ { name = "rowkey", type = string }, { name = "info:name", type = string }, { name = "info:score", type = string } ] } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/HdfsFile.md ================================================ import ChangeLog from '../changelog/connector-file-hadoop.md'; # HdfsFile > Hdfs 文件数据源连接器 ## 支持的引擎 > 
Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 在 pollNext 调用中读取分片中的所有数据。读取的分片将保存在快照中。 - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户定义分片](../../introduction/concepts/connector-v2-features.md) - [x] [支持多表读](../../introduction/concepts/connector-v2-features.md) - [x] 文件格式类型 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## 描述 从 hdfs 文件系统读取数据。 ## 支持的数据源信息 | 数据源 | 支持的版本 | |--------|------------------| | HdfsFile | hadoop 2.x 和 3.x | ## 数据源选项 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |----------------------------|---------|------|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | path | string | 是 | - | 源文件路径。 | | file_format_type | string | 是 | - | 我们支持以下文件类型:`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown`。请注意,最终文件名将以文件格式的后缀结束,文本文件的后缀是 `txt`。 | | fs.defaultFS | string | 是 | - | 以 `hdfs://` 开头的 hadoop 集群地址,例如:`hdfs://hadoopcluster` | | read_columns | list | 否 | - | 数据源的读取列列表,用户可以使用它来实现字段投影。支持列投影的文件类型如下所示:[text,json,csv,orc,parquet,excel,xml]。提示:如果用户想在读取 `text` `json` `csv` 文件时使用此功能,必须配置 schema 选项。 | | hdfs_site_path | string | 否 | - | `hdfs-site.xml` 的路径,用于加载 namenodes 的 ha 配置 | | delimiter/field_delimiter | string | 否 | \001 | 字段分隔符,用于告诉连接器在读取文本文件时如何分割字段。默认 `\001`,与 hive 的默认分隔符相同 | | row_delimiter | string | 否 | \n | 行分隔符,用于告诉连接器在读取文本文件时如何分割行。默认 `\n`。 | | parse_partition_from_path | boolean | 否 | true | 控制是否从文件路径解析分区键和值。例如,如果您从路径 
`hdfs://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` 读取文件。文件中的每条记录数据都将添加这两个字段:[name:tyrantlucifer,age:26]。提示:不要在 schema 选项中定义分区字段。 | | date_format | string | 否 | yyyy-MM-dd | 日期类型格式,用于告诉连接器如何将字符串转换为日期,支持以下格式:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` 默认 `yyyy-MM-dd`。日期类型格式,用于告诉连接器如何将字符串转换为日期,支持以下格式:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` 默认 `yyyy-MM-dd` | | datetime_format | string | 否 | yyyy-MM-dd HH:mm:ss | 日期时间类型格式,用于告诉连接器如何将字符串转换为日期时间,支持以下格式:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss`。默认 `yyyy-MM-dd HH:mm:ss` | | time_format | string | 否 | HH:mm:ss | 时间类型格式,用于告诉连接器如何将字符串转换为时间,支持以下格式:`HH:mm:ss` `HH:mm:ss.SSS`。默认 `HH:mm:ss` | | remote_user | string | 否 | - | 用于连接到 hadoop 登录名的登录用户。它旨在用于 RPC 中的远程用户,不会有任何凭据。 | | krb5_path | string | 否 | /etc/krb5.conf | kerberos 的 krb5 路径 | | kerberos_principal | string | 否 | - | kerberos 的主体 | | kerberos_keytab_path | string | 否 | - | kerberos 的 keytab 路径 | | skip_header_row_number | long | 否 | 0 | 跳过前几行,但仅适用于 txt 和 csv。例如,设置如下:`skip_header_row_number = 2`。然后 Seatunnel 将跳过源文件的前 2 行 | | schema | config | 否 | - | 上游数据的 schema 字段。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | sheet_name | string | 否 | - | 读取工作簿的工作表,仅在 file_format 为 excel 时使用。 | | xml_row_tag | string | 否 | - | 指定 XML 文件中数据行的标签名称,仅在 file_format 为 xml 时使用。 | | xml_use_attr_format | boolean | 否 | - | 指定是否使用标签属性格式处理数据,仅在 file_format 为 xml 时使用。 | | csv_use_header_line | boolean | 否 | false | 是否使用标题行解析文件,仅在 file_format 为 `csv` 且文件包含符合 RFC 4180 的标题行时使用 | | file_filter_pattern | string | 否 | | 过滤模式,用于过滤文件。 | | filename_extension | string | 否 | - | 过滤文件扩展名,用于过滤具有特定扩展名的文件。示例:`csv` `.txt` `json` `.xml`。 | | compress_codec | string | 否 | none | 文件的压缩编解码器 | | archive_compress_codec | string | 否 | none | | | encoding | string | 否 | UTF-8 | | | null_format | string | 否 | - | 仅在 file_format_type 为 text 时使用。null_format 定义哪些字符串可以表示为 null。例如:`\N` | | binary_chunk_size | int | 否 | 1024 | 仅在 file_format_type 为 
binary 时使用。读取二进制文件的块大小(以字节为单位)。默认为 1024 字节。较大的值可能会提高大文件的性能,但会使用更多内存。 | | binary_complete_file_mode | boolean | 否 | false | 仅在 file_format_type 为 binary 时使用。是否将完整文件作为单个块读取,而不是分割成块。启用时,整个文件内容将一次性读入内存。默认为 false。 | | sync_mode | string | 否 | full | 文件同步模式,支持:`full`(默认)、`update`。当 `update` 时,对源/目标进行对比,只读取新增/变更文件(目前仅支持 `file_format_type=binary`)。 | | target_path | string | 否 | - | 仅在 `sync_mode=update` 时使用。目标端基础路径(通常应与 sink 的 `path` 一致),用于对比同相对路径文件。 | | target_hadoop_conf | map | 否 | - | 仅在 `sync_mode=update` 时使用。目标端 Hadoop 配置(可选),可在其中设置 `fs.defaultFS` 覆盖目标 defaultFS。 | | update_strategy | string | 否 | distcp | 仅在 `sync_mode=update` 时使用。支持:`distcp`(默认)、`strict`。 | | compare_mode | string | 否 | len_mtime | 仅在 `sync_mode=update` 时使用。支持:`len_mtime`(默认)、`checksum`(仅在 `update_strategy=strict` 时可用)。 | | common-options | | 否 | - | 数据源插件通用参数,请参阅 [数据源通用选项](../source-common-options.md) 了解详情。 | | file_filter_modified_start | string | 否 | - | 按照最后修改时间过滤文件。 要过滤的开始时间(包括改时间),时间格式是:`yyyy-MM-dd HH:mm:ss` | | file_filter_modified_end | string | 否 | - | 按照最后修改时间过滤文件。 要过滤的结束时间(不包括改时间),时间格式是:`yyyy-MM-dd HH:mm:ss` | | enable_file_split | boolean | 否 | false | 开启大文件拆分以提升并行度。仅支持 `text`/`csv`/`json`/`parquet` 且非压缩格式(`compress_codec=none` 且 `archive_compress_codec=none`)。 | | file_split_size | long | 否 | 134217728 | `enable_file_split=true` 时生效,单位字节。`text`/`csv`/`json` 按 `file_split_size` 拆分并对齐到下一个 `row_delimiter`;`parquet` 以 RowGroup 为拆分单位,不会切开 RowGroup。 | | quote_char | string | 否 | " | 用于包裹 CSV 字段的单字符,可保证包含逗号、换行符或引号的字段被正确解析。 | | escape_char | string | 否 | - | 用于在 CSV 字段内转义引号或其他特殊字符,使其不会结束字段。 | | metalake_type | string | 否 | gravitino | Metalake 服务类型,目前支持 `gravitino`。 | ### file_format_type [string] 文件类型,支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` 如果您将文件类型指定为 `markdown`,SeaTunnel 可以解析 markdown 文件并提取结构化数据。 markdown 解析器提取各种元素,包括标题、段落、列表、代码块、表格等。 每个元素都转换为具有以下架构的行: - `element_id`:元素的唯一标识符 - `element_type`:元素类型(Heading、Paragraph、ListItem 等) - 
`heading_level`:标题级别(1-6,非标题元素为 null) - `text`:元素的文本内容 - `page_number`:页码(默认:1) - `position_index`:文档中的位置索引 - `parent_id`:父元素的 ID - `child_ids`:子元素 ID 的逗号分隔列表 注意:Markdown 格式仅支持读取,不支持写入。 ### delimiter/field_delimiter [string] **delimiter** 参数将在 2.3.5 版本后弃用,请使用 **field_delimiter** 代替。 ### row_delimiter [string] 仅在 file_format 为 text 时需要配置。 行分隔符,用于告诉连接器如何分割行。 默认 `\n`。 ### file_filter_pattern [string] 文件过滤模式,用于过滤文件。若只想根据文件名称筛选,则直接写文件名称的正则;若同时想根据文件目录进行过滤,则表达式以`path`起始。 该模式遵循标准正则表达式。详情请参考 https://en.wikipedia.org/wiki/Regular_expression。 以下是一些示例。 若`path`为`/data/seatunnel`,且文件结构示例: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv /data/seatunnel/20241012/logo.png ``` 匹配规则示例: **示例 1**:*匹配所有 .txt 文件*,正则表达式: ``` .*.txt ``` 此示例匹配的结果是: ``` /data/seatunnel/20241001/report.txt ``` **示例 2**:*匹配所有以 abc 开头的文件*,正则表达式: ``` abc.* ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **示例 3**:*匹配20241007文件夹下所有以 abc 开头的文件,且第四个字符为 h 或 g*,正则表达式: ``` /data/seatunnel/20241007/abc[h,g].* ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv ``` **示例 4**:*匹配以 202410 开头的第三级文件夹和以 .csv 结尾的文件*,正则表达式: ``` /data/seatunnel/202410\d*/.*.csv ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### compress_codec [string] 文件的压缩编解码器及其支持的详细信息如下所示: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: 自动识别压缩类型,无需额外设置。 ### archive_compress_codec [string] 归档文件的压缩编解码器及其支持的详细信息如下所示: | archive_compress_codec | file_format | archive_compress_suffix | |------------------------|-------------------|-------------------------| | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | | GZ | txt,json,excel,xml | .gz | | NONE | all | .* | 注意:gz 压缩的 excel 文件需要压缩原始文件或指定文件后缀,例如 e2e.xls 
->e2e_test.xls.gz ### encoding [string] 仅在 file_format_type 为 json,text,csv,xml 时使用。 要读取的文件的编码。此参数将由 `Charset.forName(encoding)` 解析。 ### binary_chunk_size [int] 仅在 file_format_type 为 binary 时使用。 读取二进制文件的块大小(以字节为单位)。默认为 1024 字节。较大的值可能会提高大文件的性能,但会使用更多内存。 ### binary_complete_file_mode [boolean] 仅在 file_format_type 为 binary 时使用。 是否将完整文件作为单个块读取,而不是分割成块。启用时,整个文件内容将一次性读入内存。默认为 false。 ### sync_mode [string] 文件同步模式,支持:`full`(默认)`update`。 当 `sync_mode=update` 时,会在读取端对源/目标进行对比,只读取新增/变更文件(目前仅支持 `file_format_type=binary`)。 ### target_path [string] 仅在 `sync_mode=update` 时使用。 目标端基础路径(通常应与 sink 的 `path` 保持一致),用于对比同相对路径的目标文件是否存在/是否需要更新。 ### target_hadoop_conf [map] 仅在 `sync_mode=update` 时使用。 用于访问目标文件系统的 Hadoop 配置(可选)。当不配置时默认复用 source 端的文件系统配置。 可在该 map 中指定 `fs.defaultFS` 来覆盖目标端 defaultFS,例如:`"fs.defaultFS" = "hdfs://nn2:9000"`。 ### update_strategy [string] 仅在 `sync_mode=update` 时使用。支持:`distcp`(默认)`strict`。 - `distcp`:更接近 `distcp -update` 的语义: - 目标文件不存在 → COPY - 长度不同 → COPY - `mtime(source) > mtime(target)` → COPY - 否则 → SKIP - `strict`:严格一致性,配合 `compare_mode` 判断是否 SKIP。 ### compare_mode [string] 仅在 `sync_mode=update` 时使用。支持:`len_mtime`(默认)`checksum`。 - `len_mtime`:`len` 与 `mtime` 都相同才 SKIP,否则 COPY。 - `checksum`:要求 `len` 相同且 Hadoop `getFileChecksum` 相同才 SKIP,否则 COPY(仅在 `update_strategy=strict` 时生效)。 ### enable_file_split [boolean] 开启大文件拆分功能,默认 false。仅支持 `csv`/`text`/`json`/`parquet` 且非压缩格式(`compress_codec=none` 且 `archive_compress_codec=none`)。 - `text`/`csv`/`json`:按 `file_split_size` 拆分并对齐到下一个 `row_delimiter`,避免切开一行/一条记录。 - `parquet`:以 RowGroup 为逻辑拆分单位,不会切开 RowGroup。 **使用建议** - 适合:读取少量大文件,并希望通过更高并行度提升吞吐。 - 不建议:读取大量小文件,或并行度较低的场景(拆分会带来额外的枚举/调度开销)。 **限制说明** - 不支持压缩文件(`compress_codec` != `none`)或归档文件(`archive_compress_codec` != `none`),会自动回退为不拆分。 - 对于 `text`/`csv`/`json`,实际 split 的大小可能略大于 `file_split_size`(因为需要对齐到下一个 `row_delimiter`)。 ### file_split_size [long] `enable_file_split=true` 时生效,单位字节。默认 128MB(134217728)。 **调优建议** - 建议从默认值(128MB)开始:如果并行度未充分利用可适当调小;如果 split 数量过多可适当调大。 - 
经验公式:`file_split_size ≈ file_size / 期望并行度`。 ### quote_char [string] 用于包裹 CSV 字段的单字符,可保证包含逗号、换行符或引号的字段被正确解析。 ### escape_char [string] 用于在 CSV 字段内转义引号或其他特殊字符,使其不会结束字段。 ### schema [config] 仅在文件格式类型为 text、json、excel、xml 或 csv(或其他无法从元数据中读取 schema 的格式)时需要配置。 上游数据的 schema 信息。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 #### schema_url [string] 通过 restApi 获取元数据信息的 http url,例如:`http://localhost:8090/api/metalakes/laowang_test/catalogs/221-pgsql/schemas/ykw/tables/all_type` > 当使用 Gravitino 作为元数据源时,Gravitino 的列类型会自动转换为 SeaTunnel 数据类型。详细的类型映射信息请参考 [Gravitino 类型映射](../../introduction/concepts/gravitino-type-mapping.md)。 ### metalake_type [string] Metalake 服务类型,目前仅支持 `gravitino`。当使用 `schema_url` 从 Gravitino 获取元数据时,可以指定此参数(默认为 `gravitino`)。 有关 Metalake 的更多信息,请参考 [Metalake](../../introduction/concepts/metalake.md)。 ### 提示 > 如果您使用 spark/flink,为了使用此连接器,您必须确保您的 spark/flink 集群已经集成了 hadoop。测试过的 hadoop 版本是 2.x。如果您使用 SeaTunnel Engine,则在下载和安装 SeaTunnel Engine 时会自动集成 hadoop jar。您可以检查 `${SEATUNNEL_HOME}/lib` 下的 jar 包来确认这一点。 ## 任务示例 ### 简单示例 > 此示例定义了一个 SeaTunnel 同步任务,从 Hdfs 读取数据并将其发送到 Hdfs。 ``` # 定义运行时环境 env { parallelism = 1 job.mode = "BATCH" } source { HdfsFile { schema { fields { name = string age = int } } path = "/apps/hive/demo/student" file_format_type = "json" fs.defaultFS = "hdfs://namenode001" } # 如果您想获取有关如何配置 seatunnel 的更多信息和查看完整的数据源插件列表, # 请访问 https://seatunnel.apache.org/docs/connectors/source } transform { # 如果您想获取有关如何配置 seatunnel 的更多信息和查看完整的转换插件列表, # 请访问 https://seatunnel.apache.org/docs/transforms } sink { HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" file_format_type = "orc" } # 如果您想获取有关如何配置 seatunnel 的更多信息和查看完整的接收器插件列表, # 请访问 https://seatunnel.apache.org/docs/connectors/sink } ``` ### 过滤文件 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { HdfsFile { path = "/apps/hive/demo/student" file_format_type = "json" fs.defaultFS = "hdfs://namenode001" // 文件示例 abcD2024.csv file_filter_pattern = "abc[DX]*.*" } } 
sink { Console { } } ``` ### 多表配置 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { HdfsFile { tables_configs = [ { schema = { table = "student" } path = "/apps/hive/demo/student" file_format_type = "json" fs.defaultFS = "hdfs://namenode001" }, { schema = { table = "teacher" } path = "/apps/hive/demo/teacher" file_format_type = "json" fs.defaultFS = "hdfs://namenode001" } ] } } sink { HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/${table_name}" file_format_type = "orc" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Hive.md ================================================ import ChangeLog from '../changelog/connector-hive.md'; # Hive > Hive 源连接器 ## 描述 从 Hive 读取数据。 使用 markdown 格式时,SeaTunnel 可以解析存储在 Hive 表中的 markdown 文件并提取结构化数据,包括标题、段落、列表、代码块和表格等元素。每个元素都转换为具有以下架构的行: - `element_id`:元素的唯一标识符 - `element_type`:元素类型(Heading、Paragraph、ListItem 等) - `heading_level`:标题级别(1-6,非标题元素为 null) - `text`:元素的文本内容 - `page_number`:页码(默认:1) - `position_index`:文档中的位置索引 - `parent_id`:父元素的 ID - `child_ids`:子元素 ID 的逗号分隔列表 注意:Markdown 格式仅支持读取,不支持写入。 :::tip 提示 为了使用此连接器,您必须确保您的 Spark/Flink 集群已经集成了 Hive。测试过的 Hive 版本是 2.3.9 和 3.1.3。 如果您使用 SeaTunnel 引擎,您需要将 `seatunnel-hadoop3-3.1.4-uber.jar`、`hive-exec-3.1.3.jar` 和 `libfb303-0.9.3.jar` 放在 `$SEATUNNEL_HOME/lib/` 目录中。 ::: ## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 在 `pollNext` 调用中读取分片中的所有数据。读取的分片将保存在快照中。 - [x] [schema 投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户定义的分片](../../introduction/concepts/connector-v2-features.md) - [x] 文件格式 - [x] 文本 - [x] CSV - [x] Parquet - [x] ORC - [x] JSON - [x] markdown ## 选项 | 名称 | 类型 | 必需 | 默认值 | |-----------------------|--------|------|---------| | table_name | string | 是 | - 
| | use_regex | boolean| 否 | false | | metastore_uri | string | 是 | - | | krb5_path | string | 否 | /etc/krb5.conf | | kerberos_principal | string | 否 | - | | kerberos_keytab_path | string | 否 | - | | hdfs_site_path | string | 否 | - | | hive_site_path | string | 否 | - | | hive.hadoop.conf | Map | 否 | - | | hive.hadoop.conf-path | string | 否 | - | | read_partitions | list | 否 | - | | read_columns | list | 否 | - | | compress_codec | string | 否 | none | | common-options | | 否 | - | ### table_name [string] 目标 Hive 表名,例如:`db1.table1`。当 `use_regex = true` 时,该字段支持 `数据库正则.表正则`(Hive 没有 schema)来匹配 Hive 元存储中的多张表。 ### use_regex [boolean] 是否将 `table_name` 视为正则表达式进行匹配。开启后,`table_name` 可用于整库/多表同步;同样也支持在 `table_list` / `tables_configs` 的每个表配置里单独开启。 语法说明: - 点号(`.`)被视为数据库与表之间的分隔符(Hive 仅支持 `database.table`)。 - 只允许出现 1 个未转义的点号(`.`)(作为数据库/表分隔符)。如果需要在正则表达式中使用点号(`.`)(例如 `.*`),必须写成 `\.`(HOCON 字符串里需要写成 `\\.`)。 - 例如:`db0.\.*`、`db1.user_table_[0-9]+`、`db[1-2].(app|web)order_\.*`。 - 在 SeaTunnel 作业配置(HOCON 字符串)中,反斜杠需要再次转义。例如正则 `db0.\.*` 在配置中应写成 `db0.\\.*`。 - `db0.\.*` 表示同步 `db0` 库下的所有表(整库同步)。 - `\.*.\.*` 表示同步所有库下的所有表(整 Hive 同步)。 ### metastore_uri [string] Hive 元存储 URI。支持通过逗号分隔配置多个 URI 用于高可用/故障切换(会自动去除空格)。SeaTunnel 会将该值写入 Hive 的 `hive.metastore.uris`,并在运行时优先使用 Hive 的 `RetryingMetaStoreClient` 实现重试/切换。注意:该能力仅做客户端连接端点切换,元数据一致性需要由 metastore 部署保证。 ### hdfs_site_path [string] `hdfs-site.xml` 的路径,用于加载 Namenode 的高可用配置 ### hive.hadoop.conf [map] Hadoop 配置中的属性(`core-site.xml`、`hdfs-site.xml`、`hive-site.xml`) ### hive.hadoop.conf-path [string] 指定加载 `core-site.xml`、`hdfs-site.xml`、`hive-site.xml` 文件的路径 ### read_partitions [list] 用户希望从 Hive 表中读取的目标分区,如果用户未设置此参数,将读取 Hive 表中的所有数据。 **提示:分区列表中的每个分区应具有相同的目录层级。例如,一个 Hive 表有两个分区:`par1` 和 `par2`,如果用户设置如下:** **`read_partitions = [par1=xxx, par1=yyy/par2=zzz]`,这是不合法的** ### krb5_path [string] `krb5.conf` 的路径,用于 Kerberos 认证 ### kerberos_principal [string] Kerberos 认证的主体 ### kerberos_keytab_path [string] Kerberos 认证的 keytab 文件路径 ### read_columns [list] 
数据源的读取列列表,用户可以使用它来实现字段投影。 ### compress_codec [string] 文件的压缩编解码器,支持的详细信息如下所示: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: 自动识别压缩类型,无需额外设置。 ### 通用选项 源插件的通用参数,请参阅 [Source Common Options](../common-options/source-common-options.md) 了解详细信息。 ## 示例 ### 示例 1:单表 ```bash Hive { table_name = "default.seatunnel_orc" metastore_uri = "thrift://namenode001:9083" } ``` ### 示例 2:metastore_uri 故障切换(多 URI) ```bash Hive { table_name = "default.seatunnel_orc" metastore_uri = "thrift://metastore-1:9083,thrift://metastore-2:9083" } ``` ### 示例 3:多表 > 注意:Hive 是结构化数据源,应使用 `table_list`,`tables_configs` 将在未来移除。 > 也支持在每个表配置中设置 `use_regex = true` 来按正则匹配多表。 ```bash Hive { table_list = [ { table_name = "default.seatunnel_orc_1" metastore_uri = "thrift://namenode001:9083" }, { table_name = "default.seatunnel_orc_2" metastore_uri = "thrift://namenode001:9083" } ] } ``` ```bash Hive { tables_configs = [ { table_name = "default.seatunnel_orc_1" metastore_uri = "thrift://namenode001:9083" }, { table_name = "default.seatunnel_orc_2" metastore_uri = "thrift://namenode001:9083" } ] } ``` ### 示例 4:正则匹配多表(整库/整库子集) ```bash Hive { metastore_uri = "thrift://namenode001:9083" # 1) 整库同步:同步 `a` 库下的所有表 table_name = "a.\\.*" use_regex = true } ``` ```bash Hive { metastore_uri = "thrift://namenode001:9083" # 2) 整 Hive 同步:同步所有库下的所有表 table_name = "\\.*.\\.*" use_regex = true } ``` ```bash Hive { metastore_uri = "thrift://namenode001:9083" # 3) 整库子集:同步 `a` 库下,表名匹配 `tmp_.*` 的表 # 注意:`.*` 里的点号(`.`)必须写成 `\.`(HOCON 字符串里写 `\\.`),因为未转义的点号会被当作分隔符 table_name = "a.tmp_\\.*" use_regex = true } ``` ### 示例 5:Kerberos ```bash source { Hive { table_name = "default.test_hive_sink_on_hdfs_with_kerberos" metastore_uri = "thrift://metastore:9083" hive.hadoop.conf-path = "/tmp/hadoop" plugin_output = hive_source hive_site_path = "/tmp/hive-site.xml" kerberos_principal = "hive/metastore.seatunnel@EXAMPLE.COM" kerberos_keytab_path = "/tmp/hive.keytab" krb5_path = "/tmp/krb5.conf" } } ``` 描述: - 
`hive_site_path`:`hive-site.xml` 文件的路径。 - `kerberos_principal`:Kerberos 认证的主体。 - `kerberos_keytab_path`:Kerberos 认证的 keytab 文件路径。 - `krb5_path`:用于 Kerberos 认证的 `krb5.conf` 文件路径。 运行案例: ```bash env { parallelism = 1 job.mode = "BATCH" } source { Hive { table_name = "default.test_hive_sink_on_hdfs_with_kerberos" metastore_uri = "thrift://metastore:9083" hive.hadoop.conf-path = "/tmp/hadoop" plugin_output = hive_source hive_site_path = "/tmp/hive-site.xml" kerberos_principal = "hive/metastore.seatunnel@EXAMPLE.COM" kerberos_keytab_path = "/tmp/hive.keytab" krb5_path = "/tmp/krb5.conf" } } sink { Assert { plugin_input = hive_source rules { row_rules = [ { rule_type = MAX_ROW rule_value = 3 } ], field_rules = [ { field_name = pk_id field_type = bigint field_value = [ { rule_type = NOT_NULL } ] }, { field_name = name field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = score field_type = int field_value = [ { rule_type = NOT_NULL } ] } ] } } } ``` ## Hive on s3 ### 步骤 1 为 EMR 的 Hive 创建 lib 目录。 ```shell mkdir -p ${SEATUNNEL_HOME}/plugins/Hive/lib ``` ### 步骤 2 从 Maven 中心获取 jar 文件到 lib。 ```shell cd ${SEATUNNEL_HOME}/plugins/Hive/lib wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.6.5/hadoop-aws-2.6.5.jar wget https://repo1.maven.org/maven2/org/apache/hive/hive-exec/2.3.9/hive-exec-2.3.9.jar ``` ### 步骤 3 从您的 EMR 环境中复制 jar 文件到 lib 目录。 ```shell cp /usr/share/aws/emr/emrfs/lib/emrfs-hadoop-assembly-2.60.0.jar ${SEATUNNEL_HOME}/plugins/Hive/lib cp /usr/share/aws/emr/hadoop-state-pusher/lib/hadoop-common-3.3.6-amzn-1.jar ${SEATUNNEL_HOME}/plugins/Hive/lib cp /usr/share/aws/emr/hadoop-state-pusher/lib/javax.inject-1.jar ${SEATUNNEL_HOME}/plugins/Hive/lib cp /usr/share/aws/emr/hadoop-state-pusher/lib/aopalliance-1.0.jar ${SEATUNNEL_HOME}/plugins/Hive/lib ``` ### 步骤 4 运行案例。 ```shell env { parallelism = 1 job.mode = "BATCH" } source { Hive { table_name = "test_hive.test_hive_sink_on_s3" metastore_uri = 
"thrift://ip-192-168-0-202.cn-north-1.compute.internal:9083" hive.hadoop.conf-path = "/home/ec2-user/hadoop-conf" hive.hadoop.conf = { bucket="s3://ws-package" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" } read_columns = ["pk_id", "name", "score"] } } sink { Hive { table_name = "test_hive.test_hive_sink_on_s3_sink" metastore_uri = "thrift://ip-192-168-0-202.cn-north-1.compute.internal:9083" hive.hadoop.conf-path = "/home/ec2-user/hadoop-conf" hive.hadoop.conf = { bucket="s3://ws-package" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" } } } ``` ## Hive on oss ### 步骤 1 为 EMR 的 Hive 创建 lib 目录。 ```shell mkdir -p ${SEATUNNEL_HOME}/plugins/Hive/lib ``` ### 步骤 2 从 Maven 中心获取 jar 文件到 lib。 ```shell cd ${SEATUNNEL_HOME}/plugins/Hive/lib wget https://repo1.maven.org/maven2/org/apache/hive/hive-exec/2.3.9/hive-exec-2.3.9.jar ``` ### 步骤 3 从您的 EMR 环境中复制 jar 文件到 lib 目录并删除冲突的 jar。 ```shell cp -r /opt/apps/JINDOSDK/jindosdk-current/lib/jindo-*.jar ${SEATUNNEL_HOME}/plugins/Hive/lib rm -f ${SEATUNNEL_HOME}/lib/hadoop-aliyun-*.jar ``` ### 步骤 4 运行案例。 ```shell env { parallelism = 1 job.mode = "BATCH" } source { Hive { table_name = "test_hive.test_hive_sink_on_oss" metastore_uri = "thrift://master-1-1.c-1009b01725b501f2.cn-wulanchabu.emr.aliyuncs.com:9083" hive.hadoop.conf-path = "/tmp/hadoop" hive.hadoop.conf = { bucket="oss://emr-osshdfs.cn-wulanchabu.oss-dls.aliyuncs.com" } } } sink { Hive { table_name = "test_hive.test_hive_sink_on_oss_sink" metastore_uri = "thrift://master-1-1.c-1009b01725b501f2.cn-wulanchabu.emr.aliyuncs.com:9083" hive.hadoop.conf-path = "/tmp/hadoop" hive.hadoop.conf = { bucket="oss://emr-osshdfs.cn-wulanchabu.oss-dls.aliyuncs.com" } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/HiveJdbc.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # HiveJdbc > JDBC Hive 源连接器 ## 
支持Hive版本 - 确定支持3.1.3和3.1.2,其他版本需要测试。 ## 超时参数支持 `socket_timeout_ms` 和 `connect_timeout_ms` 参数已在 **Hive 3.2.0+** 版本上测试验证。对于更早的版本(包括 3.1.x),这些参数暂未验证。参数会被传递给 JDBC 驱动,但实际效果取决于使用的 Hive 版本。 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) > 支持查询SQL,可以实现投影效果。 ## 描述 通过JDBC读取外部数据源数据。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动 | 连接串 | Maven | |------|----------------------------------------------------------|---------------------------------|--------------------------------------|--------------------------------------------------------------------------| | Hive | 不同的依赖版本有不同的驱动程序类。 | org.apache.hive.jdbc.HiveDriver | jdbc:hive2://localhost:10000/default | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-jdbc) | ## 数据库相关性 > 请下载“Maven”对应的支持列表,并将其复制到"$SEATUNNEL_HOME/plugins/jdbc/lib/" > 工作目录
    > 例如,Hive数据源:cp Hive-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## 数据类型映射 | Hive 数据类型 | SeaTunnel 数据类型 | |-------------------------------------------------------------------------------------------|-------------------| | BOOLEAN | BOOLEAN | | TINYINT
    SMALLINT | SHORT | | INT
    INTEGER | INT | | BIGINT | LONG | | FLOAT | FLOAT | | DOUBLE
    DOUBLE PRECISION | DOUBLE | | DECIMAL(x,y)
    NUMERIC(x,y)
    (Get the designated column's specified column size.<38) | DECIMAL(x,y) | | DECIMAL(x,y)
    NUMERIC(x,y)
    (Get the designated column's specified column size.>38) | DECIMAL(38,18) | | CHAR
    VARCHAR
    STRING | STRING | | DATE | DATE | | DATETIME
    TIMESTAMP | TIMESTAMP | | BINARY
    ARRAY
    INTERVAL
    MAP
    STRUCT
    UNIONTYPE | Not supported yet | ## 源配置项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |------------------------------|------------|----|-----------------|-----------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC连接的URL。参考示例: jdbc:hive2://localhost:10000/default | | driver | String | 是 | - | 用于连接到远程数据源的jdbc类名,
    如果使用Hive,则值为 `org.apache.hive.jdbc.HiveDriver`. | | username | String | 否 | - | 连接实例用户名 | | password | String | 否 | - | 连接实例密码 | | query | String | 是 | - | 查询sql | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(秒) | | socket_timeout_ms | Int | 否 | 86400000 | 从服务器读取数据的 Socket 超时时间(毫秒)。设置为 0 表示无超时。注意:已在 Hive 3.2.0+ 测试,更早版本暂未验证。 | | connect_timeout_ms | Int | 否 | 86400000 | 建立到服务器的连接超时时间(毫秒)。设置为 0 表示无超时。注意:已在 Hive 3.2.0+ 测试,更早版本暂未验证。 | | partition_column | String | 否 | - | 并行分区的列名,只支持数值类型,只支持数字类型主键,只能配置一列。 | | partition_lower_bound | BigDecimal | 否 | - | 扫描的分区列最小值,如果未设置,SeaTunnel将查询数据库获取最小值。 | | partition_upper_bound | BigDecimal | 否 | - | 扫描的分区列最大值,如果没有设置,SeaTunnel将查询数据库获取最大值。 | | partition_num | Int | 否 | job parallelism | 分区数量,仅支持正整数。 默认值是作业并行数 | | fetch_size | Int | 否 | 0 | 对于返回大量对象的查询,您可以配置查询中使用的行提取大小,通过减少满足选择条件所需的数据库查询次数来提高性能。0表示使用jdbc默认值。 | | common-options | | 否 | - | 源插件常用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见 | | use_kerberos | Boolean | 否 | false | 是否启用Kerberos,默认值为false | | kerberos_principal | String | 否 | - | 使用kerberos时,我们应该设置kerberos主体,例如"test_user@xxx". 
| | kerberos_keytab_path | String | 否 | - | 使用kerberos时,我们应该设置kerberos主体文件路径,如“/home/test/test_user.keytab”。 | | krb5_path | String | 否 | /etc/krb5.conf | 使用kerberos时,我们应该设置krb5路径文件路径,如“/seatunnel/krb5.conf”,或使用默认路径“/etc/krb5.conf”。 | ### 提示 >如果未设置partition_column,它将以单并发运行,如果设置了partition_column,它将根据任务的并发性并行执行。当您的分片读取字段是bigint(及以上)等大数字类型并且数据分布不均匀时,建议将并行级别设置为1,以确保 数据倾斜问题已得到解决 ## 任务示例 ### 简单任务 >此示例以单并行方式查询测试数据库中表type_bin的16条数据,并查询其所有字段。您还可以指定要查询哪些字段以将最终输出到控制台。 ``` # 定义运行时环境 env { parallelism = 2 job.mode = "BATCH" } source{ Jdbc { url = "jdbc:hive2://localhost:10000/default" driver = "org.apache.hive.jdbc.HiveDriver" connection_check_timeout_sec = 100 query = "select * from type_bin limit 16" } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} } ``` ### 并行任务 > 与您配置的分片字段和分片数据并行读取查询表如果您想读取整个表,可以这样做 ``` source { Jdbc { url = "jdbc:hive2://localhost:10000/default" driver = "org.apache.hive.jdbc.HiveDriver" connection_check_timeout_sec = 100 # Define query logic as required query = "select * from type_bin" # Parallel sharding reads fields partition_column = "id" # Number of fragments partition_num = 10 } } ``` ### 并行度临界值 > 指定并行度的值在分区字段的值上下界之间,这样可以更高效的读取数据 ``` source { Jdbc { url = "jdbc:hive2://localhost:10000/default" driver = "org.apache.hive.jdbc.HiveDriver" connection_check_timeout_sec = 100 # Define query logic as required query = "select * from type_bin" partition_column = "id" # Read start boundary partition_lower_bound = 1 # Read end boundary partition_upper_bound = 500 partition_num = 10 } } ``` ## 修改日志 ================================================ FILE: docs/zh/connectors/source/Http.md ================================================ import ChangeLog from '../changelog/connector-http.md'; # Http > Http 源连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) ## 描述 用于从 Http 读取数据。 ## 支持的数据源信息 为了使用 Http 连接器,需要以下依赖项。 可以通过 install-plugin.sh 或从 Maven 中央仓库下载。 | 数据源 | 支持的版本 | 依赖 | |--------|------------|------| | Http | 通用 | [下载](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-http) | ## 源选项 | 名称 | 类型 | 是否必须 | 默认值 | 描述 | |-------------------------------|---------|----------|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | Http 请求 URL。 | | schema | Config | 否 | - | Http 和 seatunnel 数据结构映射。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | schema.fields | Config | 否 | - | 上游数据的 schema 字段 | | json_field | Config | 否 | - | 此参数帮助您配置 schema,因此此参数必须与 schema 一起使用。 | | pageing | Config | 否 | - | 此参数用于分页查询 | | pageing.page_field | String | 否 | - | 此参数用于指定请求中的页面字段名称。它可以在 headers、params 或 body 中使用占位符,如 ${page_field}。 | | pageing.use_placeholder_replacement | Boolean | 否 | false | 如果为 true,则使用占位符替换(${field})用于 headers、parameters 和 body 值,否则使用基于键的替换。 | | pageing.total_page_size | Int | 否 | - | 此参数用于控制总页数 | | pageing.batch_size | Int | 否 | - | 每个请求返回的批量大小,用于在总页数未知时确定是否继续 | | pageing.start_page_number | Int | 否 | 1 | 指定同步开始的页码 | | pageing.page_type | String | 否 | PageNumber | 此参数用于指定页面类型,如果未设置则为 PageNumber,仅支持 `PageNumber` 和 `Cursor`。 | | pageing.cursor_field | String | 否 | - | 此参数用于指定请求参数中的游标字段名称。 | | pageing.cursor_response_field | String | 否 | - | 此参数指定从中检索游标的响应字段。 | | content_field | String | 否 | - | 此参数可以获取一些 json 数据。如果您只需要 
'book' 部分的数据,配置 `content_field = "$.store.book.*"`。 | | format | String | 否 | text | 上游数据的格式,目前仅支持 `json` `text`,默认为 `text`。 | | method | String | 否 | get | Http 请求方法,仅支持 GET、POST 方法。 | | headers | Map | 否 | - | Http 头信息。 | | params | Map | 否 | - | Http 参数。 | | body | String | 否 | - | Http 请求体,程序将自动添加 http header application/json,body 是 jsonbody。 | | poll_interval_millis | Int | 否 | - | 流模式下请求 http api 的间隔(毫秒)。 | | retry | Int | 否 | - | 如果请求 http 返回 `IOException` 的最大重试次数。 | | retry_backoff_multiplier_ms | Int | 否 | 100 | 请求 http 失败时的重试退避时间(毫秒)乘数。 | | retry_backoff_max_ms | Int | 否 | 10000 | 请求 http 失败时的最大重试退避时间(毫秒) | | enable_multi_lines | Boolean | 否 | false | | | connect_timeout_ms | Int | 否 | 12000 | 连接超时设置,默认 12 秒。 | | socket_timeout_ms | Int | 否 | 60000 | Socket 超时设置,默认 60 秒。 | | common-options | | 否 | - | 源插件通用参数,请参考 [Source Common Options](../common-options/source-common-options.md) 获取详细信息 | | keep_params_as_form | Boolean | 否 | false | 是否按照表单提交参数,用于兼容旧行为。当为 true 时,params 参数的值通过表单提交。 | | keep_page_param_as_http_param | Boolean | 否 | false | 是否将分页参数设置为 params。用于兼容旧行为。 | | json_filed_missed_return_null | Boolean | 否 | false | 当 JSON 字段缺失时,设置为 true 并返回 null,否则返回错误。| ## 如何创建 Http 数据同步作业 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Http { plugin_output = "http" url = "http://mockserver:1080/example/http" method = "GET" format = "json" schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { C_MAP = "map" C_ARRAY = "array" C_STRING = string C_BOOLEAN = boolean C_TINYINT = tinyint C_SMALLINT = smallint C_INT = int C_BIGINT = bigint C_FLOAT = float C_DOUBLE = double C_BYTES = bytes C_DATE = date C_DECIMAL = "decimal(38, 18)" C_TIMESTAMP = timestamp } } } } } # 控制台打印读取的 Http 数据 sink { Console { parallelism = 1 } } ``` ## 
参数解释 ### format 当您指定 format 为 `json` 时,您还应该指定 schema 选项,例如: 上游数据如下: ```json { "code": 200, "data": "get success", "success": true } ``` 您应该指定 schema 如下: ```hocon schema { fields { code = int data = string success = boolean } } ``` 连接器将生成如下数据: | code | data | success | |------|-------------|---------| | 200 | get success | true | 当您指定 format 为 `text` 时,连接器不会对上游数据做任何处理,例如: 上游数据如下: ```json { "code": 200, "data": "get success", "success": true } ``` 连接器将生成如下数据: | content | |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | ### keep_params_as_form 为了兼容旧版本的 http。 当设置为 true 时,`params` 和 `pageing` 将以表单形式提交。 当设置为 false 时,`params` 将添加到 url 路径中,而 `pageing` 不会添加到 body 或表单中。它将替换 params 和 body 中的占位符。 ### keep_page_param_as_http_param 是否将分页参数设置为 params。 当设置为 true 时,`pageing` 设置为 `params`。 当设置为 false 时,当页面字段存在于 `body` 或 `params` 中时,替换值。 当设置为 false 时,配置示例: ```hocon body="""{"id":1,"page":"${page}"}""" ``` ```hocon params={ page: "${page}" } ``` ### params 默认情况下,参数将添加到 url 路径中。 如果您需要保持旧版本行为,请检查 keep_params_as_form。 ### body HTTP body 用于在请求或响应中携带实际数据,包括 JSON、表单提交。 参考格式如下: ```hocon body="{"id":1,"name":"seatunnel"}" ``` 对于表单提交,请按如下设置 content-type。 ```hocon headers { Content-Type = "application/x-www-form-urlencoded" } ``` ### content_field 此参数可以获取一些 json 数据。如果您只需要 'book' 部分的数据,配置 `content_field = "$.store.book.*"`。 如果您的返回数据看起来像这样。 ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` 您可以配置 `content_field = "$.store.book.*"` 并且返回的结果看起来像这样: ```json [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ] ``` 然后您可以使用更简单的 schema 获取所需的结果,如 ```hocon Http { url = 
"http://mockserver:1080/contentjson/mock" method = "GET" format = "json" content_field = "$.store.book.*" schema = { fields { category = string author = string title = string price = string } } } ``` 这里是一个示例: - 测试数据可以在此链接找到 [mockserver-config.json](seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - 任务配置请参考此链接 [http_contentjson_to_assert.conf](seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf)。 ### json_field 此参数帮助您配置 schema,因此此参数必须与 schema 一起使用。 如果您的数据看起来像这样: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` 您可以通过如下配置任务来获取 'book' 的内容: ```hocon source { Http { url = "http://mockserver:1080/jsonpath/mock" method = "GET" format = "json" json_field = { category = "$.store.book[*].category" author = "$.store.book[*].author" title = "$.store.book[*].title" price = "$.store.book[*].price" } schema = { fields { category = string author = string title = string price = string } } } } ``` - 测试数据可以在此链接找到 [mockserver-config.json](seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - 任务配置请参考此链接 [http_jsonpath_to_assert.conf](seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf)。 ### pageing 当前支持的分页类型是 `PageNumber` 和 `Cursor`。 如果您需要使用分页,您需要配置 `pageing`。默认分页类型是 `PageNumber`。 #### 1. 
PageNumber 使用 `PageNumber` 分页时,您可以在 HTTP 请求的不同部分包含页面参数: - **在 URL 参数中**:将页面参数添加到 `params` 部分 - **在请求体中**:在 `body` JSON 中包含页面参数 - **在头信息中**:将页面参数添加到 `headers` 部分 您可以使用占位符如 `${page}` 与 `use_placeholder_replacement = true` 来动态更新这些值。占位符可以以各种格式使用: - 作为独立值:`"${page}"` - 带前缀/后缀:`"10${page}"` 或 `"page-${page}"` - 作为不带引号的数字:`${page}`(在 JSON 体中) - 在嵌套 JSON 结构中:`{"pagination":{"page":${page}}}` ##### 示例 1:在 body 和 params 中使用页面参数 ```hocon source { Http { url = "http://localhost:8080/mock/queryData" method = "POST" format = "json" body="""{"id":1,"page":"${page}"}""" content_field = "$.data.*" params={ page: "${page}" } pageing={ #你可以不设置此参数,默认值是 PageNumber page_type="PageNumber" total_page_size=20 page_field=page use_placeholder_replacement=true #当不知道 total_page_size 时使用 batch_size,如果读取大小 ================================================ FILE: docs/zh/connectors/source/Iceberg.md ================================================ import ChangeLog from '../changelog/connector-iceberg.md'; # Apache Iceberg > Apache Iceberg 源连接器 ## 支持 Iceberg 版本 - 1.6.1 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [x] [流](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) - [x] 数据格式 - [x] parquet - [x] orc - [x] avro - [x] iceberg 目录 - [x] hadoop(2.7.1 , 2.7.5 , 3.1.3) - [x] hive(2.3.9 , 3.1.2) ## 描述 Apache Iceberg 的源连接器。它可以支持批处理和流模式。 ## 支持的数据源信息 | 数据源 | 依赖 | Maven | |--------|------|---------------------------------------------------------------------------| | Iceberg | hive-exec | [下载](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) | | Iceberg | libfb303 | [下载](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) | ## 数据库依赖 > 为了与不同版本的 Hadoop 和 Hive 兼容,项目 pom 文件中 hive-exec 的范围是 provided,所以如果您使用 Flink 引擎,首先您可能需要将以下 Jar 包添加到 /lib 目录,如果您使用 Spark 引擎并与 Hadoop 集成,则不需要添加以下 Jar 包。如果您使用 hadoop s3 目录,您需要为您的 Flink 和 Spark 引擎版本添加 hadoop-aws、aws-java-sdk jars。(其他位置:/lib、/jars) ``` hive-exec-xxx.jar libfb303-xxx.jar ``` > hive-exec 包的某些版本没有 libfb303-xxx.jar,所以您还需要手动导入 Jar 包。 ## 数据类型映射 | Iceberg 数据类型 | SeaTunnel 数据类型 | |-------------------|---------------------| | BOOLEAN | BOOLEAN | | INTEGER | INT | | LONG | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | DATE | DATE | | TIME | TIME | | TIMESTAMP | TIMESTAMP | | STRING | STRING | | FIXED
    BINARY | BYTES | | DECIMAL | DECIMAL | | STRUCT | ROW | | LIST | ARRAY | | MAP | MAP | ## 源选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |--------------------------|---------|------|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | catalog_name | string | 是 | - | 用户指定的目录名称。 | | namespace | string | 是 | - | 后端目录中的 iceberg 数据库名称。 | | table | string | 否 | - | 后端目录中的 iceberg 表名称。 | | table_list | string | 否 | - | 后端目录中的 iceberg 表列表。 | | iceberg.catalog.config | map | 是 | - | 指定初始化 Iceberg 目录的属性,可以在此文件中引用:[CatalogProperties.java](https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/CatalogProperties.java) | | hadoop.config | map | 否 | - | 传递给 Hadoop 配置的属性 | | iceberg.hadoop-conf-path | string | 否 | - | 为 'core-site.xml'、'hdfs-site.xml'、'hive-site.xml' 文件指定的加载路径。 | | schema | config | 否 | - | 使用投影来选择数据列和列顺序。 | | case_sensitive | boolean | 否 | false | 如果通过 schema [config] 选择了数据列,控制是否将与 schema 的匹配进行区分大小写。 | | start_snapshot_timestamp | long | 否 | - | 指示此扫描从表的最新快照开始查找更改,从给定的时间戳开始。
    timestamp – 自 Unix 纪元以来的时间戳(毫秒) | | start_snapshot_id | long | 否 | - | 指示此扫描从特定快照(不包含)开始查找更改。 | | end_snapshot_id | long | 否 | - | 指示此扫描查找更改直到特定快照(包含)。 | | use_snapshot_id | long | 否 | - | 指示此扫描使用给定的快照 ID。 | | use_snapshot_timestamp | long | 否 | - | 指示此扫描使用给定时间(毫秒)的最新快照。timestamp – 自 Unix 纪元以来的时间戳(毫秒) | | stream_scan_strategy | enum | 否 | FROM_LATEST_SNAPSHOT | 流模式执行的启动策略,如果不指定任何值,默认使用 `FROM_LATEST_SNAPSHOT`,可选值为:
    TABLE_SCAN_THEN_INCREMENTAL:执行常规表扫描,然后切换到增量模式。
    FROM_LATEST_SNAPSHOT:从最新快照(包含)开始增量模式。
    FROM_EARLIEST_SNAPSHOT:从最早快照(包含)开始增量模式。
    FROM_SNAPSHOT_ID:从具有特定 id(包含)的快照开始增量模式。
    FROM_SNAPSHOT_TIMESTAMP:从具有特定时间戳(包含)的快照开始增量模式。 | | increment.scan-interval | long | 否 | 2000 | 增量扫描的间隔(毫秒) | | common-options | | 否 | - | 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 | | query | String | 否 | - | 用于选择 iceberg 数据的 select DML。它不能包含表名,也不支持别名。例如:`select * from table where f1 > 100`、`select fn from table where f1 > 100`。当前对 LIKE 语法的支持是有限的:LIKE 子句不应以 `%` 开头。支持的是:`select f1 from t where f2 like 'tom%' ` | ## 任务示例 ### 简单 ```hocon env { parallelism = 2 job.mode = "BATCH" } source { Iceberg { catalog_name = "seatunnel" iceberg.catalog.config={ type = "hadoop" warehouse = "file:///tmp/seatunnel/iceberg/hadoop/" } namespace = "database1" table = "source" query = "select fn from table where f1 > 100" plugin_output = "iceberg" } } transform { } sink { Console { plugin_input = "iceberg" } } ``` ### 多表读取 ```hocon source { Iceberg { catalog_name = "seatunnel" iceberg.catalog.config = { type = "hadoop" warehouse = "file:///tmp/seatunnel/iceberg/hadoop/" } namespace = "database1" table_list = [ { table = "table_1" }, { table = "table_2" query = "select fn from table where f1 > 100" } ] plugin_output = "iceberg" } } ``` ### Hadoop S3 目录 ```hocon source { iceberg { catalog_name = "seatunnel" iceberg.catalog.config={ "type"="hadoop" "warehouse"="s3a://your_bucket/spark/warehouse/" } hadoop.config={ "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" "fs.s3a.endpoint" = "s3.cn-north-1.amazonaws.com.cn" "fs.s3a.access.key" = "xxxxxxxxxxxxxxxxx" "fs.s3a.secret.key" = "xxxxxxxxxxxxxxxxx" "fs.defaultFS" = "s3a://your_bucket" } namespace = "your_iceberg_database" table = "your_iceberg_table" plugin_output = "iceberg_test" } } ``` ### Hive 目录 ```hocon source { Iceberg { catalog_name = "seatunnel" iceberg.catalog.config={ type = "hive" uri = "thrift://localhost:9083" warehouse = "hdfs://your_cluster//tmp/seatunnel/iceberg/" } catalog_type = "hive" namespace = "your_iceberg_database" table = "your_iceberg_table" } 
} ``` ### 列投影 ```hocon source { Iceberg { catalog_name = "seatunnel" iceberg.catalog.config={ type = "hadoop" warehouse = "hdfs://your_cluster/tmp/seatunnel/iceberg/" } namespace = "your_iceberg_database" table = "your_iceberg_table" schema { fields { f2 = "boolean" f1 = "bigint" f3 = "int" f4 = "bigint" } } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/InfluxDB.md ================================================ import ChangeLog from '../changelog/connector-influxdb.md'; # InfluxDB > InfluxDB 源连接器 ## 描述 通过 InfluxDB 读取外部数据源数据。 ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) 支持查询 SQL 并可以实现投影效果。 - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义 split](../../introduction/concepts/connector-v2-features.md) ## 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |--------------------|--------|----|-------|-------------------------------------------------------------------------------| | url | string | 是 | - | InfluxDB 连接 URL | | sql | string | 是 | - | 用于搜索数据的查询 SQL | | schema | config | 是 | - | 上游数据的模式信息。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | database | string | 是 | - | InfluxDB 数据库 | | username | string | 否 | - | InfluxDB 用户名 | | password | string | 否 | - | InfluxDB 密码 | | lower_bound | long | 否 | - | split_column 的下界 | | upper_bound | long | 否 | - | split_column 的上界 | | partition_num | int | 否 | - | 分区数量 | | split_column | string | 否 | - | 分割列 | | epoch | string | 否 | n | 返回的时间精度 | | connect_timeout_ms | long | 否 | 15000 | 连接 InfluxDB 的超时时间(毫秒) | | query_timeout_sec | int | 否 | 3 | 查询超时时间(秒) | | common-options | config | 否 | - | 源插件通用参数 | ### url 连接到 InfluxDB 的 URL,例如: ``` http://influxdb-host:8086 ``` ### sql [string] 用于搜索数据的查询 SQL ``` select name,age from 
test ``` ### schema [config] #### fields [Config] 上游数据的模式信息,例如: ``` schema { fields { name = string age = int } } ``` ### database [string] InfluxDB 数据库 ### username [string] InfluxDB 用户名 ### password [string] InfluxDB 密码 ### split_column [string] InfluxDB 的分割列 > 提示: > - InfluxDB tags 不支持作为分割主键,因为 tags 的类型只能是字符串 > - InfluxDB time 不支持作为分割主键,因为 time 字段无法参与数学计算 > - 目前,`split_column` 仅支持整数数据分割,不支持 `float`、`string`、`date` 等类型。 ### upper_bound [long] `split_column` 列的上界 ### lower_bound [long] `split_column` 列的下界 ``` 将 $split_column 范围分成 $partition_num 部分 如果 partition_num 为 1,使用整个 `split_column` 范围 如果 partition_num < (upper_bound - lower_bound),使用 (upper_bound - lower_bound) 个分区 例如:lower_bound = 1, upper_bound = 10, partition_num = 2 sql = "select * from test where age > 0 and age < 10" 分割结果 分割 1: select * from test where ($split_column >= 1 and $split_column < 6) and ( age > 0 and age < 10 ) 分割 2: select * from test where ($split_column >= 6 and $split_column < 11) and ( age > 0 and age < 10 ) ``` ### partition_num [int] InfluxDB 的分区数量 > 提示:确保 `upper_bound` 减去 `lower_bound` 能被 `partition_num` 整除,否则查询结果会重叠 ### epoch [string] 返回的时间精度 - 可选值:H, m, s, MS, u, n - 默认值:n ### query_timeout_sec [int] InfluxDB 的查询超时时间(秒) ### connect_timeout_ms [long] 连接到 InfluxDB 的超时时间(毫秒) ### 通用选项 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 ## 示例 多并行性和多分区扫描示例 ```hocon source { InfluxDB { url = "http://influxdb-host:8086" sql = "select label, value, rt, time from test" database = "test" upper_bound = 100 lower_bound = 1 partition_num = 4 split_column = "value" schema { fields { label = STRING value = INT rt = STRING time = BIGINT } } } } ``` 不使用分区扫描的示例 ```hocon source { InfluxDB { url = "http://influxdb-host:8086" sql = "select label, value, rt, time from test" database = "test" schema { fields { label = STRING value = INT rt = STRING time = BIGINT } } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/IoTDB.md 
================================================ import ChangeLog from '../changelog/connector-iotdb.md'; # IoTDB > IoTDB 数据读取器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 用于从 IoTDB 中读取数据。 ## 主要特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) > IoTDB 通过 SQL 查询支持列投影功能。 - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) ## 支持的数据源信息 | 数据源 | 支持的版本 | 地址 | |-------|------------------------------|----------------| | IoTDB | `0.13.0 <= version <= 1.3.X` | localhost:6667 | ## 数据类型映射 | IoTDB 数据类型 | SeaTunnel 数据类型 | |------------|----------------| | BOOLEAN | BOOLEAN | | INT32 | TINYINT | | INT32 | SMALLINT | | INT32 | INT | | INT64 | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | TEXT | STRING | | STRING | STRING | | TIMESTAMP | BIGINT | | TIMESTAMP | TIMESTAMP | | BLOB | STRING | | DATE | DATE | ## Source 选项 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |----------------------------|---------|------|-----|----------------------------------------------------------------------------------| | node_urls | string | 是 | - | IoTDB 集群地址,格式为 `"host1:port"` 或 `"host1:port,host2:port"` | | username | string | 是 | - | IoTDB 用户名 | | password | string | 是 | - | IoTDB 用户密码 | | sql | string | 是 | - | 要执行的 SQL 查询语句 | | schema | config | 是 | - | 数据模式定义。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | fetch_size | int | 否 | - | 单次获取数据量:查询时每次从 IoTDB 获取的数据量 | | lower_bound | long | 否 | - | 时间范围下界(通过时间列进行数据分片时使用) | | upper_bound | long | 否 | - | 时间范围上界(通过时间列进行数据分片时使用) | | num_partitions | int | 否 | - | 分区数量(通过时间列进行数据分片时使用):
    - 1 个分区:使用完整时间范围
    - 若分区数 < (上界 - 下界),则使用差值作为实际分区数 | | thrift_default_buffer_size | int | 否 | - | Thrift 协议缓冲区大小 | | thrift_max_frame_size | int | 否 | - | Thrift 最大帧尺寸 | | enable_cache_leader | boolean | 否 | - | 是否启用 Leader 节点缓存 | | version | string | 否 | - | 客户端 SQL 语义版本(`V_0_12` / `V_0_13`) | | common-options | | 否 | - | Source 插件常用参数,详见 [Source 通用选项](../common-options/source-common-options.md) | 我们可以使用时间列进行分区查询。 ### num_partitions [int] 分区数量 ### upper_bound [long] 时间范围上界 ### lower_bound [long] 时间范围下界 ``` 将时间范围分割成 numPartitions 个分区 若 numPartitions = 1,使用完整的时间范围 若 numPartitions < (upper_bound - lower_bound),使用 (upper_bound - lower_bound) 个分区 例:lower_bound = 1, upper_bound = 10, numPartitions = 2 sql = "select * from test where age > 0 and age < 10" 分区结果: split 1: select * from test where (time >= 1 and time < 6) and ( age > 0 and age < 10 ) split 2: select * from test where (time >= 6 and time < 11) and ( age > 0 and age < 10 ) ``` ## 示例 ```hocon env { parallelism = 2 job.mode = "BATCH" } source { IoTDB { node_urls = "localhost:6667" username = "root" password = "root" sql = "SELECT temperature, moisture, c_int, c_bigint, c_float, c_double, c_string, c_boolean FROM root.test_group.* WHERE time < 4102329600000 align by device" schema { fields { ts = timestamp device_name = string temperature = float moisture = bigint c_int = int c_bigint = bigint c_float = float c_double = double c_string = string c_boolean = boolean } } } } sink { Console { } } ``` 上游 IoTDB 的数据格式如下所示: ```shell IoTDB> SELECT temperature, moisture, c_int, c_bigint, c_float, c_double, c_string, c_boolean FROM root.test_group.* WHERE time < 4102329600000 align by device; +------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+ | Time| Device| temperature| moisture| c_int| c_bigint| c_float| c_double| c_string| c_boolean|
+------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+ |2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1| 21474836470| 1.0f| 1.0d| abc| true| |2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 2| 21474836470| 2.0f| 2.0d| abc| true| |2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 3| 21474836470| 3.0f| 3.0d| abc| true| +------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+ ``` 读取到 SeaTunnelRow 的数据格式如下所示: | ts | device_name | temperature | moisture | c_int | c_bigint | c_float | c_double | c_string | c_boolean | |---------------|--------------------------|-------------|----------|-------|-------------|---------|----------|----------|-----------| | 1664035200001 | root.test_group.device_a | 36.1 | 100 | 1 | 21474836470 | 1.0f | 1.0d | abc | true | | 1664035200001 | root.test_group.device_b | 36.2 | 101 | 2 | 21474836470 | 2.0f | 2.0d | abc | true | | 1664035200001 | root.test_group.device_c | 36.3 | 102 | 3 | 21474836470 | 3.0f | 3.0d | abc | true | ## 变更日志 ================================================ FILE: docs/zh/connectors/source/IoTDBv2.md ================================================ import ChangeLog from '../changelog/connector-iotdb.md'; # IoTDB > IoTDB 数据读取器 ## 支持引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 描述 用于从 IoTDB 中读取数据。 ## 主要特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) > IoTDB 通过 SQL 查询支持列投影功能。 - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) ## 支持的数据源信息 | 数据源 | 支持的版本 | 地址 | |-------|------------------|----------------| | IoTDB | `2.0 <= version` | localhost:6667 | ## 数据类型映射 | IoTDB 数据类型 | SeaTunnel 数据类型 | |------------|----------------| | BOOLEAN | BOOLEAN | | INT32 | TINYINT | | INT32 | SMALLINT | | INT32 | INT | | INT64 | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | TEXT | STRING | | STRING | STRING | | TIMESTAMP | BIGINT | | TIMESTAMP | TIMESTAMP | | BLOB | STRING | | DATE | DATE | ## Source 选项 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |----------------------------|---------|------|------|----------------------------------------------------------------------------------| | node_urls | Array | 是 | - | IoTDB 集群地址,格式为 `["host1:port"]` 或 `["host1:port","host2:port"]` | | username | String | 是 | - | IoTDB 用户名 | | password | String | 是 | - | IoTDB 用户密码 | | sql_dialect | String | 否 | tree | IoTDB 模型,tree:树模型;table:表模型 | | database | String | 否 | - | 要查询的数据库名,只在表模型中生效 | | sql | String | 是 | - | 要执行的 SQL 查询语句 | | schema | Config | 是 | - | 数据模式定义。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | fetch_size | Integer | 否 | - | 单次获取数据量:查询时每次从 IoTDB 获取的数据量 | | lower_bound | Long | 否 | - | 时间范围下界(通过时间列进行数据分片时使用) | | upper_bound | Long | 否 | - | 时间范围上界(通过时间列进行数据分片时使用) | | num_partitions | Integer | 否 | - | 分区数量(通过时间列进行数据分片时使用):
    - 1 个分区:使用完整时间范围
    - 若分区数 < (上界 - 下界),则使用差值作为实际分区数 | | default_thrift_buffer_size | Integer | 否 | - | Thrift 协议缓冲区大小 | | max_thrift_frame_size | Integer | 否 | - | Thrift 最大帧尺寸 | | enable_cache_leader | Boolean | 否 | - | 是否启用 Leader 节点缓存 | | common-options | | 否 | - | Source 插件常用参数,详见 [Source 通用选项](../common-options/source-common-options.md) | 我们可以使用时间列进行分区查询。 ### num_partitions [int] 分区数量 ### upper_bound [long] 时间范围上界 ### lower_bound [long] 时间范围下界 ``` 将时间范围分割成 numPartitions 个分区 若 numPartitions = 1,使用完整的时间范围 若 numPartitions < (upper_bound - lower_bound),使用 (upper_bound - lower_bound) 个分区 例:lower_bound = 1, upper_bound = 10, numPartitions = 2 sql = "select * from test where age > 0 and age < 10" 分区结果: split 1: select * from test where (time >= 1 and time < 6) and ( age > 0 and age < 10 ) split 2: select * from test where (time >= 6 and time < 11) and ( age > 0 and age < 10 ) ``` ## 示例 ### 示例 1:读取 IoTDB 树模型数据 ```hocon env { parallelism = 2 job.mode = "BATCH" } source { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" sql = "SELECT temperature, moisture, c_int, c_bigint, c_float, c_double, c_string, c_boolean FROM root.test_group.* WHERE time < 4102329600000 align by device" schema { fields { ts = timestamp device_name = string temperature = float moisture = bigint c_int = int c_bigint = bigint c_float = float c_double = double c_string = string c_boolean = boolean } } } } sink { Console { } } ``` 上游 IoTDB 的数据格式如下所示: ```shell IoTDB> SELECT temperature, moisture, c_int, c_bigint, c_float, c_double, c_string, c_boolean FROM root.test_group.* WHERE time < 4102329600000 align by device; +------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+ | Time| Device| temperature| moisture| c_int| c_bigint| c_float| c_double| c_string| c_boolean| +------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+
|2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1| 21474836470| 1.0f| 1.0d| abc| true| |2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 2| 21474836470| 2.0f| 2.0d| abc| true| |2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 3| 21474836470| 3.0f| 3.0d| abc| true| +------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+ ``` 读取到 SeaTunnelRow 的数据格式如下所示: | ts | device_name | temperature | moisture | c_int | c_bigint | c_float | c_double | c_string | c_boolean | |---------------|--------------------------|-------------|----------|-------|-------------|---------|----------|----------|-----------| | 1664035200001 | root.test_group.device_a | 36.1 | 100 | 1 | 21474836470 | 1.0f | 1.0d | abc | true | | 1664035200001 | root.test_group.device_b | 36.2 | 101 | 2 | 21474836470 | 2.0f | 2.0d | abc | true | | 1664035200001 | root.test_group.device_c | 36.3 | 102 | 3 | 21474836470 | 3.0f | 3.0d | abc | true | ### 示例 2:读取 IoTDB 表模型数据 ```hocon env { parallelism = 2 job.mode = "BATCH" } source { IoTDB { node_urls = ["localhost:6667"] username = "root" password = "root" sql_dialect = "table" database = "test_database" sql = "SELECT time, sn, type, bidprice, bidsize, domain, buyno, askprice FROM test_table" schema { fields { ts = timestamp sn = string type = string bidprice = int bidsize = double domain = boolean buyno = bigint askprice = string } } } } sink { Console { } } ``` > 若查询语句中明确了数据库,则无需使用 `database` 参数 上游 IoTDB 的数据格式如下所示: ```shell IoTDB> SELECT time, sn, type, bidprice, bidsize, domain, buyno, askprice FROM test_table +-----------------------------+------+----+--------+------------------+------+-----+-----------+ | time| sn|type|bidprice| bidsize|domain|buyno| askprice| +-----------------------------+------+----+--------+------------------+------+-----+-----------+ |2025-07-30T17:52:34.851+08:00|0700HK| L1| 9|10.323907796459721| true| 
10|-1064754527| |2025-07-30T17:52:34.951+08:00|0700HK| L1| 10| 9.844574317657585| false| 9|-1088662576| |2025-07-30T17:52:35.051+08:00|0700HK| L1| 9| 9.272974132434069| true| 9| 402003616| +-----------------------------+------+----+--------+------------------+------+-----+-----------+ ``` 读取到 SeaTunnelRow 的数据格式如下所示: | ts | sn | type | bidprice | bidsize | domain | buyno | askprice | |-------------------------|--------|------|----------|--------------------|--------|-------|-------------| | 2025-07-30T17:52:34.851 | 0700HK | L1 | 9 | 10.323907796459721 | true | 10 | -1064754527 | | 2025-07-30T17:52:34.951 | 0700HK | L1 | 10 | 9.844574317657585 | false | 9 | -1088662576 | | 2025-07-30T17:52:35.051 | 0700HK | L1 | 9 | 9.272974132434069 | true | 9 | 402003616 | ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Jdbc.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # JDBC > JDBC 源连接器 ## 描述 通过 JDBC 读取外部数据源数据。 :::tip 警告:为了符合许可证要求,您必须自己提供数据库驱动程序,复制到 `$SEATUNNEL_HOME/lib/` 目录以使其工作。 例如,如果您使用 MySQL,应下载并复制 `mysql-connector-java-xxx.jar` 到 `$SEATUNNEL_HOME/lib/`。对于 Spark/Flink,您还应将其复制到 `$SPARK_HOME/jars/` 或 `$FLINK_HOME/lib/`。 ::: ## 使用依赖 ### 对于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 ### 对于 SeaTunnel Zeta 引擎 > 1. 
您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) 支持查询 SQL 并可以实现投影效果。 - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) - [x] [支持多表读取](../../introduction/concepts/connector-v2-features.md) ## 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |--------------------------------------------|---------|------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC 连接的 URL。参考示例:jdbc:postgresql://localhost/test | | driver | String | 是 | - | 用于连接到远程数据源的 jdbc 类名,如果您使用 MySQL,值为 `com.mysql.cj.jdbc.Driver`。 | | username | String | 否 | - | 用户名 | | password | String | 否 | - | 密码 | | query | String | 否 | - | 查询语句 | | compatible_mode | String | 否 | - | 数据库的兼容模式,当数据库支持多种兼容模式时需要。
    例如,使用 OceanBase 数据库时,需要将其设置为 'mysql' 或 'oracle'。
    使用 starrocks 时,需要将其设置为 `starrocks` | | dialect | String | 否 | - | 指定的方言,如果不存在,仍然根据 url 获取,优先级高于 url。
    例如,使用 starrocks 时,需要将其设置为 `starrocks` | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(秒)。 | | partition_column | String | 否 | - | 用于分割数据的列名。 | | partition_upper_bound | Long | 否 | - | partition_column 的最大值用于扫描,如果未设置,SeaTunnel 将查询数据库获取最大值。 | | partition_lower_bound | Long | 否 | - | partition_column 的最小值用于扫描,如果未设置,SeaTunnel 将查询数据库获取最小值。 | | partition_num | Int | 否 | job parallelism | 不建议使用,正确的方法是通过 `split.size` 控制分割数量
    **注意:** 此参数仅在使用 `query` 参数时生效。使用 `table_path` 参数时不生效。 | | decimal_type_narrowing | Boolean | 否 | true | 十进制类型缩小,如果为 true,十进制类型将缩小为 int 或 long 类型(如果没有精度损失)。目前仅支持 Oracle。请参考下面的 `decimal_type_narrowing` | | int_type_narrowing | Boolean | 否 | true | Int 类型缩小,如果为 true,tinyint(1) 类型将缩小为布尔类型(如果没有精度损失)。目前支持 MySQL。请参考下面的 `int_type_narrowing` | | handle_blob_as_string | Boolean | 否 | false | 如果为 true,BLOB 类型将转换为 STRING 类型。**仅支持 Oracle 数据库**。这对于处理超过默认大小限制的 Oracle 中的大 BLOB 字段很有用。将 Oracle 的 BLOB 字段传输到 Doris 等系统时,将其设置为 true 可以使数据传输更高效。 | | use_select_count | Boolean | 否 | false | 在动态块分割阶段使用 select count 来获取表计数,而不是其他方法。这目前仅适用于 jdbc-oracle。在这种情况下,当使用 sql 从分析表更新统计信息更快时,直接使用 select count | | skip_analyze | Boolean | 否 | false | 在动态块分割阶段跳过表计数分析。这目前仅适用于 jdbc-oracle。在这种情况下,您定期安排分析表 sql 来更新相关表统计信息,或您的表数据不经常更改 | | use_regex | Boolean | 否 | false | 控制 table_path 的正则表达式匹配。设置为 `true` 时,table_path 将被视为正则表达式模式。设置为 `false` 或未指定时,table_path 将被视为精确路径(无正则表达式匹配)。 | | fetch_size | Int | 否 | 0 | 对于返回大量对象的查询,您可以配置查询中使用的行提取大小,以通过减少满足选择条件所需的数据库命中次数来提高性能。零表示使用 jdbc 默认值。 | | properties | Map | 否 | - | 其他连接配置参数,当 properties 和 URL 具有相同参数时,优先级由
    驱动程序的具体实现确定。例如,在 MySQL 中,properties 优先于 URL。 | | table_path | String | 否 | - | 表的完整路径,您可以使用此配置代替 `query`。
    示例:
    `- mysql: "testdb.table1" `
    `- oracle: "test_schema.table1" `
    `- sqlserver: "testdb.test_schema.table1"`
    `- postgresql: "testdb.test_schema.table1"`
    `- iris: "test_schema.table1"` | | table_list | Array | 否 | - | 要读取的表列表,您可以使用此配置代替 `table_path` | | where_condition | String | 否 | - | 所有表/查询的通用行过滤条件,必须以 `where` 开头。例如 `where id > 100` | | split.size | Int | 否 | 8096 | 一个分割中有多少行,捕获的表在读取时被分成多个分割。**注意**:此参数仅在使用 `table_path` 参数时生效。使用 `query` 参数时不生效。 | | common-options | | 否 | - | 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 | ### 表匹配 JDBC 源连接器支持两种方式指定表: #### 注意事项 - 许多 JDBC 驱动会将 `DatabaseMetaData.getColumns(..., schemaPattern, tableNamePattern, ...)` 视为 SQL LIKE 的模式匹配。 当 schema/table 名称中包含 `_` 或 `%` 时,列发现可能会返回其他表的列。SeaTunnel 会按精确的 schema/table 标识符对返回结果做二次过滤, 以避免混入其他表的列。 - 对于大小写敏感的数据库,请确保配置的 schema/table 名称与数据库中实际标识符大小写一致。 1. **精确表路径**:使用 `table_path` 指定单个表及其完整路径。 ```hocon table_path = "testdb.table1" ``` 2. **正则表达式**:使用 `table_path` 与正则表达式模式匹配多个表。 ```hocon table_path = "testdb.table\\d+" # 匹配 table1, table2, table3 等 use_regex = true ``` #### 表名的正则表达式支持 JDBC 连接器支持使用正则表达式匹配多个表。此功能允许您使用单个源配置处理多个表。 #### 配置 要对表路径使用正则表达式匹配: 1. 设置 `use_regex = true` 以启用正则表达式匹配 2. 
如果未设置 `use_regex` 或设置为 `false`,连接器将把 table_path 视为精确路径(无正则表达式匹配) #### 正则表达式语法注意事项 - **路径分隔符**:点 (`.`) 被视为数据库、模式和表名之间的分隔符。 - **转义点**:如果您需要在正则表达式中使用点 (`.`) 作为通配符来匹配任何字符,必须用反斜杠 (`\.`) 转义。 - **路径格式**:对于 `database.table` 或 `database.schema.table` 之类的路径,最后一个未转义的点将表模式与数据库/模式模式分开。 - **模式示例**: - `test.table\\d+` - 匹配 `test` 数据库中的 `table1`、`table2` 等表 - `test.*` - 匹配 `test` 数据库中的所有表(用于整个数据库同步) - `postgres.public.test_db_\.*` - 匹配 `postgres` 数据库的 `public` 模式中以 `test_db_` 开头的所有表 #### 示例 ```hocon source { Jdbc { url = "jdbc:mysql://localhost:3306/test" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "password" table_list = [ { # 正则表达式匹配 - 匹配 test 数据库中的任何表 table_path = "test.*" use_regex = true }, { # 正则表达式匹配 - 匹配名称为 "user" 后跟数字的表 table_path = "test.user\\d+" use_regex = true }, { # 精确匹配 - 简单表名 table_path = "test.config" # use_regex 未指定,默认为 false }, ] } } ``` #### 多表同步 使用正则表达式时,连接器将从所有匹配的表中读取数据。每个表将被独立处理,数据将在输出中合并。 多表同步的示例配置: ```hocon Jdbc { url = "jdbc:mysql://localhost/test" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "123456" # 使用显式配置的正则表达式 table_list = [ { table_path = "testdb.table\\d+" use_regex = true } ] } ``` ### decimal_type_narrowing 十进制类型缩小,如果为 true,十进制类型将缩小为 int 或 long 类型(如果没有精度损失)。目前仅支持 Oracle。 例如: decimal_type_narrowing = true | Oracle | SeaTunnel | |---------------|-----------| | NUMBER(1, 0) | Boolean | | NUMBER(6, 0) | INT | | NUMBER(10, 0) | BIGINT | decimal_type_narrowing = false | Oracle | SeaTunnel | |---------------|----------------| | NUMBER(1, 0) | Decimal(1, 0) | | NUMBER(6, 0) | Decimal(6, 0) | | NUMBER(10, 0) | Decimal(10, 0) | ### int_type_narrowing Int 类型缩小,如果为 true,tinyint(1) 类型将缩小为布尔类型(如果没有精度损失)。目前支持 MySQL。 例如: int_type_narrowing = true | MySQL | SeaTunnel | |------------|-----------| | TINYINT(1) | Boolean | int_type_narrowing = false | MySQL | SeaTunnel | |------------|-----------| | TINYINT(1) | TINYINT | ### dialect [string] 指定的方言,如果不存在,仍然根据 url 获取,优先级高于 url。例如,使用 starrocks 时,需要将其设置为 `starrocks`。类似地,使用 mysql 
时,需要将其值设置为 `mysql`。 如果 SeaTunnel 不支持某个方言,它将使用默认方言 `GenericDialect`。只需确保您提供的驱动程序支持您想要连接的数据库。 #### 方言列表 | | 方言名称 | | |-----------|---------|----------| | Greenplum | DB2 | Dameng | | Gbase8a | HIVE | KingBase | | MySQL | StarRocks | Oracle | | Phoenix | Postgres | Redshift | | SapHana | Snowflake | Sqlite | | SqlServer | Tablestore | Teradata | | Vertica | OceanBase | XUGU | | IRIS | Inceptor | Highgo | ## 并行读取器 JDBC 源连接器支持从表中并行读取数据。SeaTunnel 将使用某些规则分割表中的数据,这些数据将交给读取器进行读取。读取器的数量由 `parallelism` 选项确定。 **分割键规则:** 1. 如果 `partition_column` 不为 null,它将用于计算分割。该列必须在**支持的分割数据类型**中。 2. 如果 `partition_column` 为 null,seatunnel 将从表中读取模式并获取主键和唯一索引。如果主键和唯一索引中有多个列,将使用**支持的分割数据类型**中的第一列来分割数据。例如,表有主键(nn guid, name varchar),因为 `guid` 不在**支持的分割数据类型**中,所以列 `name` 将用于分割数据。 **支持的分割数据类型:** * String * Number(int, bigint, decimal, ...) * Date ## 提示 > 如果表无法分割(例如,表没有主键或唯一索引,且未设置 `partition_column`),它将以单并发运行。 > > 使用 `table_path` 替换 `query` 进行单表读取。如果需要读取多个表,请使用 `table_list`。 > 当基于 `query` 推断主键时,主键继承自结果集中第一列所在的底层表;如果 `query` 包含多表 JOIN 或同时从多张表读取,该主键对整个 JOIN 结果集的唯一性不作严格保证。 ## 附录 以上参数有一些参考值。 | 数据源 | 驱动 | URL | Maven | |-------------|---------------------------------------------------|--------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------| | mysql | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java | | postgresql | org.postgresql.Driver | jdbc:postgresql://localhost:5432/postgres | https://mvnrepository.com/artifact/org.postgresql/postgresql | | dm | dm.jdbc.driver.DmDriver | jdbc:dm://localhost:5236 | https://mvnrepository.com/artifact/com.dameng/DmJdbcDriver18 | | oracle | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@localhost:1521/xepdb1 | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | | sqlserver | com.microsoft.sqlserver.jdbc.SQLServerDriver | 
jdbc:sqlserver://localhost:1433 | https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc | | starrocks | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java | | kingbase | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar | | oceanbase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.12/oceanbase-client-2.4.12.jar | | hive | org.apache.hive.jdbc.HiveDriver | jdbc:hive2://localhost:10000 | https://repo1.maven.org/maven2/org/apache/hive/hive-jdbc/3.1.3/hive-jdbc-3.1.3-standalone.jar | ## 示例 ### 简单 #### 情况 1 ``` Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 user = "root" password = "123456" query = "select * from type_bin" } ``` #### 情况 2 在动态块分割阶段使用 select count(*) 代替分析表来计算表行数 ``` Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 user = "root" password = "123456" use_select_count = true query = "select * from type_bin" } ``` #### 情况 3 使用 select NUM_ROWS from all_tables 获取表行数但跳过分析表 ``` Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 user = "root" password = "123456" skip_analyze = true query = "select * from type_bin" } ``` #### 情况 4 Oracle 源与 BLOB 作为字符串到 Doris Sink 此示例演示了在传输到 Doris 时如何将 Oracle 的 BLOB 数据作为字符串处理。这对于大型 BLOB 字段很有用。 ``` env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = oracle.jdbc.driver.OracleDriver url = "jdbc:oracle:thin:@oracle_host:1521/SERVICE_NAME" user = "username" password = "password" query = "SELECT ID, NAME, CONTENT_BLOB FROM MY_TABLE" handle_blob_as_string = true # 为 Oracle 启用 BLOB 到字符串转换 } } ``` ## 
变更日志 ================================================ FILE: docs/zh/connectors/source/Jira.md ================================================ import ChangeLog from '../changelog/connector-http-jira.md'; # Jira > Jira 源连接器 ## 描述 从 Jira 读取数据。 ## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户定义的分片](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必需 | 默认值 | |-----------------------------|---------|----------|---------------| | url | String | 是 | - | | email | String | 是 | - | | api_token | String | 是 | - | | method | String | 否 | get | | schema.fields | Config | 否 | - | | format | String | 否 | json | | params | Map | 否 | - | | body | String | 否 | - | | json_field | Config | 否 | - | | content_json | String | 否 | - | | poll_interval_millis | int | 否 | - | | retry | int | 否 | - | | retry_backoff_multiplier_ms | int | 否 | 100 | | retry_backoff_max_ms | int | 否 | 10000 | | enable_multi_lines | boolean | 否 | false | | common-options | config | 否 | - | ### url [String] http 请求 url ### email [String] Jira 邮箱 ### api_token [String] Jira API 令牌(Token),可在此处创建:https://id.atlassian.com/manage-profile/security/api-tokens ### method [String] http 请求方法。目前支持 'GET' 和 'POST'。 ### params [Map] http 参数 ### body [String] http 请求体 ### poll_interval_millis [int] 流模式下请求 API 的间隔时间(毫秒)。 ### retry [int] 请求失败(`IOException`)时的最大重试次数 ### retry_backoff_multiplier_ms [int] 重试退避时间倍数(毫秒)。 ### retry_backoff_max_ms [int] 重试退避最大时间(毫秒)。 ### format [String] 上游数据的格式,目前仅支持 `json` 和 `text`,默认是 `json`。
若你的数据格式为 `json`, 需同时配置 schema 选项,例如: 上游数据如下: ```json { "code": 200, "data": "get success", "success": true } ``` 您应该配置 schema 为以下内容: ```hocon schema { fields { code = int data = string success = boolean } } ``` 连接器将生成如下数据: | code | data | success | |------|-------------|---------| | 200 | get success | true | 若你设置格式为 `text`,连接器不会对上游数据做出任何改变,示例: 上游数据如下: ```json { "code": 200, "data": "get success", "success": true } ``` 连接器将生成如下数据: | content | |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | ### schema [Config] #### fields [Config] 上游数据的字段定义。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 ### content_json [String] 该参数可用于提取一些 json 数据。如果你只需要 “book” 部分的数据,可以配置 `content_field = "$.store.book.*"`. 如果你的返回数据如下所示: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` 你可以配置 `content_field = "$.store.book.*"` 并且结果返回如下: ```json [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ] ``` 然后你可以通过更简单的 schema 配置获取所需的结果,例如: ```hocon Http { url = "http://mockserver:1080/contentjson/mock" method = "GET" format = "json" content_field = "$.store.book.*" schema = { fields { category = string author = string title = string price = string } } } ``` 示例: - 测试数据可参考此链接: [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - 任务配置示例可参考此链接:[http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). 
### json_field [Config] 该参数用于帮助你配置 schema,因此必须与 schema 一起使用。 如果你的数据如下所示: ```json { "store": { "book": [ { "category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95 }, { "category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99 } ], "bicycle": { "color": "red", "price": 19.95 } }, "expensive": 10 } ``` 你可以通过如下任务配置获取 “book” 部分的内容: ```hocon source { Http { url = "http://mockserver:1080/jsonpath/mock" method = "GET" format = "json" json_field = { category = "$.store.book[*].category" author = "$.store.book[*].author" title = "$.store.book[*].title" price = "$.store.book[*].price" } schema = { fields { category = string author = string title = string price = string } } } } ``` - 测试数据可参考此链接: [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - 任务配置示例可参考此链接: [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). ### 通用配置 源插件通用参数,请参考 [常用选项](../common-options/source-common-options.md) 获取详细说明 ## 示例 ```hocon Jira { url = "https://liugddx.atlassian.net/rest/api/3/search" email = "test@test.com" api_token = "xxx" schema { fields { expand = string startAt = bigint maxResults = int total = int } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Kafka.md ================================================ import ChangeLog from '../changelog/connector-kafka.md'; # Kafka > Kafka 源连接器 ## 支持以下引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要功能 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户定义拆分](../../introduction/concepts/connector-v2-features.md) ## 描述 用于 Apache Kafka 的源连接器。 ## 支持的数据源信息 使用 Kafka 连接器需要以下依赖项。 可以通过 install-plugin.sh 下载或从 Maven 中央仓库获取。 | 数据源 | 支持的版本 | Maven 下载链接 | |-------|-------|-------------------------------------------------------------------------------| | Kafka | 通用版本 | [下载](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-kafka) | ## 源选项 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |-------------------------------------|-------------------------------------|------|------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | topic | String | 是 | - | 使用表作为数据源时要读取数据的主题名称。它也支持通过逗号分隔的多个主题列表,例如 'topic-1,topic-2'。 | | table_list | Map | 否 | - | 主题列表配置,你可以同时配置一个 `table_list` 和一个 `topic`。 | | bootstrap.servers | String | 是 | - | 逗号分隔的 Kafka brokers 列表。 | | pattern | Boolean | 否 | false | 如果 `pattern` 设置为 `true`,则会使用指定的正则表达式匹配并订阅主题。 | | consumer.group | String | 否 | SeaTunnel-Consumer-Group | `Kafka 消费者组 ID`,用于区分不同的消费者组。 | | commit_on_checkpoint | Boolean | 否 | true | 如果为 true,消费者的偏移量将会定期在后台提交。 | | poll.timeout | Long | 否 | 10000 | kafka主动拉取时间间隔(毫秒)。 | | kafka.config | Map | 否 | - | 除了上述必要参数外,用户还可以指定多个非强制的消费者客户端参数,覆盖 [Kafka 官方文档](https://kafka.apache.org/documentation.html#consumerconfigs) 中指定的所有消费者参数。 | | schema | Config | 否 | - | 数据结构,包括字段名称和字段类型。更多详情请参考 [Schema 
特性](../../introduction/concepts/schema-feature.md)。 | | format | String | 否 | json | 数据格式。默认格式为 json。可选格式包括 text, canal_json, debezium_json, ogg_json, maxwell_json, avro , protobuf和native。默认字段分隔符为 ", "。如果自定义分隔符,添加 "field_delimiter" 选项。如果使用 canal 格式,请参考 [canal-json](../formats/canal-json.md) 了解详细信息。如果使用 debezium 格式,请参考 [debezium-json](../formats/debezium-json.md)。一些Format的详细信息请参考 [formats](../formats) | | format_error_handle_way | String | 否 | fail | 数据格式错误的处理方式。默认值为 fail,可选值为 fail 和 skip。当选择 fail 时,数据格式错误将阻塞并抛出异常。当选择 skip 时,数据格式错误将跳过此行数据。 | | debezium_record_table_filter | Config | 否 | - | 用于过滤 debezium 格式的数据,仅当格式设置为 `debezium_json` 时使用。请参阅下面的 `debezium_record_table_filter` | | field_delimiter | String | 否 | , | 自定义数据格式的字段分隔符。 | | start_mode | StartMode[earliest],[group_offsets] | 否 | group_offsets | 消费者的初始消费模式。 | | start_mode.offsets | Config | 否 | - | 用于 specific_offsets 消费模式的偏移量。 | | start_mode.timestamp | Long | 否 | - | 用于 "timestamp" 消费模式的时间。 | | start_mode.end_timestamp | Long | 否 | - | 用于 "timestamp" 消费模式的结束时间,只支持批模式 | | partition-discovery.interval-millis | Long | 否 | -1 | 动态发现主题和分区的间隔时间。 | | ignore_no_leader_partition | Boolean | 否 | false | 是否忽略没有 leader 的分区。如果设置为 true,在分区发现过程中将跳过没有 leader 的分区。如果设置为 false(默认值),连接器将包含所有分区,无论 leader 状态如何。这在处理可能存在临时 leader 问题的 Kafka 集群时很有用。 | | common-options | | 否 | - | 源插件的常见参数,详情请参考 [Source Common Options](../common-options/source-common-options.md)。 | | protobuf_message_name | String | 否 | - | 当格式设置为 protobuf 时有效,指定消息名称。 | | protobuf_schema | String | 否 | - | 当格式设置为 protobuf 时有效,指定 Schema 定义。 | | strip_schema_registry_header | Boolean | 否 | false | 当格式设置为 protobuf 时有效。是否在 Protobuf 反序列化之前去除 Confluent Schema Registry 线格式头部(magic byte、schema id 和 message indexes)。当消费使用 Confluent Schema Registry 编码的 Protobuf 消息时,此选项非常有用。启用后,连接器将尝试在解析 Protobuf 消息之前检测并删除 Schema Registry 头部。如果未检测到头部,它将回退到标准的 Protobuf 反序列化。 | | reader_cache_queue_size | Integer | 否 | 1024 | Reader分片缓存队列,用于缓存分片对应的数据。占用大小取决于每个reader得到的分片量,而不是每个分片的数据量。 | | 
is_native | Boolean | 否 | false | 支持保留record的源信息。 | ### debezium_record_table_filter 我们可以使用 `debezium_record_table_filter` 来过滤 debezium 格式的数据。配置如下: ```hocon debezium_record_table_filter { database_name = "test" schema_name = "public" // null 如果不存在 table_name = "products" } ``` 只有 `test.public.products` 表的数据将被消费。 ## 元数据支持 Kafka 源会在 `ConsumerRecord.timestamp` 大于等于 0 时,将其自动写入 SeaTunnel 行的 `EventTime` 元数据。可以借助 [Metadata 转换](../../transforms/metadata.md) 把这段时间戳暴露为普通字段,方便做分区或下游 SQL 处理。 ```hocon source { Kafka { plugin_output = "kafka_raw" topic = "seatunnel_topic" bootstrap.servers = "localhost:9092" format = json } } transform { Metadata { plugin_input = "kafka_raw" plugin_output = "kafka_with_meta" metadata_fields { EventTime = kafka_ts # ConsumerRecord.timestamp (ms) } } Sql { plugin_input = "kafka_with_meta" plugin_output = "kafka_enriched" query = "select *, FROM_UNIXTIME(kafka_ts/1000, 'yyyy-MM-dd', 'Asia/Shanghai') as pt from kafka_with_meta where kafka_ts >= 0" } } ``` ## 任务示例 ### 简单示例 > 此示例读取 Kafka 的 topic_1、topic_2 和 topic_3 的数据并将其打印到客户端。如果尚未安装和部署 SeaTunnel,请按照 [安装指南](../../getting-started/locally/deployment.md) 进行安装和部署。然后,按照 [快速开始](../../getting-started/locally/quick-start-seatunnel-engine.md) 运行此任务。 ```hocon # 定义运行环境 env { parallelism = 2 job.mode = "BATCH" } source { Kafka { schema = { fields { name = "string" age = "int" } } format = text field_delimiter = "#" topic = "topic_1,topic_2,topic_3" bootstrap.servers = "localhost:9092" kafka.config = { client.id = client_1 max.poll.records = 500 auto.offset.reset = "earliest" enable.auto.commit = "false" } } } sink { Console {} } ``` ### 正则表达式主题 ```hocon source { Kafka { topic = ".*seatunnel*." 
pattern = "true" bootstrap.servers = "localhost:9092" consumer.group = "seatunnel_group" } } ``` ### AWS MSK SASL/SCRAM 将以下 `${username}` 和 `${password}` 替换为 AWS MSK 中的配置值。 ```hocon source { Kafka { topic = "seatunnel" bootstrap.servers = "xx.amazonaws.com.cn:9096,xxx.amazonaws.com.cn:9096,xxxx.amazonaws.com.cn:9096" consumer.group = "seatunnel_group" kafka.config = { security.protocol=SASL_SSL sasl.mechanism=SCRAM-SHA-512 sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required username=\"username\" password=\"password\";" } } } ``` ### AWS MSK IAM 从 [此处](https://github.com/aws/aws-msk-iam-auth/releases) 下载 `aws-msk-iam-auth-1.1.5.jar` 并将其放在 `$SEATUNNEL_HOME/plugin/kafka/lib` 目录下。 确保 IAM 策略中包含 `"kafka-cluster:Connect"` 权限,如下所示: ```hocon "Effect": "Allow", "Action": [ "kafka-cluster:Connect", "kafka-cluster:AlterCluster", "kafka-cluster:DescribeCluster" ], ``` 源配置示例: ```hocon source { Kafka { topic = "seatunnel" bootstrap.servers = "xx.amazonaws.com.cn:9098,xxx.amazonaws.com.cn:9098,xxxx.amazonaws.com.cn:9098" consumer.group = "seatunnel_group" kafka.config = { security.protocol=SASL_SSL sasl.mechanism=AWS_MSK_IAM sasl.jaas.config="software.amazon.msk.auth.iam.IAMLoginModule required;" sasl.client.callback.handler.class="software.amazon.msk.auth.iam.IAMClientCallbackHandler" } } } ``` ### Kerberos 认证示例 请在启动 SeaTunnel 之前设置 JVM 参数 `java.security.krb5.conf` 或更新 `/etc/krb5.conf` 中的默认 `krb5.conf`。 源配置示例: ```hocon source { Kafka { topic = "seatunnel" bootstrap.servers = "127.0.0.1:9092" consumer.group = "seatunnel_group" kafka.config = { security.protocol=SASL_PLAINTEXT sasl.kerberos.service.name=kafka sasl.mechanism=GSSAPI sasl.jaas.config="com.sun.security.auth.module.Krb5LoginModule required \n useKeyTab=true \n storeKey=true \n keyTab=\"/path/to/xxx.keytab\" \n principal=\"user@xxx.com\";" } } } ``` ### 多 Kafka 源示例 > 根据不同的 Kafka 主题和格式解析数据,并基于 ID 执行 upsert 操作。 > 注意: Kafka是一个非结构化数据源,应该使用`tables_configs`,将来会删除`table_list` ```hocon env { 
execution.parallelism = 1 job.mode = "BATCH" } source { Kafka { bootstrap.servers = "kafka_e2e:9092" tables_configs = [ { topic = "^test-ogg-sou.*" pattern = "true" consumer.group = "ogg_multi_group" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } }, format = ogg_json }, { topic = "test-cdc_mds" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } }, format = canal_json } ] } } sink { Jdbc { driver = org.postgresql.Driver url = "jdbc:postgresql://postgresql:5432/test?loggerLevel=OFF" user = test password = test generate_sink_sql = true database = test table = public.sink primary_keys = ["id"] } } ``` ```hocon env { execution.parallelism = 1 job.mode = "BATCH" } source { Kafka { bootstrap.servers = "kafka_e2e:9092" table_list = [ { topic = "^test-ogg-sou.*" pattern = "true" consumer.group = "ogg_multi_group" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } }, format = ogg_json }, { topic = "test-cdc_mds" start_mode = earliest schema = { fields { id = "int" name = "string" description = "string" weight = "string" } }, format = canal_json } ] } } sink { Jdbc { driver = org.postgresql.Driver url = "jdbc:postgresql://postgresql:5432/test?loggerLevel=OFF" user = test password = test generate_sink_sql = true database = test table = public.sink primary_keys = ["id"] } } ``` ### Protobuf配置 `format` 设置为 `protobuf`,配置`protobuf`数据结构,`protobuf_message_name`和`protobuf_schema`参数 使用样例: ```hocon source { Kafka { topic = "test_protobuf_topic_fake_source" format = protobuf protobuf_message_name = Person protobuf_schema = """ syntax = "proto3"; package org.apache.seatunnel.format.protobuf; option java_outer_classname = "ProtobufE2E"; message Person { int32 c_int32 = 1; int64 c_int64 = 2; float c_float = 3; double c_double = 4; bool c_bool = 5; string c_string = 6; bytes c_bytes = 7; message Address { 
string street = 1; string city = 2; string state = 3; string zip = 4; } Address address = 8; map attributes = 9; repeated string phone_numbers = 10; } """ bootstrap.servers = "kafkaCluster:9092" start_mode = "earliest" plugin_output = "kafka_table" } } ``` ### Protobuf with Schema Registry wire format 当消费使用 Confluent Schema Registry 编码的 Protobuf 消息时,您需要将 `strip_schema_registry_header` 设置为 `true`。连接器将自动检测并删除 Schema Registry 格式头部(magic byte、schema id 和 message indexes),然后再反序列化 Protobuf 消息。 使用样例: ```hocon source { Kafka { topic = "test_protobuf_schema_registry_topic" format = protobuf strip_schema_registry_header = true protobuf_message_name = Person protobuf_schema = """ syntax = "proto3"; package org.apache.seatunnel.format.protobuf; option java_outer_classname = "ProtobufE2E"; message Person { int32 c_int32 = 1; int64 c_int64 = 2; float c_float = 3; double c_double = 4; bool c_bool = 5; string c_string = 6; bytes c_bytes = 7; message Address { string street = 1; string city = 2; string state = 3; string zip = 4; } Address address = 8; map attributes = 9; repeated string phone_numbers = 10; } """ bootstrap.servers = "kafkaCluster:9092" start_mode = "earliest" plugin_output = "kafka_table" } } ``` **注意**:当启用 `strip_schema_registry_header` 时,连接器可以安全地处理 Schema Registry 编码的消息和纯 Protobuf 消息。如果未检测到 Schema Registry 头部,它将自动回退到标准 Protobuf 反序列化。 ### 忽略无 Leader 分区 当处理可能存在临时 leader 问题的 Kafka 集群时,您可以配置连接器忽略没有 leader 的分区: ```hocon source { Kafka { topic = "test_topic" bootstrap.servers = "localhost:9092" consumer.group = "test_group" ignore_no_leader_partition = true start_mode = "earliest" } } ``` 当 `ignore_no_leader_partition = true` 时,连接器将在分区发现过程中跳过任何没有 leader 的分区,允许作业继续处理其他健康的分区。 ### format 如果需要保留Kafka原生的信息,可以参考如下配置。 配置示例: ```hocon source { Kafka { topic = "test_topic_native_source" bootstrap.servers = "kafkaCluster:9092" start_mode = "earliest" format_error_handle_way = skip format = "NATIVE" value_converter_schema_enabled = false consumer.group = "native_group" } } ``` 
返回数据格式如下: ```json { "headers": { "header1": "header1", "header2": "header2" }, "key": "dGVzdF9ieXRlc19kYXRh", "partition": 3, "timestamp": 1672531200000, "timestampType": "CREATE_TIME", "value": "dGVzdF9ieXRlc19kYXRh" } ``` 注意:key/value是byte[]类型。 ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Kingbase.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Kingbase > JDBC Kingbase 源连接器 ## 支持连接器版本 - 8.6 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 描述 通过 JDBC 读取外部数据源数据。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动 | 连接串 | Maven | |--------|-----------|------|--------|-------| | Kingbase | 8.6 | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | [下载](https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar) | ## 数据库依赖 > 请下载对应 'Maven' 的支持列表,并将其复制到 '$SEATUNNEL_HOME/plugins/jdbc/lib/' 工作目录
    > 例如:cp kingbase8-8.6.0.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## 数据类型映射 | Kingbase 数据类型 | SeaTunnel 数据类型 | |------------------|------------------| | BOOL | BOOLEAN | | INT2 | SHORT | | SMALLSERIAL
    SERIAL
    INT4 | INT | | INT8
    BIGSERIAL | BIGINT | | FLOAT4 | FLOAT | | FLOAT8 | DOUBLE | | NUMERIC | DECIMAL | | BPCHAR
    CHARACTER
    VARCHAR
    TEXT | STRING | | TIMESTAMP | LOCALDATETIME | | TIME | LOCALTIME | | DATE | LOCALDATE | | 其他数据类型 | 暂不支持 | ## 源选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |--------|------|------|--------|------| | url | String | 是 | - | JDBC 连接的 URL。参考示例:jdbc:kingbase8://localhost:54321/test | | driver | String | 是 | - | 用于连接到远程数据源的 jdbc 类名,应为 `com.kingbase8.Driver`。 | | username | String | 否 | - | 连接实例用户名 | | password | String | 否 | - | 连接实例密码 | | query | String | 是 | - | 查询语句 | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(秒) | | partition_column | String | 否 | - | 用于并行性分割的列名,仅支持数值类型列和字符串类型列。 | | partition_lower_bound | BigDecimal | 否 | - | partition_column 的最小值用于扫描,如果未设置,SeaTunnel 将查询数据库获取最小值。 | | partition_upper_bound | BigDecimal | 否 | - | partition_column 的最大值用于扫描,如果未设置,SeaTunnel 将查询数据库获取最大值。 | | partition_num | Int | 否 | job parallelism | 分割数量,仅支持正整数。默认值是任务并行度。 | | fetch_size | Int | 否 | 0 | 对于返回大量对象的查询,您可以配置查询中使用的行提取大小,以通过减少满足选择条件所需的数据库命中次数来提高性能。零表示使用 jdbc 默认值。 | | use_regex | Boolean | 否 | false | 控制表路径的正则表达式匹配。当设置为true时,table_path 将被视为正则表达式模式。当设置为false或未指定时,table_path 将被视为精确路径(不进行正则匹配)。 | | table_path | String | 否 | - | 表的完整路径,您可以使用此配置代替 `query`。
    示例:
    "testdb.table1" | | table_list | Array | 否 | - | 要读取的表的列表,您可以使用此配置代替 `table_path`,示例如下: ```[{ table_path = "testdb.table1"}, {table_path = "testdb.table2", query = "select * id, name from testdb.table2"}]``` | | where_condition | String | 否 | - | 所有表/查询的通用行过滤条件,必须以 `where` 开头。例如 `where id > 100`。 | | split.size | Int | 否 | 8096 | 表的分割大小(行数),当读取表时,捕获的表会被分割成多个分片。 | | split.even-distribution.factor.lower-bound | Double | 否 | 0.05 | 分片键分布因子的下限。该因子用于判断表数据的分布是否均匀。如果计算得到的分布因子大于或等于该下限(即,(MAX(id) - MIN(id) + 1) / 行数),则会对表的分片进行优化,以确保数据的均匀分布。反之,如果分布因子较低,则表数据将被视为分布不均匀。如果估算的分片数量超过 `sample-sharding.threshold` 所指定的值,则会采用基于采样的分片策略。默认值为 0.05。 | | split.even-distribution.factor.upper-bound | Double | 否 | 100 | 分片键分布因子的上限。该因子用于判断表数据的分布是否均匀。如果计算得到的分布因子小于或等于该上限(即,(MAX(id) - MIN(id) + 1) / 行数),则会对表的分片进行优化,以确保数据的均匀分布。反之,如果分布因子较大,则表数据将被视为分布不均匀,并且如果估算的分片数量超过 `sample-sharding.threshold` 所指定的值,则会采用基于采样的分片策略。默认值为 100.0。 | | split.sample-sharding.threshold | Int | 否 | 10000 | 此配置指定了触发样本分片策略的估算分片数阈值。当分布因子超出由 `chunk-key.even-distribution.factor.upper-bound` 和 `chunk-key.even-distribution.factor.lower-bound` 指定的范围,并且估算的分片数量(计算方法为大致行数 / 分片大小)超过此阈值时,将使用样本分片策略。此配置有助于更高效地处理大型数据集。默认值为 1000 个分片。 | | split.inverse-sampling.rate | Int | 否 | 1000 | 样本分片策略中使用的采样率的倒数。例如,如果该值设置为 1000,则表示在采样过程中应用 1/1000 的采样率。此选项提供了灵活性,可以控制采样的粒度,从而影响最终的分片数量。特别适用于处理非常大的数据集,在这种情况下通常会选择较低的采样率。默认值为 1000。 | | common-options | | 否 | - | 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 | ### 提示 > 如果未设置 partition_column,它将以单并发运行,如果设置了 partition_column,它将根据任务的并发度并行执行。 ## 任务示例 ### 简单 ``` env { parallelism = 2 job.mode = "BATCH" } source { Jdbc { driver = "com.kingbase8.Driver" url = "jdbc:kingbase8://localhost:54321/db_test" username = "root" password = "" query = "select * from source" } } transform { # 如果您想了解有关如何配置 seatunnel 的更多信息并查看完整的转换插件列表, # 请访问 https://seatunnel.apache.org/docs/transform/sql } sink { Console {} } ``` ### 并行 > 使用您配置的分片字段和分片数据并行读取查询表。如果您想读取整个表,可以这样做 ``` source { Jdbc { driver = 
"com.kingbase8.Driver" url = "jdbc:kingbase8://localhost:54321/db_test" username = "root" password = "" query = "select * from source" # 并行分片读取字段 partition_column = "id" # 分片数量 partition_num = 10 } } ``` ### 并行边界 > 根据您配置的上下边界读取数据源更高效 ``` source { Jdbc { driver = "com.kingbase8.Driver" url = "jdbc:kingbase8://localhost:54321/db_test" username = "root" password = "" query = "select * from source" partition_column = "id" partition_num = 10 # 读取开始边界 partition_lower_bound = 1 # 读取结束边界 partition_upper_bound = 500 } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Klaviyo.md ================================================ import ChangeLog from '../changelog/connector-http-klaviyo.md'; # Klaviyo > Klaviyo 源连接器 ## 描述 用于从 Klaviyo 读取数据。 ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |-----------------------------|---------|----|-------|------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | HTTP 请求 URL | | private_key | String | 是 | - | 用于登录的 API 私钥,您可以在此链接获取更多详情:https://developers.klaviyo.com/en/docs/authenticate_#private-key-authentication | | revision | String | 是 | - | API 端点版本(格式:YYYY-MM-DD) | | method | String | 否 | get | HTTP 请求方法,仅支持 GET、POST 方法 | | schema | Config | 否 | - | 上游数据的模式。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | schema.fields | Config | 否 | - | 上游数据的模式字段 | | format | String | 否 | json | 上游数据的格式,现在仅支持 `json` `text`,默认 `json`。 | | params | Map | 否 | - | HTTP 参数 | | body | String | 否 | - | HTTP 请求体 | | json_field | 
Config | 否 | - | JSON 字段配置 | | content_json | String | 否 | - | 内容 JSON 字段 | | poll_interval_millis | int | 否 | - | 流模式下请求 HTTP API 的间隔(毫秒) | | retry | int | 否 | - | 如果 HTTP 请求返回 `IOException` 时的最大重试次数 | | retry_backoff_multiplier_ms | int | 否 | 100 | HTTP 请求失败时的重试退避倍数(毫秒) | | retry_backoff_max_ms | int | 否 | 10000 | HTTP 请求失败时的最大重试退避时间(毫秒) | | enable_multi_lines | boolean | 否 | false | 启用多行 | | common-options | config | 否 | - | 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 | ### url [String] HTTP 请求 URL ### private_key [String] 用于登录的 API 私钥,您可以在此链接获取更多详情: https://developers.klaviyo.com/en/docs/authenticate_#private-key-authentication ### revision [String] API 端点版本(格式:YYYY-MM-DD) ### method [String] HTTP 请求方法,仅支持 GET、POST 方法 ### params [Map] HTTP 参数 ### body [String] HTTP 请求体 ### poll_interval_millis [int] 流模式下请求 HTTP API 的间隔(毫秒) ### retry [int] 如果 HTTP 请求返回 `IOException` 时的最大重试次数 ### retry_backoff_multiplier_ms [int] HTTP 请求失败时的重试退避倍数(毫秒) ### retry_backoff_max_ms [int] HTTP 请求失败时的最大重试退避时间(毫秒) ### format [String] 上游数据的格式,现在仅支持 `json` `text`,默认 `json`。 当您指定格式为 `json` 时,您还应该指定 schema 选项,例如: 上游数据如下: ```json { "code": 200, "data": "get success", "success": true } ``` 您应该指定 schema 如下: ```hocon schema { fields { code = int data = string success = boolean } } ``` 连接器将生成如下数据: | code | data | success | |------|------|---------| | 200 | get success | true | 当您指定格式为 `text` 时,连接器将对上游数据不做任何处理,例如: 上游数据如下: ```json { "code": 200, "data": "get success", "success": true } ``` 连接器将生成如下数据: | content | |---------| | {"code": 200, "data": "get success", "success": true} | ### schema [Config] #### fields [Config] 上游数据的模式字段。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 ### content_json [String] 此参数可以获取一些 JSON 数据。如果您只需要 'book' 部分中的数据,请配置 `content_field = "$.store.book.*"`。 ### json_field [Config] 此参数帮助您配置模式,因此此参数必须与 schema 一起使用。 ### common options 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 ## 示例 ```hocon Klaviyo { url = 
"https://a.klaviyo.com/api/lists/" private_key = "SeaTunnel-test" revision = "2020-10-17" method = "GET" format = "json" schema = { fields { type = string id = string attributes = { name = string created = string updated = string } links = { self = string } } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Kudu.md ================================================ import ChangeLog from '../changelog/connector-kudu.md'; # Kudu > Kudu 源连接器 ## 支持 Kudu 版本 - 1.11.1/1.12.0/1.13.0/1.14.0/1.15.0 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 描述 用于从 Kudu 读取数据。 测试的 kudu 版本是 1.11.1。 ## 数据类型映射 | Kudu 数据类型 | SeaTunnel 数据类型 | |-------------|------------------| | BOOL | BOOLEAN | | INT8
    INT16
    INT32 | INT | | INT64 | BIGINT | | DECIMAL | DECIMAL | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | STRING | STRING | | UNIXTIME_MICROS | TIMESTAMP | | BINARY | BYTES | ## 源选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |-------------------------------------------|--------|----|------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | kudu_masters | String | 是 | - | Kudu master 地址。用 ',' 分隔,例如 '192.168.88.110:7051'。 | | table_name | String | 是 | - | Kudu 表的名称。 | | client_worker_count | Int | 否 | 2 * Runtime.getRuntime().availableProcessors() | Kudu worker 数量。默认值是当前 CPU 核心数的两倍。 | | client_default_operation_timeout_ms | Long | 否 | 30000 | Kudu 普通操作超时时间。 | | client_default_admin_operation_timeout_ms | Long | 否 | 30000 | Kudu 管理操作超时时间。 | | enable_kerberos | Bool | 否 | false | Kerberos principal 启用。 | | kerberos_principal | String | 否 | - | Kerberos principal。注意所有 zeta 节点都需要有此文件。 | | kerberos_keytab | String | 否 | - | Kerberos keytab。注意所有 zeta 节点都需要有此文件。 | | kerberos_krb5conf | String | 否 | - | Kerberos krb5 conf。注意所有 zeta 节点都需要有此文件。 | | scan_token_query_timeout | Long | 否 | 30000 | 连接扫描令牌的超时时间。如果未设置,将与 operationTimeout 相同。 | | scan_token_batch_size_bytes | Int | 否 | 1024 * 1024 | Kudu 扫描字节数。一次读取的最大字节数,默认为 1MB。 | | use_regex | Bool | 否 | false | 控制 `table_name` 的正则匹配。当设置为 `true` 时,`table_name` 将被视为正则表达式模式,可以匹配多张表。当设置为 `false` 或未指定时,`table_name` 将被视为精确表名(不进行正则匹配)。 | | filter | String | 否 | - | Kudu 扫描过滤表达式,例如 id > 100 AND id < 200。 | | schema | Map | 否 | 1024 * 1024 | SeaTunnel Schema。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | table_list | Array | 否 | - | 要读取的表列表。您可以使用此配置代替 `table_name`,例如:```table_list = [{ table_name = "kudu_source_table_1"},{ table_name = "kudu_source_table_2"}] ```。也可以在每个 entry 中配置 `use_regex = true` 来对 `table_name` 启用正则匹配。 | | common-options | | 否 | - 
| 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 | ## 任务示例 ### 简单 > 以下示例针对名为 "kudu_source_table" 的 Kudu 表,目标是在控制台打印此表中的数据并写入 kudu 表 "kudu_sink_table" ```hocon # 定义运行时环境 env { parallelism = 2 job.mode = "BATCH" } source { # 这是一个示例源插件 **仅用于测试和演示源插件功能** kudu { kudu_masters = "kudu-master:7051" table_name = "kudu_source_table" plugin_output = "kudu" enable_kerberos = true kerberos_principal = "xx@xx.COM" kerberos_keytab = "xx.keytab" } } transform { } sink { console { plugin_input = "kudu" } kudu { plugin_input = "kudu" kudu_masters = "kudu-master:7051" table_name = "kudu_sink_table" enable_kerberos = true kerberos_principal = "xx@xx.COM" kerberos_keytab = "xx.keytab" } } ``` ### 多表 ```hocon env { # 您可以在此处设置引擎配置 parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { # 这是一个示例源插件 **仅用于测试和演示源插件功能** kudu{ kudu_masters = "kudu-master:7051" table_list = [ { table_name = "kudu_source_table_1" },{ table_name = "kudu_source_table_2" } ] plugin_output = "kudu" } } transform { } sink { Assert { rules { table-names = ["kudu_source_table_1", "kudu_source_table_2"] } } } ``` ### 使用正则表达式匹配表 Kudu Source 支持在 `table_name` 上使用正则表达式来匹配多张表(由于 Kudu 逻辑上只有一个 database,因此也可以用来实现“整库表”同步)。 #### 精确表名 使用 `table_name` 指定单个 Kudu 表的精确名称: ```hocon source { kudu { kudu_masters = "kudu-master:7051" table_name = "kudu_source_table_1" } } ``` #### 正则匹配 将 `table_name` 视为正则表达式,并开启 `use_regex`,即可用一条配置匹配多张表: ```hocon source { kudu { kudu_masters = "kudu-master:7051" # 匹配 kudu_source_table_1、kudu_source_table_2 等 table_name = "kudu_source_table_\\d+" use_regex = true } } ``` 也可以在 `table_list` 中组合精确表和正则表: ```hocon source { kudu { kudu_masters = "kudu-master:7051" table_list = [ { table_name = "kudu_source_table_1" }, { table_name = "kudu_source_table_2" }, { # 使用正则匹配,以 prefix_ 开头、以数字结尾的所有表 table_name = "prefix_\\d+" use_regex = true } ] } } ``` #### 整库匹配 如果当前 Kudu 实例中只有业务表,或者你希望“一次性同步所有表”,可以使用一个全匹配的正则: ```hocon source { kudu { kudu_masters = "kudu-master:7051" # 匹配当前 
Kudu 实例中的所有表 table_name = ".*" use_regex = true } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Lemlist.md ================================================ import ChangeLog from '../changelog/connector-http-lemlist.md'; # Lemlist > Lemlist 源连接器 ## 描述 用于从 Lemlist 读取数据。 ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |--------|------|------|--------|------| | url | String | 是 | - | HTTP 请求 URL | | password | String | 是 | - | API 密钥用于登录 | | method | String | 否 | get | HTTP 请求方法,仅支持 GET、POST 方法 | | schema.fields | Config | 否 | - | 上游数据的模式字段 | | format | String | 否 | json | 上游数据的格式,现在仅支持 `json` `text`,默认 `json`。 | | params | Map | 否 | - | HTTP 参数 | | body | String | 否 | - | HTTP 请求体 | | json_field | Config | 否 | - | JSON 字段配置 | | content_json | String | 否 | - | 内容 JSON 配置 | | poll_interval_millis | int | 否 | - | 流模式下请求 HTTP API 的间隔(毫秒) | | retry | int | 否 | - | 如果 HTTP 请求返回 `IOException` 的最大重试次数 | | retry_backoff_multiplier_ms | int | 否 | 100 | HTTP 请求失败时的重试退避倍数(毫秒) | | retry_backoff_max_ms | int | 否 | 10000 | HTTP 请求失败时的最大重试退避时间(毫秒) | | enable_multi_lines | boolean | 否 | false | 是否启用多行模式 | | common-options | config | 否 | - | 源插件通用参数 | ### url [String] HTTP 请求 URL ### password [String] API 密钥用于登录,您可以在以下链接获取更多详情: https://app.lemlist.com/settings/integrations ### method [String] HTTP 请求方法,仅支持 GET、POST 方法 ### params [Map] HTTP 参数 ### body [String] HTTP 请求体 ### poll_interval_millis [int] 流模式下请求 HTTP API 的间隔(毫秒) ### retry [int] 如果 HTTP 请求返回 `IOException` 的最大重试次数 ### retry_backoff_multiplier_ms [int] HTTP 请求失败时的重试退避倍数(毫秒) ### 
retry_backoff_max_ms [int] HTTP 请求失败时的最大重试退避时间(毫秒) ### format [String] 上游数据的格式,现在仅支持 `json` `text`,默认 `json`。 当您指定格式为 `json` 时,您还应该指定 schema 选项。 ### schema [Config] #### fields [Config] 上游数据的模式字段。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 ### content_json [String] 此参数可以获取一些 JSON 数据。如果您只需要 'book' 部分中的数据,配置 `content_field = "$.store.book.*"`。 ## 变更日志 ================================================ FILE: docs/zh/connectors/source/LocalFile.md ================================================ import ChangeLog from '../changelog/connector-file-local.md'; # LocalFile > 本地文件数据源连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 在 pollNext 调用中读取分片中的所有数据。读取的分片将保存在快照中。 - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户定义分片](../../introduction/concepts/connector-v2-features.md) - [x] 文件格式类型 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## 描述 从本地文件系统读取数据。 :::tip 如果您使用 spark/flink,为了使用此连接器,您必须确保您的 spark/flink 集群已经集成了 hadoop。测试过的 hadoop 版本是 2.x。 如果您使用 SeaTunnel Engine,则在下载和安装 SeaTunnel Engine 时会自动集成 hadoop jar。您可以检查 `${SEATUNNEL_HOME}/lib` 下的 jar 包来确认这一点。 ::: ## 选项 | 名称 | 类型 | 是否必须 | 默认值 | |----------------------------|---------|------|---------------------| | path | string | 是 | - | | file_format_type | string | 是 | - | | read_columns | list | 否 | - | | delimiter/field_delimiter | string | 否 | \001 | | row_delimiter | string | 否 | \n | | parse_partition_from_path | boolean | 否 | true | | date_format | string | 否 | yyyy-MM-dd | | datetime_format | string | 否 | yyyy-MM-dd HH:mm:ss | | time_format | string | 否 | HH:mm:ss | | skip_header_row_number | long | 否 | 0 | | schema | config | 否 | - | | sheet_name | string | 否 | - | | excel_engine | string | 否 | POI | | xml_row_tag | string | 否 | - | | xml_use_attr_format | boolean | 否 | - | | csv_use_header_line | boolean | 否 | false | | file_filter_pattern | string | 否 | - | | filename_extension | string | 否 | - | | compress_codec | string | 否 | none | | archive_compress_codec | string | 否 | none | | encoding | string | 否 | UTF-8 | | null_format | string | 否 | - | | binary_chunk_size | int | 否 | 1024 | | binary_complete_file_mode | boolean | 否 | false | | sync_mode | 
string | 否 | full | | target_path | string | 否 | - | | target_hadoop_conf | map | 否 | - | | update_strategy | string | 否 | distcp | | compare_mode | string | 否 | len_mtime | | common-options | | 否 | - | | tables_configs | list | 否 | 用于定义多表任务 | | file_filter_modified_start | string | 否 | - | | file_filter_modified_end | string | 否 | - | | enable_file_split | boolean | 否 | false | | file_split_size | long | 否 | 134217728 | | quote_char | string | 否 | - | | escape_char | string | 否 | - | | metalake_type | string | 否 | gravitino | Metalake 服务类型,目前支持 `gravitino`。 | ### path [string] 源文件路径。 ### file_format_type [string] 文件类型,支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` 如果您将文件类型指定为 `json`,您还应该指定 schema 选项来告诉连接器如何将数据解析为您想要的行。 例如: 上游数据如下: ```json {"code": 200, "data": "get success", "success": true} ``` 您也可以在一个文件中保存多条数据并用换行符分割: ```json lines {"code": 200, "data": "get success", "success": true} {"code": 300, "data": "get failed", "success": false} ``` 您应该按如下方式指定 schema: ```hocon schema { fields { code = int data = string success = boolean } } ``` 连接器将生成如下数据: | code | data | success | |------|-------------|---------| | 200 | get success | true | 如果您将文件类型指定为 `parquet` `orc`,则不需要 schema 选项,连接器可以自动找到上游数据的 schema。 如果您将文件类型指定为 `text` `csv`,您可以选择指定或不指定 schema 信息。 例如,上游数据如下: ```text tyrantlucifer#26#male ``` 如果您不指定数据 schema,连接器将把上游数据视为如下: | content | |-----------------------| | tyrantlucifer#26#male | 如果您指定数据 schema,除了 CSV 文件类型外,您还应该指定选项 `field_delimiter` 您应该按如下方式指定 schema 和分隔符: ```hocon field_delimiter = "#" schema { fields { name = string age = int gender = string } } ``` 连接器将生成如下数据: | name | age | gender | |---------------|-----|--------| | tyrantlucifer | 26 | male | 如果您将文件类型指定为 `binary`,SeaTunnel 可以同步任何格式的文件, 例如压缩包、图片等。简而言之,任何文件都可以同步到目标位置。 在此要求下,您需要确保源和接收器同时使用 `binary` 格式进行文件同步。 您可以在下面的示例中找到具体用法。 如果您将文件类型指定为 `markdown`,SeaTunnel 可以解析 markdown 文件并提取结构化数据。 markdown 解析器提取各种元素,包括标题、段落、列表、代码块、表格等。 每个元素都转换为具有以下架构的行: - `element_id`:元素的唯一标识符 - 
`element_type`:元素类型(Heading、Paragraph、ListItem 等) - `heading_level`:标题级别(1-6,非标题元素为 null) - `text`:元素的文本内容 - `page_number`:页码(默认:1) - `position_index`:文档中的位置索引 - `parent_id`:父元素的 ID - `child_ids`:子元素 ID 的逗号分隔列表 注意:Markdown 格式仅支持读取,不支持写入。 ### read_columns [list] 数据源的读取列列表,用户可以使用它来实现字段投影。 ### delimiter/field_delimiter [string] **delimiter** 参数将在 2.3.5 版本后弃用,请使用 **field_delimiter** 代替。 仅在 file_format 为 text 时需要配置。 字段分隔符,用于告诉连接器如何分割字段。 默认 `\001`,与 hive 的默认分隔符相同 ### row_delimiter [string] 仅在 file_format 为 text 时需要配置。 行分隔符,用于告诉连接器如何分割行。 默认 `\n`。 ### parse_partition_from_path [boolean] 控制是否从文件路径解析分区键和值 例如,如果您从路径 `file://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` 读取文件 文件中的每条记录数据都将添加这两个字段: | name | age | |---------------|-----| | tyrantlucifer | 26 | 提示:**不要在 schema 选项中定义分区字段** ### date_format [string] 日期类型格式,用于告诉连接器如何将字符串转换为日期,支持以下格式: `yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` 默认 `yyyy-MM-dd` ### datetime_format [string] 日期时间类型格式,用于告诉连接器如何将字符串转换为日期时间,支持以下格式: `yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` 默认 `yyyy-MM-dd HH:mm:ss` ### time_format [string] 时间类型格式,用于告诉连接器如何将字符串转换为时间,支持以下格式: `HH:mm:ss` `HH:mm:ss.SSS` 默认 `HH:mm:ss` ### skip_header_row_number [long] 跳过前几行,但仅适用于 txt 和 csv。 例如,设置如下: `skip_header_row_number = 2` 然后 SeaTunnel 将跳过源文件的前 2 行 ### schema [config] 仅在 file_format_type 为 text、json、excel、xml 或 csv(或其他我们无法从元数据读取 schema 的格式)时需要配置。 #### fields [Config] 上游数据的 schema 信息。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 #### schema_url [string] 通过 restApi 获取元数据信息的 http url,例如:`http://localhost:8090/api/metalakes/laowang_test/catalogs/221-pgsql/schemas/ykw/tables/all_type` > 当使用 Gravitino 作为元数据源时,Gravitino 的列类型会自动转换为 SeaTunnel 数据类型。详细的类型映射信息请参考 [Gravitino 类型映射](../../introduction/concepts/gravitino-type-mapping.md)。 ### metalake_type [string] Metalake 服务类型,目前仅支持 `gravitino`。当使用 `schema_url` 从 Gravitino 获取元数据时,可以指定此参数(默认为 `gravitino`)。 有关 Metalake 的更多信息,请参考 
[Metalake](../../introduction/concepts/metalake.md)。 ### sheet_name [string] 仅在 file_format 为 excel 时需要配置。 读取工作簿的工作表。 ### excel_engine [string] 仅在 file_format 为 excel 时需要配置。 支持以下文件类型: `POI` `EasyExcel` 默认的 excel 读取引擎是 POI,但当读取超过 65,000 行的 Excel 时,POI 容易导致内存溢出,因此您可以切换到 EasyExcel 作为读取引擎。 ### xml_row_tag [string] 仅在 file_format 为 xml 时需要配置。 指定 XML 文件中数据行的标签名称。 ### xml_use_attr_format [boolean] 仅在 file_format 为 xml 时需要配置。 指定是否使用标签属性格式处理数据。 ### csv_use_header_line [boolean] 是否使用标题行解析文件,仅在 file_format 为 `csv` 且文件包含符合 RFC 4180 的标题行时使用 ### file_filter_pattern [string] 文件过滤模式,用于过滤文件。若只想根据文件名称筛选,则直接写文件名称的正则;若同时想根据文件目录进行过滤,则表达式以`path`起始。 该模式遵循标准正则表达式。详情请参考 https://en.wikipedia.org/wiki/Regular_expression。 以下是一些示例。 若`path`为`/data/seatunnel`,且文件结构示例: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv /data/seatunnel/20241012/logo.png ``` 匹配规则示例: **示例 1**:*匹配所有 .txt 文件*,正则表达式: ``` .*.txt ``` 此示例匹配的结果是: ``` /data/seatunnel/20241001/report.txt ``` **示例 2**:*匹配所有以 abc 开头的文件*,正则表达式: ``` abc.* ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **示例 3**:*匹配20241007文件夹下所有以 abc 开头的文件,且第四个字符为 h 或 g*,正则表达式: ``` /data/seatunnel/20241007/abc[h,g].* ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv ``` **示例 4**:*匹配以 202410 开头的第三级文件夹和以 .csv 结尾的文件*,正则表达式: ``` /data/seatunnel/202410\d*/.*.csv ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### filename_extension [string] 过滤文件扩展名,用于过滤具有特定扩展名的文件。示例:`csv` `.txt` `json` `.xml`。 ### compress_codec [string] 文件的压缩编解码器及其支持的详细信息如下所示: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: 自动识别压缩类型,无需额外设置。 ### archive_compress_codec [string] 归档文件的压缩编解码器及其支持的详细信息如下所示: | archive_compress_codec | file_format | archive_compress_suffix | 
|------------------------|--------------------|-------------------------| | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | | GZ | txt,json,excel,xml | .gz | | NONE | all | .* | 注意:gz 压缩的 excel 文件需要压缩原始文件或指定文件后缀,例如 e2e.xls ->e2e_test.xls.gz ### encoding [string] 仅在 file_format_type 为 json,text,csv,xml 时使用。 要读取的文件的编码。此参数将由 `Charset.forName(encoding)` 解析。 ### null_format [string] 仅在 file_format_type 为 text 时使用。 null_format 定义哪些字符串可以表示为 null。 例如:`\N` ### binary_chunk_size [int] 仅在 file_format_type 为 binary 时使用。 读取二进制文件的块大小(以字节为单位)。默认为 1024 字节。较大的值可能会提高大文件的性能,但会使用更多内存。 ### binary_complete_file_mode [boolean] 仅在 file_format_type 为 binary 时使用。 是否将完整文件作为单个块读取,而不是分割成块。启用时,整个文件内容将一次性读入内存。默认为 false。 ### sync_mode [string] 文件同步模式,支持:`full`(默认)、`update`。 当设置为 `update` 时,对源/目标进行对比,只读取新增/变更文件(目前仅支持 `file_format_type=binary`)。 **性能注意事项** - Update 模式会对每个源文件额外发起一次到目标端的 `getFileStatus` 用于对比。 - 不建议用于海量小文件场景。 **要求 / 限制** - `target_path` 通常应与 sink 的 `path` 一致(同一文件系统且相对路径结构一致)。 - 使用 `update_strategy=distcp` 时,依赖源/目标端时钟同步,否则可能误判。 - 使用 `compare_mode=checksum` 时,需要文件系统支持 checksum;若无法获取 checksum,SeaTunnel 会降级为内容比较(开销更大)并打印告警日志。 示例: ```hocon sync_mode = "update" file_format_type = "binary" target_path = "/path/to/your/sink/path" update_strategy = "distcp" compare_mode = "len_mtime" ``` ### target_path [string] 仅在 `sync_mode=update` 时使用。目标端基础路径(通常应与 sink 的 `path` 一致),用于对比同相对路径文件。 ### target_hadoop_conf [map] 仅在 `sync_mode=update` 时使用。目标端 Hadoop 配置(可选),可在其中设置 `fs.defaultFS` 覆盖目标 defaultFS。 ### update_strategy [string] 仅在 `sync_mode=update` 时使用。支持:`distcp`(默认)、`strict`。 ### compare_mode [string] 仅在 `sync_mode=update` 时使用。支持:`len_mtime`(默认)、`checksum`(仅在 `update_strategy=strict` 时可用)。 ### file_filter_modified_start 按照最后修改时间过滤文件。 要过滤的开始时间(包括该时间),时间格式是:`yyyy-MM-dd HH:mm:ss`。 ### file_filter_modified_end 按照最后修改时间过滤文件。 要过滤的结束时间(不包括该时间),时间格式是:`yyyy-MM-dd HH:mm:ss`。 ### enable_file_split [boolean]
开启文件分割功能,默认为false。文件类型为csv、text、json、parquet非压缩格式时可选择。 **使用建议** - 适合:读取少量大文件,并希望通过更高并行度提升吞吐。 - 不建议:读取大量小文件,或并行度较低的场景(拆分会带来额外的枚举/调度开销)。 **限制说明** - 不支持压缩文件(`compress_codec` != `none`)或归档文件(`archive_compress_codec` != `none`),会自动回退为不拆分。 - 对于 `text`/`csv`/`json`,实际 split 的大小可能略大于 `file_split_size`(因为需要对齐到下一个 `row_delimiter`)。 - LocalFile 内部使用 Hadoop LocalFileSystem(`file:///`),通常不需要额外 Hadoop 配置。 ### file_split_size [long] 文件分割大小,enable_file_split参数为true时可以填写。单位是字节数。默认值为128MB的字节数,即134217728。 **调优建议** - 建议从默认值(128MB)开始:如果并行度未充分利用可适当调小;如果 split 数量过多可适当调大。 - 经验公式:`file_split_size ≈ file_size / 期望并行度`。 ### quote_char [string] 用于包裹 CSV 字段的单字符,可保证包含逗号、换行符或引号的字段被正确解析。 ### escape_char [string] 用于在 CSV 字段内转义引号或其他特殊字符,使其不会结束字段。 ### 通用选项 数据源插件通用参数,请参阅 [数据源通用选项](../common-options/source-common-options.md) 了解详情 ### tables_configs 用于定义多表任务,当您有多个表要读取时,可以使用此选项定义多个表。 ## 示例 ### 单表 ```hocon LocalFile { path = "/apps/hive/demo/student" file_format_type = "parquet" } ``` ```hocon LocalFile { schema { fields { name = string age = int } } path = "/apps/hive/demo/student" file_format_type = "json" } ``` 对于带有 `encoding` 的 json、text 或 csv 文件格式 ```hocon LocalFile { path = "/tmp/hive/warehouse/test2" file_format_type = "text" encoding = "gbk" } ``` ### 多表 ```hocon LocalFile { tables_configs = [ { schema { table = "student" } path = "/apps/hive/demo/student" file_format_type = "parquet" }, { schema { table = "teacher" } path = "/apps/hive/demo/teacher" file_format_type = "parquet" } ] } ``` ```hocon LocalFile { tables_configs = [ { schema { fields { name = string age = int } } path = "/apps/hive/demo/student" file_format_type = "json" }, { schema { fields { name = string age = int } } path = "/apps/hive/demo/teacher" file_format_type = "json" } } ``` ### 传输二进制文件 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { LocalFile { path = "/seatunnel/read/binary/" file_format_type = "binary" binary_chunk_size = 2048 binary_complete_file_mode = false } } sink { // 您可以将本地文件传输到 s3/hdfs/oss 等。 
LocalFile { path = "/seatunnel/read/binary2/" file_format_type = "binary" } } ``` ### 增量同步(sync_mode=update,仅 binary) `sync_mode=update` 会对比 source 与 `target_path`,仅读取新增/变更文件。 多数情况下,`target_path` 需要与 sink 的 `path` 对齐(同一文件系统、相同相对路径)。 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { LocalFile { path = "/seatunnel/read/binary/" file_format_type = "binary" sync_mode = "update" target_path = "/seatunnel/read/binary2/" update_strategy = "distcp" compare_mode = "len_mtime" } } sink { LocalFile { path = "/seatunnel/read/binary2/" tmp_path = "/seatunnel/read/binary2-tmp/" file_format_type = "binary" } } ``` ### 过滤文件 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { LocalFile { path = "/data/seatunnel/" file_format_type = "csv" skip_header_row_number = 1 // 文件示例 abcD2024.csv file_filter_pattern = "abc[DX]*.*" } } sink { Console { } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Maxcompute.md ================================================ import ChangeLog from '../changelog/connector-maxcompute.md'; # Maxcompute > Maxcompute 源连接器 ## 描述 用于从 Maxcompute 读取数据. ## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必需 | 默认值 | |----------------|--------|----|---------------| | accessId | string | 是 | - | | accesskey | string | 是 | - | | endpoint | string | 是 | - | | project | string | 是 | - | | table_name | string | 是 | - | | partition_spec | string | 否 | - | | split_row | int | 否 | 10000 | | read_columns | Array | 否 | - | | table_list | Array | 否 | - | | common-options | string | 否 | | | schema | config | 否 | | ### accessId [string] `accessId` 您的 Maxcompute 密钥 Id, 可以从阿里云访问哪个云. 
### accesskey [string] `accesskey` 您的 Maxcompute 访问密钥, 可以从阿里云获取. ### endpoint [string] `endpoint` 您的 Maxcompute 端点, 以 http 开头. ### project [string] `project` 您在阿里云中创建的Maxcompute项目. ### table_name [string] `table_name` 目标Maxcompute表名,例如:fake. ### partition_spec [string] `partition_spec` Maxcompute分区表的分区规范,例如:ds='20220101'. ### split_row [int] `split_row` 每次拆分的行数,默认值: 10000. ### read_columns [Array] `read_columns` 要读取的列,如果未设置,则将读取所有列。例如: ["col1", "col2"] ### table_list [Array] 要读取的表列表,您可以使用此配置代替 `table_name`. ### common options 源插件常用参数, 详见 [源通用选项](../common-options/source-common-options.md) . ## 示例 ### 表读取 ```hocon source { Maxcompute { accessId="" accesskey="" endpoint="" project="" table_name="" #partition_spec="" #split_row = 10000 #read_columns = ["col1", "col2"] } } ``` ### 使用表列表读取 ```hocon source { Maxcompute { accessId="" accesskey="" endpoint="" project="" # default project table_list = [ { table_name = "test_table" #partition_spec="" #split_row = 10000 #read_columns = ["col1", "col2"] }, { project = "test_project" table_name = "test_table2" #partition_spec="" #split_row = 10000 #read_columns = ["col1", "col2"] } ] } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Milvus.md ================================================ import ChangeLog from '../changelog/connector-milvus.md'; # Milvus > Milvus 源连接器 ## 描述 这个Milvus源连接器从Milvus或Zilliz Cloud读取数据,它具有以下功能: - 支持按分区读写数据 - 支持将动态模式数据读入元数据列 - json数据将转换为json字符串,并在sink端同样以json形式写入 - 自动重试以绕过速率限制和grpc限制 ## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) ## 数据类型映射 | Milvus 数据类型 | SeaTunnel 数据类型 | |---------------------|---------------------| | INT8 | TINYINT | | INT16 | SMALLINT | | INT32 | INT | | INT64 | BIGINT | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | BOOL | BOOLEAN | | JSON | STRING | | ARRAY | ARRAY | | VARCHAR | STRING | |
FLOAT_VECTOR | FLOAT_VECTOR | | BINARY_VECTOR | BINARY_VECTOR | | FLOAT16_VECTOR | FLOAT16_VECTOR | | BFLOAT16_VECTOR | BFLOAT16_VECTOR | | SPARSE_FLOAT_VECTOR | SPARSE_FLOAT_VECTOR | ## 源选项 | 名称 | 类型 | 必需 | 默认值 | 描述 | |------------|--------|----------|---------|--------------------------------------------------------------------------------------------| | url | String | 是 | - | 连接到Milvus或Zilliz Cloud的URL. | | token | String | 是 | - | 用户:密码 | | database | String | 是 | default | 从哪个数据库读取数据. | | collection | String | 否 | - | 如果设置,将只读取一个集合,否则将读取数据库下的所有集合. | ## 任务示例 ```bash source { Milvus { url = "http://127.0.0.1:19530" token = "username:password" database = "default" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/MongoDB-CDC.md ================================================ import ChangeLog from '../changelog/connector-cdc-mongodb.md'; # MongoDB CDC > MongoDB CDC 源连接器 ## 支持这些引擎 > SeaTunnel Zeta
    > Flink
    ## 关键特性 - [ ] [批](../../introduction/concepts/connector-v2-features.md) - [x] [流](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 描述 MongoDB CDC连接器允许从MongoDB数据库读取快照数据和增量数据。 ## 支持的数据源信息 为了使用Mongodb CDC连接器,需要以下依赖关系。 它们可以通过install-plugin.sh或Maven中央存储库下载。 | 数据源 | 支持的版本 | Dependency | |------------|--------------------|-------------------------------------------------------------------------------------------| | MongoDB | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-cdc-mongodb) | ## 可用性设置 1.MongoDB版本:MongoDB版本>=4.0。 2.集群部署:副本集或分片集群。 3.存储引擎:WiredTiger存储引擎。 4.权限:更改流和读取 ``` // 1) 切换到目标数据库 use // 2) 创建角色(CDC 场景常用权限) db.createRole({ role: "", privileges: [ { resource: { db: "", collection: "" }, actions: [ "collStats", "splitVector", "listDatabases", "find", "listCollections", "changeStream" ] } ], roles: [] }) // 3) 创建用户,并绑定 read + 自定义角色 db.createUser({ user: "", pwd: "", roles: [ { role: "read", db: "" }, { role: "", db: "" } ] }) // 4) 为用户追加授予角色(用户已存在或需要补授权时使用) db.grantRolesToUser("", [""]) ``` ## 数据类型映射 下表列出了从MongoDB BSON类型到Seatunnel数据类型的字段数据类型映射。 | MongoDB BSON Type | SeaTunnel 数据类型 | |-------------------|---------------------| | ObjectId | STRING | | String | STRING | | Boolean | BOOLEAN | | Binary | BINARY | | Int32 | INTEGER | | Int64 | BIGINT | | Double | DOUBLE | | Decimal128 | DECIMAL | | Date | DATE | | Timestamp | TIMESTAMP | | Object | ROW | | Array | ARRAY | 对于MongoDB中的特定类型,我们使用扩展JSON格式将其映射到Seatunnel STRING类型。 | MongoDB BSON type | SeaTunnel STRING | |-------------------|----------------------------------------------------------------------------------------------| | Symbol | {"_value": {"$symbol": "12"}} | | RegularExpression | 
{"_value": {"$regularExpression": {"pattern": "^9$", "options": "i"}}} | | JavaScript | {"_value": {"$code": "function() { return 10; }"}} | | DbPointer | {"_value": {"$dbPointer": {"$ref": "db.coll", "$id": {"$oid": "63932a00da01604af329e33c"}}}} | **提示** > 1.在SeaTunnel中使用DECIMAL类型时,请注意最大范围不能超过34位数字,这意味着您应该使用DECIMAL(34,18)。
    ## 源配置项 | Name | 类型 | 必须 | 默认值 | 描述 | |------------------------------------|--------|----------|-------|---------------------------------------------------------------------------------------| | hosts | String | 是 | - | MongoDB服务器的主机名和端口对的逗号分隔列表。如 `localhost:27017,localhost:27018` | | username | String | 否 | - | 连接到MongoDB时要使用的数据库用户的名称。 | | password | String | 否 | - | 连接到MongoDB时使用的密码。 | | database | List | 是 | - | 要监视更改的数据库的名称。如果未设置,则将捕获所有数据库。该数据库还支持正则表达式,以监视与正则表达式匹配的多个数据库。例如db1、db2。 | | collection | List | 是 | - | 要监视更改的数据库中集合的名称。如果未设置,则将捕获所有集合。该集合还支持正则表达式来监视与完全限定的集合标识符匹配的多个集合。例如db1.coll1、db2.coll2。 | | schema | | 否 | - | 数据的结构,包括字段名和字段类型,使用单表cdc。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | tables_configs | | 否 | - | 数据的结构,包括字段名和字段类型,使用多表cdc。 | | connection.options | String | 否 | - | 与号分隔了MongoDB的连接选项。如。 `replicaSet=test&connectTimeoutMS=300000`. | | batch.size | Long | 否 | 1024 | 批量大小。 | | poll.max.batch.size | Enum | 否 | 1024 | 轮询新数据时,单个批中包含的更改流文档的最大数量。 | | poll.await.time.ms | Long | 否 | 1000 | 在检查更改流上的新结果之前等待的时间量。 | | heartbeat.interval.ms | String | 否 | 0 | 发送心跳消息之间的时间长度(毫秒)。使用0禁用。 | | incremental.snapshot.chunk.size.mb | Long | 否 | 64 | 增量快照的块大小(mb)。 | | exactly_once | Boolean| 否 | false | 启用精确一次语义,若开启在大表快照阶段恢复时会有内存溢出风险。 | | common-options | | 否 | - | 源插件常用参数,请参考 [Source Common Options](../common-options/source-common-options.md) | ### 提示 > 1.如果集合更改速度较慢,强烈建议为heartbeat.interval.ms参数设置一个大于0的适当值。当我们从检查点或保存点恢复Seatunnel作业时,心跳事件可以向前推resumeToken以避免其过期。
    > 2.MongoDB对单个文档的限制为16MB。变更文档包含其他信息,因此即使原始文档不超过15MB,变更文档也可能超过16MB的限制,从而导致变更流操作终止。
    > 3.建议使用不可变分片键。在MongoDB中,分片键允许在启用事务后进行修改,但更改分片键可能会导致频繁的分片迁移,从而导致额外的性能开销。此外,修改分片键也可能导致更新查找功能失效,从而导致CDC(变更数据捕获)场景中的结果不一致。
    > 4.“schema”和“tables_configs”是互斥的,只能配置其中一个。 ## 更新数据的流 [**更新流**](https://www.mongodb.com/docs/v5.0/changeStreams/) 是MongoDB 3.6为副本集和分片集群提供的一项新功能,允许应用程序访问实时数据更改,而无需承担直接跟踪(tail)oplog的复杂性和风险。 应用程序可以使用更改流订阅单个集合、数据库或整个部署上的所有数据更改,并立即对其做出反应。 **查找更新操作的完整文档**是**更改流**提供的一项功能,它可以配置更改流以返回更新文档的最新多数提交版本。由于此功能,我们可以轻松收集最新的完整文档,并将更改日志转换为Changelog流。 更新流中删除事件捕获的数据格式:[delete event](https://www.mongodb.com/docs/v5.0/reference/change-events/delete/) ``` { "_id": { }, "operationType": "delete", "clusterTime": , "ns": { "db": "engineering", "coll": "users" }, "documentKey": { "_id": ObjectId("599af247bb69cd89961c986d") } } ``` 由于在更新流游标向客户端发送删除事件时文档已不存在,因此省略了完整文档。 ## 如何创建MongoDB CDC数据同步作业 ### CDC数据打印到客户端 以下示例演示了如何创建数据同步作业,该作业从MongoDB读取cdc数据并将其打印到本地客户端: ```hocon env { # 您可以在此处设置engine配置 parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MongoDB-CDC { hosts = "mongo0:27017" database = ["inventory"] collection = ["inventory.products"] username = stuser password = stpw schema = { table = "inventory.products" fields { "_id" : string, "name" : string, "description" : string, "weight" : string } } } } # 控制台打印读取的Mongodb数据 sink { Console { parallelism = 1 } } ``` ## CDC数据写入MysqlDB 以下示例演示了如何创建数据同步作业,该作业从MongoDB读取cdc数据并写入mysql数据库: ```hocon env { # 您可以在此处设置engine配置 parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MongoDB-CDC { hosts = "mongo0:27017" database = ["inventory"] collection = ["inventory.products"] username = stuser password = stpw schema = { table = "inventory.products" fields { "_id" : string, "name" : string, "description" : string, "weight" : string } } } } sink { jdbc { url = "jdbc:mysql://mysql_cdc_e2e:3306" driver = "com.mysql.cj.jdbc.Driver" user = "st_user" password = "seatunnel" generate_sink_sql = true # 您需要同时配置数据库和表 database = mongodb_cdc table = products primary_keys = ["_id"] } } ``` ## 多表同步 以下示例演示了如何创建数据同步作业,该作业读取多个库表mongodb的cdc数据并将其打印到本地客户端: ```hocon env { # 您可以在此处设置engine配置 parallelism = 1 job.mode = "STREAMING"
checkpoint.interval = 5000 } source { MongoDB-CDC { hosts = "mongo0:27017" database = ["inventory"] collection = ["inventory.products", "inventory.orders"] username = superuser password = superpw tables_configs = [ { schema { table = "inventory.products" fields { "_id" : string, "name" : string, "description" : string, "weight" : string } } }, { schema { table = "inventory.orders" fields { "_id" : string, "order_number" : int, "order_date" : string, "quantity" : int, "product_id" : string } } } ] } } # 控制台打印读取的Mongodb数据 sink { Console { } } ``` ## 实时流数据格式 ```shell { _id : { }, // Identifier of the open change stream, can be assigned to the 'resumeAfter' parameter for subsequent resumption of this change stream "operationType" : "", // The type of change operation that occurred, such as: insert, delete, update, etc. "fullDocument" : { }, // The full document data involved in the change operation. This field does not exist in delete operations "ns" : { "db" : "", // The database where the change operation occurred "coll" : "" // The collection where the change operation occurred }, "to" : { // These fields are displayed only when the operation type is 'rename' "db" : "", // The new database name after the change "coll" : "" // The new collection name after the change }, "source":{ "ts_ms":"", // The timestamp when the change operation occurred "table":"" // The collection where the change operation occurred "db":"", // The database where the change operation occurred "snapshot":"false" // Identify the current stage of data synchronization }, "documentKey" : { "_id" : }, // The _id field value of the document involved in the change operation "updateDescription" : { // Description of the update operation "updatedFields" : { }, // The fields and values that the update operation modified "removedFields" : [ "", ... 
] // The fields and values that the update operation removed } "clusterTime" : , // The timestamp of the Oplog log entry corresponding to the change operation "txnNumber" : , // If the change operation is executed in a multi-document transaction, this field and value are displayed, representing the transaction number "lsid" : { // Represents information related to the Session in which the transaction is located "id" : , "uid" : } } ``` ## 修改日志 ================================================ FILE: docs/zh/connectors/source/MongoDB.md ================================================ import ChangeLog from '../changelog/connector-mongodb.md'; # MongoDB > MongoDB 源连接器 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 描述 MongoDB连接器提供了从MongoDB读取数据和向MongoDB写入数据的能力。 本文档描述了如何设置MongoDB连接器以对MongoDB运行数据读取。 ## 支持的数据源信息 为了使用Mongodb连接器,需要以下依赖关系。 它们可以通过install-plugin.sh或Maven中央存储库下载。 | 数据源 | 支持的版本 | 依赖 | |------------|--------------------|---------------------------------------------------------------------------------------| | MongoDB | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-mongodb) | ## 数据类型映射 下表列出了从MongoDB BSON类型到SeaTunnel数据类型的字段数据类型映射。 | MongoDB BSON type | SeaTunnel 数据类型 | |-------------------|----------------| | ObjectId | STRING | | String | STRING | | Boolean | BOOLEAN | | Binary | BINARY | | Int32 | INTEGER | | Int64 | BIGINT | | Double | DOUBLE | | Decimal128 | DECIMAL | | Date | Date | | Timestamp | Timestamp | | Object | ROW | | Array | ARRAY | 对于MongoDB中的特定类型,我们使用扩展JSON格式将其映射到SeaTunnel STRING类型。 | MongoDB BSON type | SeaTunnel STRING | |-------------------|----------------------------------------------------------------------------------------------| | Symbol | {"_value": {"$symbol": "12"}} | | RegularExpression | {"_value": {"$regularExpression": {"pattern": "^9$", "options": "i"}}} | | JavaScript | {"_value": {"$code": "function() { return 10; }"}} | | DbPointer | {"_value": {"$dbPointer": {"$ref": "db.coll", "$id": {"$oid": "63932a00da01604af329e33c"}}}} | **提示** > 1.在SeaTunnel中使用DECIMAL类型时,请注意最大范围不能超过34位数字,这意味着您应该使用DECIMAL(34,18)。
    ## 源配置项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |----------------------|---------|----|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | uri | String | 是 | - | MongoDB标准连接uri。例如 mongodb://user:password@hosts:27017/database?readPreference=secondary&slaveOk=true. | | database | String | 是 | - | 要读取或写入的MongoDB数据库的名称。 | | collection | String | 是 | - | 要读取或写入的MongoDB集合的名称。 | | schema | String | 是 | - | MongoDB的BSON和seatunnel数据结构映射。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | match.query | String | 否 | - | 在MongoDB中,过滤器用于过滤查询操作的文档。 | | match.projection | String | 否 | - | 在MongoDB中,投影用于控制查询结果中包含的字段。 | | partition.split-key | String | 否 | _id | 分片字段。 | | partition.split-size | Long | 否 | 64 * 1024 * 1024 | 分片大小。 | | cursor.no-timeout | Boolean | 否 | true | MongoDB服务器通常在非活动期(10分钟)后超时空闲游标,以防止过度使用内存。将此选项设置为true以防止这种情况发生。但是,如果应用程序处理当前一批文档的时间超过30分钟,则会话将标记为已过期并关闭。 | | fetch.size | Int | 否 | 2048 | 设置每批从服务器获取的文档数量。设置适当的批大小可以提高查询性能,避免一次获取大量数据造成的内存压力。 | | max.time-min | Long | 否 | 10 | 此参数是一个MongoDB查询选项,用于限制查询操作的最大执行时间。maxTimeMin的值以分钟为单位。如果查询的执行时间超过指定的时间限制,MongoDB将终止操作并返回错误。 | | flat.sync-string | Boolean | 否 | true | 通过使用flatSyncString,只能设置一个字段属性值,字段类型必须是String。此操作将对单个MongoDB数据条目执行字符串映射。 | | common-options | | 否 | - | 源插件常用参数,请参考 [源通用选项](../common-options/source-common-options.md) | ### 提示 > 1.参数`match.query `与历史旧版本参数`matchQuery `兼容,它们是等效的替换。
    ## 如何创建MongoDB数据同步作业 以下示例演示了如何创建数据同步作业,该作业从MongoDB读取数据并将其打印到本地客户端: ```bash # 设置要执行的任务的基本配置 env { parallelism = 1 job.mode = "BATCH" } # 创建MongoDB源 source { MongoDB { uri = "mongodb://user:password@127.0.0.1:27017" database = "test_db" collection = "source_table" schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_int = int c_bigint = bigint c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_int = int c_bigint = bigint c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp } } } } } # 控制台打印读取的Mongodb数据 sink { Console { parallelism = 1 } } ``` ## 参数说明 ### MongoDB数据库连接URI示例 未经身份验证的单节点连接: ```bash mongodb://192.168.0.100:27017/mydb ``` 副本集连接: ```bash mongodb://192.168.0.100:27017/mydb?replicaSet=xxx ``` 经过身份验证的副本集连接: ```bash mongodb://admin:password@192.168.0.100:27017/mydb?replicaSet=xxx&authSource=admin ``` 多节点副本集连接: ```bash mongodb://192.168.0.1:27017,192.168.0.2:27017,192.168.0.3:27017/mydb?replicaSet=xxx ``` 分片集群连接: ```bash mongodb://192.168.0.100:27017/mydb ``` 多个mongos连接: ```bash mongodb://192.168.0.1:27017,192.168.0.2:27017,192.168.0.3:27017/mydb ``` 注意:URI中的用户名和密码在连接到连接字符串之前必须进行URL编码。 ### 匹配查询扫描 在数据同步场景中,需要尽早使用matchQuery方法来减少后续操作员需要处理的文档数量,从而提高性能。 下面是一个使用`match.query的seatunnel的简单示例` ```bash source { MongoDB { uri = "mongodb://user:password@127.0.0.1:27017" database = "test_db" collection = "orders" match.query = "{status: \"A\"}" schema = { fields { id = bigint status = string } } } } ``` 以下是各种数据类型的MatchQuery查询语句的示例: ```bash # Query Boolean type "{c_boolean:true}" # Query string type "{c_string:\"OCzCj\"}" # Query the integer "{c_int:2}" # Type of query time "{c_date:ISODate(\"2023-06-26T16:00:00.000Z\")}" # Query floating point type {c_double:{$gte:1.71763202185342e+308}} ``` 
请参阅如何编写`match.query的语法`:https://www.mongodb.com/docs/manual/tutorial/query-documents ### 投影扫描 在MongoDB中,Projection用于控制查询结果中包含哪些字段。这可以通过指定哪些字段需要返回,哪些字段不需要返回来实现。 在find()方法中,投影对象可以作为第二个参数传递。投影对象的键表示要包含或排除的字段,值1表示包含,0表示排除。 这里有一个简单的例子,假设我们有一个名为users的集合: ```bash # Returns only the name and email fields db.users.find({}, { name: 1, email: 0 }); ``` 在数据同步场景中,需要尽早使用投影来减少后续操作员需要处理的文档数量,从而提高性能。 以下是一个使用投影的seatunnel的简单示例: ```bash source { MongoDB { uri = "mongodb://user:password@127.0.0.1:27017" database = "test_db" collection = "users" match.projection = "{ name: 1, email: 0 }" schema = { fields { name = string } } } } ``` ### 分区扫描 为了加快并行源任务实例中的数据读取速度,seatunnel为MongoDB集合提供了分区扫描功能。提供了以下分区策略。 用户可以通过设置用于分片字段的partition.split-key和用于分片大小的partition.split-size来控制数据分片。 ```bash source { MongoDB { uri = "mongodb://user:password@127.0.0.1:27017" database = "test_db" collection = "users" partition.split-key = "id" partition.split-size = 1024 schema = { fields { id = bigint status = string } } } } ``` ### Flat Sync String 通过使用“flat.sync string”,只能设置一个字段属性值,并且字段类型必须是string。 此操作将对单个MongoDB数据条目执行字符串映射。 ```bash env { parallelism = 10 job.mode = "BATCH" } source { MongoDB { uri = "mongodb://user:password@127.0.0.1:27017" database = "test_db" collection = "users" flat.sync-string = true schema = { fields { data = string } } } } sink { Console {} } ``` 使用与修改后的参数同步的数据样本,例如: ```json { "_id":{ "$oid":"643d41f5fdc6a52e90e59cbf" }, "c_map":{ "OQBqH":"jllt", "rkvlO":"pbfdf", "pCMEX":"hczrdtve", "DAgdj":"t", "dsJag":"voo" }, "c_array":[ { "$numberInt":"-865590937" }, { "$numberInt":"833905600" }, { "$numberInt":"-1104586446" }, { "$numberInt":"2076336780" }, { "$numberInt":"-1028688944" } ], "c_string":"bddkzxr", "c_boolean":false, "c_tinyint":{ "$numberInt":"39" }, "c_smallint":{ "$numberInt":"23672" }, "c_int":{ "$numberInt":"-495763561" }, "c_bigint":{ "$numberLong":"3768307617923954543" }, "c_float":{ "$numberDouble":"5.284220288280258E37" }, "c_double":{ "$numberDouble":"1.1706091642478246E308" }, 
"c_bytes":{ "$binary":{ "base64":"ZWJ4", "subType":"00" } }, "c_date":{ "$date":{ "$numberLong":"1686614400000" } }, "c_decimal":{ "$numberDecimal":"683265300" }, "c_timestamp":{ "$date":{ "$numberLong":"1684283772000" } }, "c_row":{ "c_map":{ "OQBqH":"cbrzhsktmm", "rkvlO":"qtaov", "pCMEX":"tuq", "DAgdj":"jzop", "dsJag":"vwqyxtt" }, "c_array":[ { "$numberInt":"1733526799" }, { "$numberInt":"-971483501" }, { "$numberInt":"-1716160960" }, { "$numberInt":"-919976360" }, { "$numberInt":"727499700" } ], "c_string":"oboislr", "c_boolean":true, "c_tinyint":{ "$numberInt":"-66" }, "c_smallint":{ "$numberInt":"1308" }, "c_int":{ "$numberInt":"-1573886733" }, "c_bigint":{ "$numberLong":"4877994302999518682" }, "c_float":{ "$numberDouble":"1.5353209063652051E38" }, "c_double":{ "$numberDouble":"1.1952441956458565E308" }, "c_bytes":{ "$binary":{ "base64":"cWx5Ymp0Yw==", "subType":"00" } }, "c_date":{ "$date":{ "$numberLong":"1686614400000" } }, "c_decimal":{ "$numberDecimal":"656406177" }, "c_timestamp":{ "$date":{ "$numberLong":"1684283772000" } } }, "id":{ "$numberInt":"2" } } ``` ## 修改日志 ================================================ FILE: docs/zh/connectors/source/MyHours.md ================================================ import ChangeLog from '../changelog/connector-http-myhours.md'; # My Hours > My Hours 源连接器 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 描述 用于从 My Hours 读取数据。 ## 支持的数据源信息 为了使用 My Hours 连接器,需要以下依赖项。 可以通过 install-plugin.sh 或从 Maven 中央存储库下载。 | 数据源 | 支持的版本 | 依赖 | |--------|-----------|------| | My Hours | universal | [下载](https://mvnrepository.com/artifact/org.apache.seatunnel) | ## 源选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |-----------------------------|---------|----|-------|---------------------------------------------------------------------------------------------| | url | String | 是 | - | HTTP 请求 URL | | email | String | 是 | - | My Hours 登录电子邮件地址 | | password | String | 是 | - | My Hours 登录密码 | | schema | Config | 否 | - | HTTP 和 SeaTunnel 数据结构映射。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | schema.fields | Config | 否 | - | 上游数据的模式字段 | | json_field | Config | 否 | - | 此参数帮助您配置模式,因此此参数必须与 schema 一起使用。 | | content_json | String | 否 | - | 此参数可以获取一些 JSON 数据。 | | format | String | 否 | json | 上游数据的格式,现在仅支持 `json` `text`,默认 `json`。 | | method | String | 否 | get | HTTP 请求方法,仅支持 GET、POST 方法。 | | headers | Map | 否 | - | HTTP 请求头 | | params | Map | 否 | - | HTTP 参数 | | body | String | 否 | - | HTTP 请求体 | | poll_interval_millis | Int | 否 | - | 流模式下请求 HTTP API 的间隔(毫秒) | | retry | Int | 否 | - | 如果 HTTP 请求返回 `IOException` 的最大重试次数 | | retry_backoff_multiplier_ms | Int | 否 | 100 | HTTP 请求失败时的重试退避倍数(毫秒) | | retry_backoff_max_ms | Int | 否 | 10000 | HTTP 请求失败时的最大重试退避时间(毫秒) | | enable_multi_lines | Boolean | 否 | false | 是否启用多行模式 | | common-options | | 否 | - | 源插件通用参数 | ## 如何创建 My Hours 数据同步作业 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { MyHours{ url = 
"https://api2.myhours.com/api/Projects/getAll" email = "seatunnel@test.com" password = "seatunnel" schema { fields { name = string archived = boolean dateArchived = string dateCreated = string clientName = string budgetAlertPercent = string budgetType = int totalTimeLogged = double budgetValue = double totalAmount = double totalExpense = double laborCost = double totalCost = double billableTimeLogged = double totalBillableAmount = double billable = boolean roundType = int roundInterval = int budgetSpentPercentage = double budgetTarget = int budgetPeriodType = string budgetSpent = string id = string } } } } # 控制台打印读取的数据 sink { Console { parallelism = 1 } } ``` ## 参数解释 ### format 当您指定格式为 `json` 时,您还应该指定 schema 选项。 ### content_json 此参数可以获取一些 JSON 数据。如果您只需要 'book' 部分中的数据,配置 `content_field = "$.store.book.*"`。 ### json_field 此参数帮助您配置模式,因此此参数必须与 schema 一起使用。 ## 变更日志 ================================================ FILE: docs/zh/connectors/source/MySQL-CDC.md ================================================ import ChangeLog from '../changelog/connector-cdc-mysql.md'; # MySQL CDC > MySQL CDC source 连接器 ## 支持这些引擎 > SeaTunnel Zeta
    > Flink
    ## 描述 MySQL CDC连接器允许从MySQL数据库读取快照和增量数据. 本文档描述了如何配置MySQL CDC连接器以对MySQL数据库运行SQL查询. ## 主要功能 - [ ] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [x] [支持自定义分片](../../introduction/concepts/connector-v2-features.md) ## 支持的数据源信息 | 数据源 | 支持的版本 | Driver | Url | Maven | |------------|------------------------------------------------------------------------------------------------------------------------------------|--------------------------|----------------------------------|----------------------------------------------------------------------| | MySQL |
  • [MySQL](https://dev.mysql.com/doc): 5.5, 5.6, 5.7, 8.0.x
  • [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x
  • | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java/8.0.28 | ## 依赖 ### 安装Jdbc驱动 #### 对于Flink引擎 > 1. 你需要确保 [jdbc 驱动 jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 已经放在目录 `${SEATUNNEL_HOME}/plugins/`. #### 对于SeaTunnel Zeta引擎 > 1. 你需要确保 [jdbc 驱动 jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 已经放在目录 `${SEATUNNEL_HOME}/lib/`. ### 创建MySQL用户 你必须定义一个MySQL用户,该用户对Debezium MySQL连接器所监控的所有数据库拥有适当的权限. 1. 创建MySQL用户: ```sql mysql> CREATE USER 'user'@'localhost' IDENTIFIED BY 'password'; ``` 2. 给用户赋予所需权限: ```sql mysql> GRANT SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'user' IDENTIFIED BY 'password'; ``` 3. 最终确定用户权限: ```sql mysql> FLUSH PRIVILEGES; ``` ### 启用MySQL Binlog 一定要为MySQL复制启用binlog。binlog记录事务更新以供复制工具传播更改. 1. 检查`log-bin`是否已经设置为on: ```sql mysql> show variables where variable_name in ('log_bin', 'binlog_format', 'binlog_row_image', 'gtid_mode', 'enforce_gtid_consistency'); +--------------------------+----------------+ | Variable_name | Value | +--------------------------+----------------+ | binlog_format | ROW | | binlog_row_image | FULL | | enforce_gtid_consistency | ON | | gtid_mode | ON | | log_bin | ON | +--------------------------+----------------+ ``` 2. 如果`log_bin`的值不是`on`, 配置你的MySQL server配置文件(`$MYSQL_HOME/mysql.cnf`),配置文件中包含以下属性,这些属性在以下表格中有描述: ``` # Enable binary replication log and set the prefix, expiration, and log format. # The prefix is arbitrary, expiration can be short for integration tests but would # be longer on a production system. Row-level info is required for ingest to work. 
# Server ID is required, but this will vary on production systems server-id = 223344 log_bin = mysql-bin expire_logs_days = 10 binlog_format = row # mysql 5.6+ requires binlog_row_image to be set to FULL binlog_row_image = FULL # optional enable gtid mode # mysql 5.6+ requires gtid_mode to be set to ON, but not required by mysql 8.0+ gtid_mode = on enforce_gtid_consistency = on ``` 3. 重启MySQL Server ```shell /etc/init.d/mysqld restart ``` 4. 修改之后再检查一次binlog的状态: MySQL 5.5: ```sql mysql> show variables where variable_name in ('log_bin', 'binlog_format', 'binlog_row_image', 'gtid_mode', 'enforce_gtid_consistency'); +--------------------------+----------------+ | Variable_name | Value | +--------------------------+----------------+ | binlog_format | ROW | | log_bin | ON | +--------------------------+----------------+ ``` MySQL 5.6+: ```sql mysql> show variables where variable_name in ('log_bin', 'binlog_format', 'binlog_row_image', 'gtid_mode', 'enforce_gtid_consistency'); +--------------------------+----------------+ | Variable_name | Value | +--------------------------+----------------+ | binlog_format | ROW | | binlog_row_image | FULL | | enforce_gtid_consistency | ON | | gtid_mode | ON | | log_bin | ON | +--------------------------+----------------+ ``` MySQL 8.0+: ```sql show variables where variable_name in ('log_bin', 'binlog_format', 'binlog_row_image', 'gtid_mode', 'enforce_gtid_consistency') +--------------------------+----------------+ | Variable_name | Value | +--------------------------+----------------+ | binlog_format | ROW | | binlog_row_image | FULL | | enforce_gtid_consistency | OFF | | gtid_mode | OFF | | log_bin | ON | +--------------------------+----------------+ ``` ### 提示 #### 配置MySQL session超时时长 当为大型数据库初始一致快照时,已建立的连接可能在读取表时超时。可以通过在MySQL配置文件中配置interactive_timeout(交互超时时间)和wait_timeout(等待超时时间)来防止这种行为. - `interactive_timeout`: 服务器在关闭交互连接之前等待活动(交互操作)的秒数. 
详见 [MySQL’s documentation](https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_interactive_timeout). - `wait_timeout`: 服务器在关闭非交互式连接之前等待其活动的秒数. 详见 [MySQL’s documentation](https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_wait_timeout). *更多的数据库配置,见 [Debezium MySQL Connector](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/mysql.adoc#setting-up-mysql)* ## 数据类型映射 | Mysql数据类型 | SeaTunnel数据类型 | |------------------------------------------------------------------------------------------------|---------------| | BIT(1)
    TINYINT(1) | BOOLEAN | | TINYINT | TINYINT | | TINYINT UNSIGNED
    SMALLINT | SMALLINT | | SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | | INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | BIGINT | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(p, s)
    DECIMAL(p, s) UNSIGNED
    NUMERIC(p, s)
    NUMERIC(p, s) UNSIGNED | DECIMAL(p,s) | | FLOAT
    FLOAT UNSIGNED | FLOAT | | DOUBLE
    DOUBLE UNSIGNED
    REAL
    REAL UNSIGNED | DOUBLE | | CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    ENUM
    JSON
    ENUM | STRING | | DATE | DATE | | TIME(s) | TIME(s) | | DATETIME
    TIMESTAMP(s) | TIMESTAMP(s) | | BINARY
    VARBINARY
    BIT(p)
    TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    GEOMETRY | BYTES | ## 配置参数选项 | 参数名称 | 类型 | 是否必须 | 默认值 | 描述 | |-------------------------------------------|----------|------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC连接的URL. 例如: `jdbc:mysql://localhost:3306/test`. | | username | String | 是 | - | 用来连接到数据库服务的数据库名称. | | password | String | 是 | - | 连接到数据库服务所使用的密码. | | database-names | List | 否 | - | 要监控的数据库名称. | | database-pattern | String | 否 | .* | 要捕获的数据库名称的正则表达式, 例如: `database_prefix.*`. | | table-names | List | 是 | - | 要监控的表名. 表名需要包括库名, 例如: `database_name.table_name` | | table-pattern | String | 是 | - | 要捕获的表名称的正则表达式. 表名需要包括库名, 例如: `database.*\\.table_.*` | | table-names-config | List | 否 | - | 表配置的列表集合. 例如: [{"table": "db1.schema1.table1","primaryKeys": ["key1"],"snapshotSplitColumn": "key2"}] | | startup.mode | Enum | 否 | INITIAL | MySQL CDC 消费者的可选启动模式, 有效枚举值为 `initial`, `earliest`, `latest` , `specific` 和 `timestamp`.
    `initial`: 启动时同步历史数据, 然后同步增量数据.
    `earliest`: 从尽可能最早的偏移量开始启动.
    `latest`: 从最近的偏移量启动.
    `specific`: 从用户提供的特定偏移量开始启动.
    `timestamp`: 从用户提供的特定时间戳开始启动. | | startup.specific-offset.file | String | 否 | - | 从指定的binlog日志文件名开始. **注意, 当使用 `startup.mode` 选项为 `specific` 时,此选项为必填项.** | | startup.specific-offset.pos | Long | 否 | - | 从指定的binlog日志文件位置开始. **注意, 当使用 `startup.mode` 选项为 `specific` 时,此选项为必填项.** | | startup.timestamp | Long | 否 | - | 从指定的binlog时间戳文件位置开始. **注意, 当使用 `startup.mode` 选项为 `timestamp` 时,此选项为必填项.** | | stop.mode | Enum | 否 | NEVER | MySQL CDC 消费者的可选停止模式, 有效枚举值为 `never`, `latest` 和 `specific`.
    `never`: 实时任务一直运行不停止.
    `latest`: 从最新的偏移量处停止.
    `specific`: 从用户提供的特定偏移量处停止. | | stop.specific-offset.file | String | 否 | - | 从指定的binlog日志文件名停止. **注意, 当使用 `stop.mode` 选项为 `specific` 时,此选项为必填项.** | | stop.specific-offset.pos | Long | 否 | - | 从指定的binlog日志文件位置停止. **注意, 当使用 `stop.mode` 选项为 `specific` 时,此选项为必填项.** | | snapshot.split.size | Integer | 否 | 8096 | 表快照的分割大小(行数),读取表的快照时,被捕获的表会被分割成多个分割块. | | snapshot.fetch.size | Integer | 否 | 1024 | 每次轮询读取表快照时的最大获取大小. | | server-id | String | 否 | - | 此数据库客户端的数字 ID 或数字 ID 范围, 数字 ID 的语法如 `5400`, 数字 ID 范围的语法如 '5400-5408'.
    每个 ID 在 MySQL 集群中所有当前正在运行的数据库进程里必须是唯一的. 此连接器会以另一个
    服务器的身份 (使用此唯一 ID) 加入 MySQL 集群, 以便读取 binlog.
    默认情况下, 会生成一个介于 6500 到 2,148,492,146 之间的数字, 然而我们建议设置一个明确的值. | | server-time-zone | String | 否 | UTC | 数据库服务中的会话时区. 如果没设置, 使用 ZoneId.systemDefault() 来确定服务的时区. | | connect.timeout.ms | Duration | 否 | 30000 | 连接器在尝试连接数据库服务器后,在超时之前应等待的最长时间. | | connect.max-retries | Integer | 否 | 3 | 连接器在构建数据库服务器连接时应重试的最大重试次数. | | connection.pool.size | Integer | 否 | 20 | jdbc连接池大小. | | chunk-key.even-distribution.factor.upper-bound | Double | 否 | 100 | 块键分布因子的上限. 该因子用于确定表数据是否分布均匀. 如果分布式因子计算结果小于或等于此上限 (即., (MAX(id) - MIN(id) + 1) / row count), 表的分块将被优化以实现均匀分布. 否则, 如果分布因子大于此上限, 该表将被视为分布不均, 并且如果估计的分片数量超过 `sample-sharding.threshold` 所指定的值, 则将使用基于采样的分片策略. 默认值是100.0. | | chunk-key.even-distribution.factor.lower-bound | Double | 否 | 0.05 | 块键分布因子的下限. 该因子用于确定表数据是否分布均匀. 如果计算得出的分布因子大于或等于此下限 (即., (MAX(id) - MIN(id) + 1) / row count), 表的分块将被优化以实现均匀分布. 否则, 如果分布因子小于此下限, 该表将被视为分布不均, 并且如果预估的分片数量超过了 `sample-sharding.threshold` 所指定的值,则将使用基于采样的分片策略. 默认值是 0.05. | | sample-sharding.threshold | Integer | 否 | 1000 | 此配置指定了触发采样分片策略的预估分片数量阈值. 当分配因子超出由 `chunk-key.even-distribution.factor.upper-bound` 和 `chunk-key.even-distribution.factor.lower-bound` 所指定的范围时, 如果估计的分片数量 (按近似行数/块大小 计算) 超过此阈值, 则将使用样本分片策略. 这有助于更高效地处理大型数据集. 默认值为 1000 分片. | | inverse-sampling.rate | Integer | 否 | 1000 | 采样分片策略中使用的采样率的倒数. 例如, 如果该值设置为 1000, 则表示在采样过程中应用了 1/1000 的采样率. 此选项在控制采样的粒度方面提供了灵活性, 从而影响最终的分片数量. 在处理非常大的数据集时非常有用, 因为此时更倾向于使用较低的采样率. 默认值为 1000. | | exactly_once | Boolean | 否 | false | 启用精确一次语义. | | format | Enum | 否 | DEFAULT | MySQL CDC 的可选输出格式, 有效的枚举值为 `DEFAULT`、`COMPATIBLE_DEBEZIUM_JSON`. | | schema-changes.enabled | Boolean | 否 | false | 模式演进默认是禁用的. 当前我们只支持 `add column`、`drop column`、`rename column` 和 `modify column`. | | debezium | Config | 否 | - | 传递 [Debezium的属性](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/mysql.adoc#connector-properties) 给Debezium嵌入式引擎, 该引擎用于捕获 MySQL 服务的数据变更. 
| | int_type_narrowing | Boolean | 否 | true | Int类型收窄,如果为 true,则 tinyint(1) 类型将被收窄为 boolean 类型(如果没有精度损失)。目前仅支持 MySQL。 | | common-options | | 否 | - | Source插件通用参数, 详见 [Source Common Options](../common-options/source-common-options.md) | ### int_type_narrowing Int类型收窄,如果为 true,则 tinyint(1) 类型将被收窄为 boolean 类型(如果没有精度损失)。目前仅支持 MySQL。 例: int_type_narrowing = true | MySQL | SeaTunnel | |------------|-----------| | TINYINT(1) | Boolean | int_type_narrowing = false | MySQL | SeaTunnel | |------------|-----------| | TINYINT(1) | TINYINT | ## 任务示例 ### 简单的示例 > 支持多表读取 ``` env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 10000 } source { MySQL-CDC { url = "jdbc:mysql://localhost:3306/testdb" username = "root" password = "root@123" table-names = ["testdb.table1", "testdb.table2"] startup.mode = "initial" } } sink { Console { } } ``` ### 支持向Kafka发送与Debezium兼容的格式 > 一定是使用kafka作为sink, 详见 [compatible debezium format](../formats/cdc-compatible-debezium-json.md) ### 支持表的自定义主键 ``` env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 10000 } source { MySQL-CDC { url = "jdbc:mysql://localhost:3306/testdb" username = "root" password = "root@123" table-names = ["testdb.table1", "testdb.table2"] table-names-config = [ { table = "testdb.table2" primaryKeys = ["id"] } ] } } sink { Console { } } ``` ### 支持模式演变(表结构变更) ``` env { # You can set engine configuration here parallelism = 5 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { jdbc { url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" driver = "com.mysql.cj.jdbc.Driver" user = "st_user_sink" password = "mysqlpw" generate_sink_sql = true database = shop table = mysql_cdc_e2e_sink_table_with_schema_change_exactly_once primary_keys = ["id"] 
is_exactly_once = true xa_data_source_class_name = "com.mysql.cj.jdbc.MysqlXADataSource" } } ``` ### 表名支持正则以读取多个表 > `table-pattern` 和 `table-names` 只能选择一个 ```hocon env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652 username = "st_user_source" password = "mysqlpw" database-pattern = "source.*" table-pattern = "source.*\\..*" url = "jdbc:mysql://mysql_cdc_e2e:3306" } } sink { Console { } } ``` ## 更新日志 ================================================ FILE: docs/zh/connectors/source/Mysql.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # MySQL > JDBC Mysql 源连接器 ## 描述 通过 JDBC 读取外部数据源数据。 ## 支持 Mysql 版本 - 5.5/5.6/5.7/8.0/8.1/8.2/8.3/8.4 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 需要的依赖项 ### 对于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 ### 对于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 ## 主要功能 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户定义的拆分](../../introduction/concepts/connector-v2-features.md) - [x] [支持多表读取](../../introduction/concepts/connector-v2-features.md) > 支持 SQL 查询,并能实现列投影效果 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动器 | 网址 | Maven下载链接 | |-----|---------------------------------------------------------|--------------------------|---------------------------------------|---------------------------------------------------------------------| | Mysql | 不同的依赖版本具有不同的驱动程序类。 | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | [下载](https://mvnrepository.com/artifact/mysql/mysql-connector-java) | ## 数据类型映射 | Mysql 数据类型 | SeaTunnel 数据类型 | |---------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------| | BIT(1)
    TINYINT(1) | BOOLEAN | | TINYINT | BYTE | | TINYINT UNSIGNED
    SMALLINT | SMALLINT | | SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | | INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | BIGINT | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(x,y)(获取指定列的列大小<38) | DECIMAL(x,y) | | DECIMAL(x,y)(获取指定列的列大小>38) | DECIMAL(38,18) | | DECIMAL UNSIGNED | DECIMAL((获取指定列的列大小)+1,
    (获取指定列的小数点右侧的位数)) | | FLOAT
    FLOAT UNSIGNED | FLOAT | | DOUBLE
    DOUBLE UNSIGNED | DOUBLE | | CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    JSON
    ENUM | STRING | | DATE | DATE | | TIME(s) | TIME(s) | | DATETIME
    TIMESTAMP(s) | TIMESTAMP(s) | | TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    BINARY
    VARBINARY
    BIT(n)
    GEOMETRY | BYTES | ## 数据源参数 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |--------------------------------------------|------------|------|-------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC 连接的 URL。参见示例:
    `jdbc:mysql://localhost:3306/test`。 | | driver | String | 是 | - | 用于连接远程数据源的 JDBC 类名,
    如果使用 MySQL,值为 `com.mysql.cj.jdbc.Driver`。 | | username | String | 否 | - | 连接实例用户名。 | | password | String | 否 | - | 连接实例密码。 | | query | String | 是 | - | 查询语句。 | | connection_check_timeout_sec | Int | 否 | 30 | 验证数据库连接所使用的操作完成的等待时间(秒)。 | | partition_column | String | 否 | - | 用于并行度分区的列名,仅支持数字类型,仅支持数字类型的主键,并且只能配置一列。 | | partition_lower_bound | BigDecimal | 否 | - | 扫描时 `partition_column` 的最小值,如果未设置,`SeaTunnel` 将查询数据库以获取最小值。 | | partition_upper_bound | BigDecimal | 否 | - | 扫描时 `partition_column` 的最大值,如果未设置,`SeaTunnel` 将查询数据库以获取最大值。 | | partition_num | Int | 否 | 作业并行度 | 分区数量,仅支持正整数。
    默认值为作业并行度。 | | fetch_size | Int | 否 | 0 | 对于返回大量对象的查询,可以配置查询的行提取大小,以通过减少满足选择条件所需的数据库访问次数来提高性能。
    设置为零表示使用 `JDBC` 的默认值。 | | properties | Map | 否 | - | 额外的连接配置参数,当属性和 URL 中有相同的参数时,优先级由驱动程序的具体实现决定。
    例如,在 MySQL 中,属性优先于 URL。 | | use_regex | Boolean | 否 | false | 控制表路径的正则表达式匹配。当设置为true时,table_path 将被视为正则表达式模式。当设置为false或未指定时,table_path 将被视为精确路径(不进行正则匹配)。 | | table_path | String | 否 | - | 表的完整路径,您可以使用此配置代替 `query`。
    示例:
    "testdb.table1" | | table_list | Array | 否 | - | 要读取的表的列表,您可以使用此配置代替 `table_path`,示例如下: ```[{ table_path = "testdb.table1"}, {table_path = "testdb.table2", query = "select * id, name from testdb.table2"}]``` | | where_condition | String | 否 | - | 所有表/查询的通用行过滤条件,必须以 `where` 开头。例如 `where id > 100`。 | | split.size | Int | 否 | 8096 | 表的分割大小(行数),当读取表时,捕获的表会被分割成多个分片。 | | split.even-distribution.factor.lower-bound | Double | 否 | 0.05 | 分片键分布因子的下限。该因子用于判断表数据的分布是否均匀。如果计算得到的分布因子大于或等于该下限(即,(MAX(id) - MIN(id) + 1) / 行数),则会对表的分片进行优化,以确保数据的均匀分布。反之,如果分布因子较低,则表数据将被视为分布不均匀。如果估算的分片数量超过 `sample-sharding.threshold` 所指定的值,则会采用基于采样的分片策略。默认值为 0.05。 | | split.even-distribution.factor.upper-bound | Double | 否 | 100 | 分片键分布因子的上限。该因子用于判断表数据的分布是否均匀。如果计算得到的分布因子小于或等于该上限(即,(MAX(id) - MIN(id) + 1) / 行数),则会对表的分片进行优化,以确保数据的均匀分布。反之,如果分布因子较大,则表数据将被视为分布不均匀,并且如果估算的分片数量超过 `sample-sharding.threshold` 所指定的值,则会采用基于采样的分片策略。默认值为 100.0。 | | split.sample-sharding.threshold | Int | 否 | 10000 | 此配置指定了触发样本分片策略的估算分片数阈值。当分布因子超出由 `chunk-key.even-distribution.factor.upper-bound` 和 `chunk-key.even-distribution.factor.lower-bound` 指定的范围,并且估算的分片数量(计算方法为大致行数 / 分片大小)超过此阈值时,将使用样本分片策略。此配置有助于更高效地处理大型数据集。默认值为 1000 个分片。 | | split.inverse-sampling.rate | Int | 否 | 1000 | 样本分片策略中使用的采样率的倒数。例如,如果该值设置为 1000,则表示在采样过程中应用 1/1000 的采样率。此选项提供了灵活性,可以控制采样的粒度,从而影响最终的分片数量。特别适用于处理非常大的数据集,在这种情况下通常会选择较低的采样率。默认值为 1000。 | | int_type_narrowing | Boolean | 否 | true | Int类型收窄,如果为 true,则 tinyint(1) 类型将被收窄为 boolean 类型(如果没有精度损失)。目前仅支持 MySQL。 | | common-options | | 否 | - | 源插件的常见参数,请参阅 [源常见参数](../common-options/source-common-options.md) 了解详细信息。 | ### int_type_narrowing Int类型收窄,如果为 true,则 tinyint(1) 类型将被收窄为 boolean 类型(如果没有精度损失)。目前仅支持 MySQL。 例: int_type_narrowing = true | MySQL | SeaTunnel | |------------|-----------| | TINYINT(1) | Boolean | int_type_narrowing = false | MySQL | SeaTunnel | |------------|-----------| | TINYINT(1) | TINYINT | ## 并行读取器 JDBC 源连接器支持从表中并行读取数据。SeaTunnel 将使用特定规则将表中的数据进行分割,然后将这些数据交给读取器进行读取。读取器的数量由 `parallelism` 选项决定。 
**拆分键规则:** 1. 如果 `partition_column` 不为空,它将用于计算数据的分片。该列必须属于 **支持的分片数据类型**。 2. 如果 partition_column 为空,SeaTunnel 将从表中读取模式并获取主键和唯一索引。如果主键和唯一索引中有多个列,则会选择第一个属于 **支持的分片数据类型** 的列来进行数据分片。例如,如果表的主键是 `(nn guid, name varchar)`,因为 `guid` 不属于 **支持的分片数据类型**,所以会选择列 `name` 来进行数据分片。 **支持的拆分数据类型:** * String * Number(int, bigint, decimal, ...) * Date ### 与拆分相关的参数 #### split.size 每个分片中的行数,捕获的表在读取时会被分成多个分片。 #### split.even-distribution.factor.lower-bound > 不推荐使用 分片键分布因子的下限。该因子用于判断表数据是否均匀分布。如果计算出的分布因子大于或等于此下限(即,(最大(id) - 最小(id) + 1)/ 行数),则表的分片将被优化为均匀分布。否则,如果分布因子较小,则表的数据将被认为是不均匀分布的。如果估算的分片数量超过 `sample-sharding.threshold` 所指定的值,将使用基于采样的分片策略。默认值为 0.05。 #### split.even-distribution.factor.upper-bound > 不推荐使用 分片键分布因子的上限。该因子用于判断表数据是否均匀分布。如果计算出的分布因子小于或等于此上限(即,(最大(id) - 最小(id) + 1)/ 行数),则表的分片将被优化为均匀分布。否则,如果分布因子较大,则表的数据将被认为是不均匀分布的。如果估算的分片数量超过 `sample-sharding.threshold` 所指定的值,将使用基于采样的分片策略。默认值为 100.0。 #### split.sample-sharding.threshold 此配置指定了触发采样分片策略的估算分片数量阈值。当分布因子超出 `chunk-key.even-distribution.factor.upper-bound` 和 `chunk-key.even-distribution.factor.lower-bound` 指定的范围,并且估算的分片数量(按大致行数除以分片大小计算)超过该阈值时,将使用采样分片策略。这有助于更高效地处理大数据集。默认值为 1000 个分片。 #### split.inverse-sampling.rate 采样分片策略中使用的采样率的倒数。例如,如果此值设置为 1000,则意味着在采样过程中应用 1/1000 的采样率。此选项提供了灵活性,可以控制采样的粒度,从而影响最终的分片数量。在处理非常大的数据集时,较低的采样率通常是首选。默认值为 1000。 #### partition_column [string] 拆分数据的列名称。 #### partition_upper_bound [BigDecimal] 扫描时 `partition_column` 的最大值。如果未设置,SeaTunnel 将查询数据库以获取最大值。 #### partition_lower_bound [BigDecimal] 扫描时 `partition_column` 的最小值。如果未设置,SeaTunnel 将查询数据库以获取最小值。 #### partition_num [int] > 不推荐使用,正确的方法是通过 `split.size` 来控制分片的数量。 需要拆分成多少个分片,只支持正整数。默认值为作业并行度。 ## 提示 > 如果表无法拆分(例如,表没有主键或唯一索引,且未设置 `partition_column`),则将以单线程并发方式运行。 > > 使用 `table_path` 替代 `query` 来进行单表读取。如果需要读取多个表,请使用 `table_list`。 > 当基于 `query` 推断主键时,主键继承自结果集中第一列所在的底层表;如果 `query` 包含多表 JOIN 或同时从多张表读取,该主键对整个 JOIN 结果集的唯一性不作严格保证。 ## 任务示例 ### 简单的例子 > 这个示例以单线程并行的方式查询测试数据库中 `type_bin` 为 'table' 的16条数据,并查询所有字段。你也可以指定查询哪些字段,并将最终结果输出到控制台。 ``` # 定义运行时环境 env { parallelism = 4 job.mode 
= "BATCH" } source{ Jdbc { url = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" query = "select * from type_bin limit 16" } } transform { # 如果您想了解更多关于如何配置 SeaTunnel 的信息,并查看完整的转换插件列表, # 请访问 https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} } ``` ### 按 `partition_column` 并行 ``` env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" query = "select * from type_bin" partition_column = "id" split.size = 10000 # Read start boundary #partition_lower_bound = ... # Read end boundary #partition_upper_bound = ... } } sink { Console {} } ``` ### 按主键或唯一索引并行 > 配置 `table_path` 将启用自动拆分,您可以配置 `split.*` 来调整拆分策略 ``` env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" table_path = "testdb.table1" query = "select * from testdb.table1" split.size = 10000 } } sink { Console {} } ``` ### 并行的同时指定边界 > 指定数据的上下边界查询会更加高效。根据您配置的上下边界读取数据源会更高效。 ``` source { Jdbc { url = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" # Define query logic as required query = "select * from type_bin" partition_column = "id" # Read start boundary partition_lower_bound = 1 # Read end boundary partition_upper_bound = 500 partition_num = 10 properties { useSSL=false } } } ``` ### 多表读取 ***配置 `table_list` 将启用自动拆分,您可以配置 `split.*` 来调整拆分策略*** ```hocon env { job.mode = "BATCH" parallelism = 4 } 
source { Jdbc { url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" table_list = [ { table_path = "testdb.table1" }, { table_path = "testdb.table2" # Use query to filter rows & columns query = "select id, name from testdb.table2 where id > 100" } ] #where_condition= "where id > 100" #split.size = 8096 #split.even-distribution.factor.upper-bound = 100 #split.even-distribution.factor.lower-bound = 0.05 #split.sample-sharding.threshold = 1000 #split.inverse-sampling.rate = 1000 } } sink { Console {} } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Neo4j.md ================================================ import ChangeLog from '../changelog/connector-neo4j.md'; # Neo4j > Neo4j 源连接器 ## 描述 从 `Neo4j` 读取数据 `neo4j-java-driver` 版本 4.4.9 ## 主要功能 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户定义拆分](../../introduction/concepts/connector-v2-features.md) ## 配置选项 | 名称 | 类型 | 是否必须 | 默认值 | |----------------------------|--------|------|-----| | uri | String | 是 | - | | username | String | 否 | - | | password | String | 否 | - | | bearer_token | String | 否 | - | | kerberos_ticket | String | 否 | - | | database | String | 是 | - | | query | String | 是 | - | | schema | Object | 是 | - | | max_transaction_retry_time | Long | 否 | 30 | | max_connection_timeout | Long | 否 | 30 | ### uri [string] `Neo4j`数据库的URI,参考配置: `neo4j://localhost:7687`。 ### username [string] `Neo4j`用户名。 ### password [string] `Neo4j`密码。如果提供了“用户名”,则需要。 ### bearer_token [string] `Neo4j`的`base64`编码`bearer token`用于鉴权。 ### kerberos_ticket [string] `Neo4j`的`base64`编码`kerberos 
ticket`用于鉴权。 ### database [string] 数据库名。 ### query [string] 查询语句。 ### schema.fields [string] 返回`query` 的字段。 查看 [列投影](../../introduction/concepts/connector-v2-features.md) ### max_transaction_retry_time [long] 最大事务重试时间(秒)。如果超过,则事务失败。 ### max_connection_timeout [long] 等待TCP连接建立的最长时间(秒)。 ## 示例 ``` source { Neo4j { uri = "neo4j://localhost:7687" username = "neo4j" password = "1234" database = "neo4j" max_transaction_retry_time = 1 max_connection_timeout = 1 query = "MATCH (a:Person) RETURN a.name, a.age" schema { fields { a.age=INT a.name=STRING } } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Notion.md ================================================ import ChangeLog from '../changelog/connector-http-notion.md'; # Notion > Notion 源连接器 ## 描述 用于从 Notion 读取数据。 ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |--------|------|------|--------|------| | url | String | 是 | - | HTTP 请求 URL | | password | String | 是 | - | API 密钥用于登录 | | version | String | 是 | - | Notion API 版本 | | method | String | 否 | get | HTTP 请求方法,仅支持 GET、POST 方法 | | schema.fields | Config | 否 | - | 上游数据的模式字段 | | format | String | 否 | json | 上游数据的格式,现在仅支持 `json` `text`,默认 `json`。 | | params | Map | 否 | - | HTTP 参数 | | body | String | 否 | - | HTTP 请求体 | | json_field | Config | 否 | - | JSON 字段配置 | | content_json | String | 否 | - | 内容 JSON 配置 | | poll_interval_millis | int | 否 | - | 流模式下请求 HTTP API 的间隔(毫秒) | | retry | int | 否 | - | 如果 HTTP 请求返回 `IOException` 的最大重试次数 | | retry_backoff_multiplier_ms | int | 否 | 100 | HTTP 请求失败时的重试退避倍数(毫秒) | | retry_backoff_max_ms | int | 
否 | 10000 | HTTP 请求失败时的最大重试退避时间(毫秒) | | enable_multi_lines | boolean | 否 | false | 是否启用多行模式 | | common-options | config | 否 | - | 源插件通用参数 | ### url [String] HTTP 请求 URL ### password [String] API 密钥用于登录,您可以在以下链接获取更多详情: https://developers.notion.com/docs/authorization ### version [String] Notion API 是版本化的。API 版本以发布版本的日期命名 ### method [String] HTTP 请求方法,仅支持 GET、POST 方法 ### params [Map] HTTP 参数 ### body [String] HTTP 请求体 ### poll_interval_millis [int] 流模式下请求 HTTP API 的间隔(毫秒) ### retry [int] 如果 HTTP 请求返回 `IOException` 的最大重试次数 ### retry_backoff_multiplier_ms [int] HTTP 请求失败时的重试退避倍数(毫秒) ### retry_backoff_max_ms [int] HTTP 请求失败时的最大重试退避时间(毫秒) ### format [String] 上游数据的格式,现在仅支持 `json` `text`,默认 `json`。 ### schema [Config] #### fields [Config] 上游数据的模式字段。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 ### content_json [String] 此参数可以获取一些 JSON 数据。 ### json_field [Config] 此参数帮助您配置模式,因此此参数必须与 schema 一起使用。 ## 变更日志 ================================================ FILE: docs/zh/connectors/source/ObsFile.md ================================================ import ChangeLog from '../changelog/connector-file-obs.md'; # ObsFile > Obs 文件源连接器 ## 支持这些引擎 > Spark > > Flink > > Seatunnel Zeta ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [x] [多模态](../../introduction/concepts/connector-v2-features.md#multimodal) 使用二进制文件格式读写任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 在一次 pollNext 调用中读取分割中的所有数据。读取哪些分割将保存在快照中。 - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) - [x] 文件格式类型 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] markdown ## 描述 从华为云 OBS 文件系统读取数据。 如果您使用 spark/flink,为了使用此连接器,您必须确保您的 spark/flink 集群已集成 hadoop。测试的 hadoop 版本是 2.x。 如果您使用 SeaTunnel 
引擎,它会在您下载和安装 SeaTunnel 引擎时自动集成 hadoop jar。您可以检查 ${SEATUNNEL_HOME}/lib 下的 jar 包来确认这一点。 我们为了支持更多文件类型做了一些权衡,所以我们使用 HDFS 协议来内部访问 OBS,此连接器需要一些 hadoop 依赖项。 它仅支持 hadoop 版本 **2.9.X+**。 ## 必需的 Jar 列表 | jar | 支持的版本 | maven | |-----|-----------|-------| | hadoop-huaweicloud | 支持版本 >= 3.1.1.29 | [下载](https://repo.huaweicloud.com/artifactory/sdk_public/org/apache/hadoop/hadoop-huaweicloud/) | | esdk-obs-java | 支持版本 >= 3.19.7.3 | [下载](https://repo.huaweicloud.com/artifactory/sdk_public/com/huawei/storage/esdk-obs-java/) | | okhttp | 支持版本 >= 3.11.0 | [下载](https://repo1.maven.org/maven2/com/squareup/okhttp3/okhttp/) | | okio | 支持版本 >= 1.14.0 | [下载](https://repo1.maven.org/maven2/com/squareup/okio/okio/) | > 请下载对应 'Maven' 的支持列表,并将其复制到 '$SEATUNNEL_HOME/plugins/jdbc/lib/' 工作目录。 > > 并将所有 jar 复制到 $SEATUNNEL_HOME/lib/ ## 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |---------------------------|---------|----|---------------------|-----------------------------------------| | path | string | 是 | - | 目标目录路径 | | file_format_type | string | 是 | - | 文件类型 | | bucket | string | 是 | - | OBS 文件系统的桶地址,例如:`obs://obs-bucket-name` | | access_key | string | 是 | - | OBS 文件系统的访问密钥 | | access_secret | string | 是 | - | OBS 文件系统的访问密钥 | | endpoint | string | 是 | - | OBS 文件系统的端点 | | read_columns | list | 是 | - | 数据源的读取列列表 | | delimiter | string | 否 | \001 | 字段分隔符 | | row_delimiter | string | 否 | \n | 行分隔符 | | parse_partition_from_path | boolean | 否 | true | 控制是否从文件路径解析分区键和值 | | skip_header_row_number | long | 否 | 0 | 跳过前几行,但仅适用于 txt 和 csv。 | | date_format | string | 否 | yyyy-MM-dd | 日期类型格式 | | datetime_format | string | 否 | yyyy-MM-dd HH:mm:ss | 日期时间类型格式 | | time_format | string | 否 | HH:mm:ss | 时间类型格式 | | quote_char | string | 否 | " | 用于包裹 CSV 字段的单字符,可保证包含逗号、换行符或引号的字段被正确解析。 | | escape_char | string | 否 | - | 用于在 CSV 字段内转义引号或其他特殊字符,使其不会结束字段。 | ## 变更日志 ================================================ FILE: docs/zh/connectors/source/OceanBase.md ================================================ import ChangeLog from 
'../changelog/connector-jdbc.md'; # OceanBase > JDBC OceanBase 源连接器 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 描述 通过 JDBC 读取外部数据源数据。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动 | 连接串 | Maven | |--------|-----------|------|--------|-------| | OceanBase | 所有 OceanBase 服务器版本 | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2883/test | [下载](https://mvnrepository.com/artifact/com.oceanbase/oceanbase-client) | ## 数据库依赖 > 请下载对应 'Maven' 的支持列表,并将其复制到 '$SEATUNNEL_HOME/plugins/jdbc/lib/' 工作目录
    > 例如:cp oceanbase-client-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## 数据类型映射 ### MySQL 模式 | MySQL 数据类型 | SeaTunnel 数据类型 | |---------------|------------------| | BIT(1)
    TINYINT(1) | BOOLEAN | | TINYINT | BYTE | | TINYINT UNSIGNED
    SMALLINT | SMALLINT | | SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | | INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | BIGINT | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(x,y)(<38) | DECIMAL(x,y) | | DECIMAL(x,y)(>38) | DECIMAL(38,18) | | DECIMAL UNSIGNED | DECIMAL | | FLOAT
    FLOAT UNSIGNED | FLOAT | | DOUBLE
    DOUBLE UNSIGNED | DOUBLE | | CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    JSON
    ENUM | STRING | | DATE | DATE | | TIME | TIME | | DATETIME
    TIMESTAMP | TIMESTAMP | | TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    BINARY
    VARBINARY
    BIT(n)
    GEOMETRY | BYTES | ### Oracle 模式 | Oracle 数据类型 | SeaTunnel 数据类型 | |---------------|------------------| | Integer | DECIMAL(38,0) | | Number(p), p <= 9 | INT | | Number(p), p <= 18 | BIGINT | | Number(p), p > 18 | DECIMAL(38,18) | | Number(p,s) | DECIMAL(p,s) | | Float | DECIMAL(38,18) | | REAL
    BINARY_FLOAT | FLOAT | | BINARY_DOUBLE | DOUBLE | | CHAR
    NCHAR
    VARCHAR
    VARCHAR2
    NVARCHAR2
    NCLOB
    CLOB
    LONG
    XML
    ROWID | STRING | | DATE | TIMESTAMP | | TIMESTAMP
    TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | | BLOB
    RAW
    LONG RAW
    BFILE | BYTES | | UNKNOWN | 暂不支持 | ## 源选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |--------|------|------|--------|------| | url | String | 是 | - | JDBC 连接的 URL。参考示例:jdbc:oceanbase://localhost:2883/test | | driver | String | 是 | - | 用于连接到远程数据源的 jdbc 类名,应为 `com.oceanbase.jdbc.Driver`。 | | username | String | 否 | - | 连接实例用户名 | | password | String | 否 | - | 连接实例密码 | | compatible_mode | String | 是 | - | OceanBase 的兼容模式,可以是 'mysql' 或 'oracle'。 | | query | String | 是 | - | 查询语句 | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(秒) | | partition_column | String | 否 | - | 用于并行性分割的列名,仅支持数值类型列和字符串类型列。 | | partition_lower_bound | BigDecimal | 否 | - | partition_column 的最小值用于扫描,如果未设置,SeaTunnel 将查询数据库获取最小值。 | | partition_upper_bound | BigDecimal | 否 | - | partition_column 的最大值用于扫描,如果未设置,SeaTunnel 将查询数据库获取最大值。 | | partition_num | Int | 否 | job parallelism | 分割数量,仅支持正整数。默认值是任务并行度。 | | fetch_size | Int | 否 | 0 | 对于返回大量对象的查询,您可以配置查询中使用的行提取大小,以通过减少满足选择条件所需的数据库命中次数来提高性能。零表示使用 jdbc 默认值。 | | properties | Map | 否 | - | 其他连接配置参数,当 properties 和 URL 具有相同参数时,优先级由驱动程序的具体实现确定。例如,在 MySQL 中,properties 优先于 URL。 | | common-options | | 否 | - | 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 | ### 提示 > 如果未设置 partition_column,它将以单并发运行,如果设置了 partition_column,它将根据任务的并发度并行执行。 ## 任务示例 ### 简单 ``` env { parallelism = 2 job.mode = "BATCH" } source { Jdbc { driver = "com.oceanbase.jdbc.Driver" url = "jdbc:oceanbase://localhost:2883/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" username = "root" password = "" compatible_mode = "mysql" query = "select * from source" } } transform { # 如果您想了解有关如何配置 seatunnel 的更多信息并查看完整的转换插件列表, # 请访问 https://seatunnel.apache.org/docs/transform/sql } sink { Console {} } ``` ### 并行 > 使用您配置的分片字段和分片数据并行读取查询表。如果您想读取整个表,可以这样做 ``` env { parallelism = 10 job.mode = "BATCH" } source { Jdbc { driver = "com.oceanbase.jdbc.Driver" url = 
"jdbc:oceanbase://localhost:2883/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" username = "root" password = "" compatible_mode = "mysql" query = "select * from source" # 并行分片读取字段 partition_column = "id" # 分片数量 partition_num = 10 } } sink { Console {} } ``` ### 并行边界 > 根据您配置的上下边界读取数据源更高效 ``` source { Jdbc { driver = "com.oceanbase.jdbc.Driver" url = "jdbc:oceanbase://localhost:2883/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" username = "root" password = "" compatible_mode = "mysql" query = "select * from source" partition_column = "id" partition_num = 10 # 读取开始边界 partition_lower_bound = 1 # 读取结束边界 partition_upper_bound = 500 } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/OneSignal.md ================================================ import ChangeLog from '../changelog/connector-http-onesignal.md'; # OneSignal > OneSignal 源连接器 ## 描述 用于从 OneSignal 读取数据。 ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |-----------------------------|---------|----|-------|---------------------------------------------------------------------------------------------| | url | String | 是 | - | HTTP 请求 URL | | password | String | 是 | - | 认证密钥用于登录 | | method | String | 否 | get | HTTP 请求方法,仅支持 GET、POST 方法 | | schema | Config | 否 | - | HTTP 和 SeaTunnel 数据结构映射。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | schema.fields | Config | 否 | - | 上游数据的模式字段 | | format | String | 否 | json | 上游数据的格式,现在仅支持 `json` `text`,默认 `json`。 | | params | Map | 否 | - | HTTP 参数 | | body | String | 
否 | - | HTTP 请求体 | | json_field | Config | 否 | - | JSON 字段配置 | | content_json | String | 否 | - | 内容 JSON 配置 | | poll_interval_millis | int | 否 | - | 流模式下请求 HTTP API 的间隔(毫秒) | | retry | int | 否 | - | 如果 HTTP 请求返回 `IOException` 的最大重试次数 | | retry_backoff_multiplier_ms | int | 否 | 100 | HTTP 请求失败时的重试退避倍数(毫秒) | | retry_backoff_max_ms | int | 否 | 10000 | HTTP 请求失败时的最大重试退避时间(毫秒) | | enable_multi_lines | boolean | 否 | false | 是否启用多行模式 | | common-options | config | 否 | - | 源插件通用参数 | ### url [String] HTTP 请求 URL ### password [String] 认证密钥用于登录,您可以在以下链接获取更多详情: https://documentation.onesignal.com/docs/accounts-and-keys#user-auth-key ### method [String] HTTP 请求方法,仅支持 GET、POST 方法 ### params [Map] HTTP 参数 ### body [String] HTTP 请求体 ### poll_interval_millis [int] 流模式下请求 HTTP API 的间隔(毫秒) ### retry [int] 如果 HTTP 请求返回 `IOException` 的最大重试次数 ### retry_backoff_multiplier_ms [int] HTTP 请求失败时的重试退避倍数(毫秒) ### retry_backoff_max_ms [int] HTTP 请求失败时的最大重试退避时间(毫秒) ### format [String] 上游数据的格式,现在仅支持 `json` `text`,默认 `json`。 ### schema [Config] #### fields [Config] 上游数据的模式字段。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 ### content_json [String] 此参数可以获取一些 JSON 数据。 ### json_field [Config] 此参数帮助您配置模式,因此此参数必须与 schema 一起使用。 ### 通用选项 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 ## 示例 ```hocon source { OneSignal { url = "https://onesignal.com/api/v1/apps" password = "SeaTunnel-test" schema = { fields { id = string name = string gcm_key = string chrome_key = string created_at = string updated_at = string players = int messageable_players = int basic_auth_key = string } } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/OpenMldb.md ================================================ import ChangeLog from '../changelog/connector-openmldb.md'; # OpenMldb > OpenMldb 源连接器 ## 描述 用于从 OpenMldb 读取数据. 
## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必需 | 默认值 | |-----------------|---------|----------|---------------| | cluster_mode | boolean | 是 | - | | sql | string | 是 | - | | database | string | 是 | - | | host | string | 否 | - | | port | int | 否 | - | | zk_path | string | 否 | - | | zk_host | string | 否 | - | | session_timeout | int | 否 | 10000 | | request_timeout | int | 否 | 60000 | | common-options | | 否 | - | ### cluster_mode [boolean] OpenMldb 是否处于集群模式 ### sql [string] Sql 语句 ### database [string] 数据库名称 ### host [string] OpenMldb主机,仅支持OpenMldb单模 ### port [int] OpenMldb端口,仅支持OpenMldb单模 ### zk_host [string] Zookeeper主机,仅在OpenMldb集群模式下受支持 ### zk_path [string] Zookeeper路径,仅在OpenMldb集群模式下受支持 ### session_timeout [int] OpenMldb会话超时(ms),默认值为10000 ### request_timeout [int] OpenMldb请求超时(ms),默认值为60000 ### common options 源插件常用参数, 详见 [Source Common Options](../common-options/source-common-options.md) ## 示例 ```hocon OpenMldb { host = "172.17.0.2" port = 6527 sql = "select * from demo_table1" database = "demo_db" cluster_mode = false } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Opengauss-CDC.md ================================================ import ChangeLog from '../changelog/connector-cdc-opengauss.md'; # Opengauss CDC > Opengauss CDC源连接器 ## 支持这些引擎 > SeaTunnel Zeta
    > Flink
    ## 主要功能 - [ ] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户定义的拆分](../../introduction/concepts/connector-v2-features.md) ## 描述 Opengauss CDC连接器允许从Opengauss数据库读取快照数据和增量数据。这个文档描述如何设置Opengauss CDC连接器以在Opengauss database中运行SQL查询。 ## 使用步骤 > 这里是启用Opengauss CDC的步骤: 1. 确保wal_level被设置为logical, 你可以直接使用SQL命令来修改这个配置: ```sql ALTER SYSTEM SET wal_level TO 'logical'; SELECT pg_reload_conf(); ``` 2. 改变指定表的REPLICA策略为FULL ```sql ALTER TABLE your_table_name REPLICA IDENTITY FULL; ``` 如果你有很多表,你可以使用下面SQL的结果集来改变所有表的REPLICA策略 ```sql select 'ALTER TABLE ' || schemaname || '.' || tablename || ' REPLICA IDENTITY FULL;' from pg_tables where schemaname = 'YourTableSchema' ``` ## 数据类型映射 | Opengauss Data type | SeaTunnel Data type | |-----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| | BOOL
    | BOOLEAN | | BYTEA
    | BYTES | | INT2
    SMALLSERIAL
    INT4
    SERIAL
    | INT | | INT8
    BIGSERIAL
    | BIGINT | | FLOAT4
    | FLOAT | | FLOAT8
    | DOUBLE | | NUMERIC(Get the designated column's specified column size>0) | DECIMAL(Get the designated column's specified column size,Gets the number of digits in the specified column to the right of the decimal point) | | NUMERIC(Get the designated column's specified column size<0) | DECIMAL(38, 18) | | BPCHAR
    CHARACTER
    VARCHAR
    TEXT
    GEOMETRY
    GEOGRAPHY
    JSON
    JSONB | STRING | | TIMESTAMP
    | TIMESTAMP | | TIME
    | TIME | | DATE
    | DATE | | OTHER DATA TYPES | NOT SUPPORTED YET | ## 源端可选项 | Name | Type | Required | Default | Description | |-------------------------------------------|------|----------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | 字符串 | 是 | - | JDBC连接的URL. 参考: `jdbc:postgresql://localhost:5432/postgres_cdc?loggerLevel=OFF`. | | username | 字符串 | 是 | - | 连接数据库的用户名 | | password | 字符串 | 是 | - | 连接数据库的密码 | | database-names | 列表 | 否 | - | 监控的数据库名称 | | table-names | 列表 | 是 | - | 监控的数据表名称. 表名需要包含数据库名称, 例如: `database_name.table_name` | | table-names-config | 列表 | 否 | - | 表配置的列表集合. 例如: [{"table": "db1.schema1.table1","primaryKeys":["key1"]}] | | startup.mode | 枚举 | 否 | INITIAL | Opengauss CDC消费者的可选启动模式, 有效的枚举是`initial`, `earliest`, `latest`.
    `initial`: 启动时同步历史数据,然后同步增量数据
    `earliest`: 从可能的最早偏移量启动
    `latest`: 从最近的偏移量启动 | | snapshot.split.size | 整型 | 否 | 8096 | 表快照的分割大小(行数),在读取表的快照时,捕获的表被分割成多个split | | snapshot.fetch.size | 整型 | 否 | 1024 | 读取表快照时,每次轮询的最大读取大小 | | slot.name | 字符串 | 否 | - | Opengauss逻辑解码插槽的名称,该插槽是为特定数据库/模式的特定插件的流式更改而创建的。服务器使用此插槽将事件流传输到正在配置的连接器。默认值为seatunnel | | decoding.plugin.name | 字符串 | 否 | pgoutput | 安装在服务器上的Postgres逻辑解码插件的名称,支持的值是decoderbufs、wal2json、wal2json_rds、wal2json_streaming、wal2json_rds_streaming和pgoutput | | server-time-zone | 字符串 | 否 | UTC | 数据库服务器中的会话时区。如果没有设置,则使用ZoneId.systemDefault()来确定服务器的时区 | | connect.timeout.ms | 时间间隔 | 否 | 30000 | 在尝试连接数据库服务器之后,连接器在超时之前应该等待的最大时间 | | connect.max-retries | 整型 | 否 | 3 | 连接器在建立数据库服务器连接时应该重试的最大次数 | | connection.pool.size | 整型 | 否 | 20 | jdbc连接池的大小 | | chunk-key.even-distribution.factor.upper-bound | 双浮点型 | 否 | 100 | chunk的key分布因子的上界。该因子用于确定表数据是否均匀分布。如果分布因子被计算为小于或等于这个上界(即(MAX(id) - MIN(id) + 1) /行数),表的所有chunk将被优化以达到均匀分布。否则,如果分布因子更大,则认为表分布不均匀,如果估计的分片数量超过`sample-sharding.threshold`指定的值,则将使用基于采样的分片策略。默认值为100.0。 | | chunk-key.even-distribution.factor.lower-bound | 双浮点型 | 否 | 0.05 | chunk的key分布因子的下界。该因子用于确定表数据是否均匀分布。如果分布因子的计算结果大于或等于这个下界(即(MAX(id) - MIN(id) + 1) /行数),那么表的所有块将被优化以达到均匀分布。否则,如果分布因子较小,则认为表分布不均匀,如果估计的分片数量超过`sample-sharding.threshold`指定的值,则使用基于采样的分片策略。缺省值为0.05。 | | sample-sharding.threshold | 整型 | 否 | 1000 | 此配置指定了用于触发采样分片策略的估计分片数的阈值。当分布因子超出了由`chunk-key.even-distribution.factor.upper-bound`和`chunk-key.even-distribution.factor.lower-bound`指定的范围,并且估计的分片计数(以近似的行数/块大小计算)超过此阈值,则将使用样本分片策略。这有助于更有效地处理大型数据集。默认值为1000个分片。 | | inverse-sampling.rate | 整型 | 否 | 1000 | 采样分片策略中使用的采样率的倒数。例如,如果该值设置为1000,则意味着在采样过程中应用了1/1000的采样率。该选项提供了控制采样粒度的灵活性,从而影响最终的分片数量。当处理非常大的数据集时,它特别有用,其中首选较低的采样率。缺省值为1000。 | | exactly_once | 布尔 | 否 | false | 启用exactly once语义 | | format | 枚举 | 否 | DEFAULT | Opengauss CDC可选的输出格式, 有效的枚举是`DEFAULT`, `COMPATIBLE_DEBEZIUM_JSON`. 
| | debezium | 配置 | 否 | - | 将 [Debezium的属性](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/postgresql.adoc#connector-configuration-properties) 传递到Debezium嵌入式引擎,该引擎用于捕获来自Opengauss服务的数据更改 | | common-options | | 否 | - | 源插件通用参数, 请参考[Source Common Options](../common-options/source-common-options.md)获取详情 | ## 任务示例 ### 简单 > 支持多表读 ``` env { # You can set engine configuration here execution.parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { Opengauss-CDC { plugin_output = "customers_opengauss_cdc" username = "gaussdb" password = "openGauss@123" database-names = ["opengauss_cdc"] schema-names = ["inventory"] table-names = ["opengauss_cdc.inventory.opengauss_cdc_table_1","opengauss_cdc.inventory.opengauss_cdc_table_2"] url = "jdbc:postgresql://opengauss_cdc_e2e:5432/opengauss_cdc" decoding.plugin.name = "pgoutput" } } transform { } sink { jdbc { plugin_input = "customers_opengauss_cdc" url = "jdbc:postgresql://opengauss_cdc_e2e:5432/opengauss_cdc" driver = "org.postgresql.Driver" user = "dailai" password = "openGauss@123" compatible_mode="postgresLow" generate_sink_sql = true # You need to configure both database and table database = "opengauss_cdc" schema = "inventory" tablePrefix = "sink_" primary_keys = ["id"] } } ``` ### 支持自定义主键 ``` source { Opengauss-CDC { plugin_output = "customers_opengauss_cdc" username = "gaussdb" password = "openGauss@123" database-names = ["opengauss_cdc"] schema-names = ["inventory"] table-names = ["opengauss_cdc.inventory.full_types_no_primary_key"] url = "jdbc:postgresql://opengauss_cdc_e2e:5432/opengauss_cdc?loggerLevel=OFF" decoding.plugin.name = "pgoutput" exactly_once = true table-names-config = [ { table = "opengauss_cdc.inventory.full_types_no_primary_key" primaryKeys = ["id"] } ] } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Oracle-CDC.md 
================================================ import ChangeLog from '../changelog/connector-cdc-oracle.md'; # Oracle CDC > Oracle CDC 数据源连接器 ## 支持的引擎 > SeaTunnel Zeta
    > Flink
    ## 关键特性 - [ ] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户自定义拆分](../../introduction/concepts/connector-v2-features.md) ## 描述 Oracle CDC 连接器允许从 Oracle 数据库读取快照数据和增量数据。本文档描述了如何设置 Oracle CDC 连接器以针对 Oracle 数据库运行 SQL 查询。 ## 注意 Debezium Oracle 连接器不依赖于连续挖掘(continuous mining)选项。该连接器负责检测日志切换并自动调整正在挖掘的日志,这正是连续挖掘选项自动为您完成的工作。 因此,您不能在 debezium 中设置名为 `log.mining.continuous.mine` 的属性。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动类 | Url | Maven | |------------|----------------------------------------------------------|--------------------------|----------------------------------------|--------------------------------------------------------------------| | Oracle | 不同的依赖版本有不同的驱动类。 | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@datasource01:1523:xe | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | ## 数据库依赖 ### 安装 Jdbc 驱动 #### 适用于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) 已放置在 `${SEATUNNEL_HOME}/plugins/` 目录下。 > 2. 为了支持 i18n 字符集,请将 `orai18n.jar` 复制到 `$SEATUNNEL_HOME/plugins/` 目录。 #### 适用于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) 已放置在 `${SEATUNNEL_HOME}/lib/` 目录下。 > 2. 为了支持 i18n 字符集,请将 `orai18n.jar` 复制到 `$SEATUNNEL_HOME/lib/` 目录。 ### 启用 Oracle Logminer > 要在 Seatunnel 中使用 Logminer(Oracle 提供的内置工具)启用 Oracle CDC(变更数据捕获),请按照以下步骤操作: #### 在非 CDB(容器数据库)模式下启用 Logminer。 1. 操作系统创建一个空的目录来存储 Oracle 归档日志和用户表空间。 ```shell mkdir -p /opt/oracle/oradata/recovery_area mkdir -p /opt/oracle/oradata/ORCLCDB chown -R oracle /opt/oracle/*** ``` 2. 
以管理员身份登录并启用 Oracle 归档日志。 ```sql sqlplus /nolog; connect sys as sysdba; alter system set db_recovery_file_dest_size = 10G; alter system set db_recovery_file_dest = '/opt/oracle/oradata/recovery_area' scope=spfile; shutdown immediate; startup mount; alter database archivelog; alter database open; ALTER DATABASE ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS; archive log list; ``` 3. 以管理员身份登录并创建一个名为 logminer_user 的账户,密码为 "oracle",并授予其读取表和日志的权限。 ```sql CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/ORCLCDB/logminer_tbs.dbf' SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED; CREATE USER logminer_user IDENTIFIED BY oracle DEFAULT TABLESPACE logminer_tbs QUOTA UNLIMITED ON logminer_tbs; GRANT CREATE SESSION TO logminer_user; GRANT SELECT ON V_$DATABASE to logminer_user; GRANT SELECT ON V_$LOG TO logminer_user; GRANT SELECT ON V_$LOGFILE TO logminer_user; GRANT SELECT ON V_$LOGMNR_LOGS TO logminer_user; GRANT SELECT ON V_$LOGMNR_CONTENTS TO logminer_user; GRANT SELECT ON V_$ARCHIVED_LOG TO logminer_user; GRANT SELECT ON V_$ARCHIVE_DEST_STATUS TO logminer_user; GRANT EXECUTE ON DBMS_LOGMNR TO logminer_user; GRANT EXECUTE ON DBMS_LOGMNR_D TO logminer_user; GRANT SELECT ANY TRANSACTION TO logminer_user; GRANT SELECT ON V_$TRANSACTION TO logminer_user; ``` ##### 注意:Oracle 11g 不支持以下命令 ```sql GRANT LOGMINING TO logminer_user; ``` ##### 仅授予需要采集的表的权限 ```sql GRANT SELECT ANY TABLE TO logminer_user; GRANT ANALYZE ANY TO logminer_user; ``` #### 在 Oracle CDB (容器数据库) + PDB (可插拔数据库) 模式下启用 Logminer 1. 操作系统创建一个空的目录来存储 Oracle 归档日志和用户表空间。 ```shell mkdir -p /opt/oracle/oradata/recovery_area mkdir -p /opt/oracle/oradata/ORCLCDB mkdir -p /opt/oracle/oradata/ORCLCDB/ORCLPDB1 chown -R oracle /opt/oracle/*** ``` 2. 
以管理员身份登录并启用日志记录 ```sql sqlplus /nolog connect sys as sysdba; # 密码: oracle alter system set db_recovery_file_dest_size = 10G; alter system set db_recovery_file_dest = '/opt/oracle/oradata/recovery_area' scope=spfile; shutdown immediate startup mount alter database archivelog; alter database open; archive log list; ``` 3. 在 CDB 中执行 ```sql ALTER TABLE TEST.* ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS; ALTER TABLE TEST.T2 ADD SUPPLEMENTAL LOG DATA (ALL) COLUMNS; ``` 4. 创建 debezium 账户 > 在 CDB 中操作 ```sql sqlplus sys/top_secret@//localhost:1521/ORCLCDB as sysdba CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/ORCLCDB/logminer_tbs.dbf' SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED; exit; ``` > 在 PDB 中操作 ```sql sqlplus sys/top_secret@//localhost:1521/ORCLPDB1 as sysdba CREATE TABLESPACE logminer_tbs DATAFILE '/opt/oracle/oradata/ORCLCDB/ORCLPDB1/logminer_tbs.dbf' SIZE 25M REUSE AUTOEXTEND ON MAXSIZE UNLIMITED; exit; ``` 5. 在 CDB 中操作 ```sql sqlplus sys/top_secret@//localhost:1521/ORCLCDB as sysdba CREATE USER c##dbzuser IDENTIFIED BY dbz DEFAULT TABLESPACE logminer_tbs QUOTA UNLIMITED ON logminer_tbs CONTAINER=ALL; GRANT CREATE SESSION TO c##dbzuser CONTAINER=ALL; GRANT SET CONTAINER TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$DATABASE to c##dbzuser CONTAINER=ALL; GRANT FLASHBACK ANY TABLE TO c##dbzuser CONTAINER=ALL; GRANT SELECT ANY TABLE TO c##dbzuser CONTAINER=ALL; GRANT SELECT_CATALOG_ROLE TO c##dbzuser CONTAINER=ALL; GRANT EXECUTE_CATALOG_ROLE TO c##dbzuser CONTAINER=ALL; GRANT SELECT ANY TRANSACTION TO c##dbzuser CONTAINER=ALL; GRANT LOGMINING TO c##dbzuser CONTAINER=ALL; GRANT CREATE TABLE TO c##dbzuser CONTAINER=ALL; GRANT LOCK ANY TABLE TO c##dbzuser CONTAINER=ALL; GRANT CREATE SEQUENCE TO c##dbzuser CONTAINER=ALL; GRANT EXECUTE ON DBMS_LOGMNR TO c##dbzuser CONTAINER=ALL; GRANT EXECUTE ON DBMS_LOGMNR_D TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$LOG TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$LOG_HISTORY TO c##dbzuser CONTAINER=ALL; GRANT 
SELECT ON V_$LOGMNR_LOGS TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$LOGMNR_CONTENTS TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$LOGMNR_PARAMETERS TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$LOGFILE TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$ARCHIVED_LOG TO c##dbzuser CONTAINER=ALL; GRANT SELECT ON V_$ARCHIVE_DEST_STATUS TO c##dbzuser CONTAINER=ALL; GRANT analyze any TO c##dbzuser CONTAINER=ALL; exit; ``` ## 数据类型映射 | Oracle 数据类型 | SeaTunnel 数据类型 | |--------------------------------------------------------------------------------------|---------------------| | INTEGER | INT | | FLOAT | DECIMAL(38, 18) | | NUMBER(precision <= 9, scale == 0) | INT | | NUMBER(9 < precision <= 18, scale == 0) | BIGINT | | NUMBER(18 < precision, scale == 0) | DECIMAL(38, 0) | | NUMBER(precision == 0, scale == 0) | DECIMAL(38, 18) | | NUMBER(scale != 0) | DECIMAL(38, 18) | | BINARY_DOUBLE | DOUBLE | | BINARY_FLOAT
    REAL | FLOAT | | CHAR
    NCHAR
    NVARCHAR2
    VARCHAR2
    LONG
    ROWID
    NCLOB
    CLOB
    | STRING | | DATE | DATE | | TIMESTAMP
    TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | | BLOB
    RAW
    LONG RAW
    BFILE | BYTES | ## 源端选项 | 参数名称 | 类型 | 是否必选 | 默认值 | 描述 | |-------------------------------------------|----------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC 连接的 URL。例如:`jdbc:oracle:thin:datasource01:1523:xe`。 | | username | String | 是 | - | 连接数据库服务器时使用的数据库用户名。 | | password | String | 是 | - | 连接数据库服务器时使用的数据库密码。 | | database-names | List | 否 | - | 要监控的数据库名称。 | | schema-names | List | 否 | - | 要监控的数据库 Schema 名称。 | | table-names | List | 是 | - | 要监控的数据库表名。表名需要包含数据库名,例如:`database_name.table_name` | | table-names-config | List | 否 | - | 表配置列表。例如:`[{"table": "db1.schema1.table1","primaryKeys": ["key1"],"snapshotSplitColumn": "key2"}]` | | startup.mode | Enum | 否 | INITIAL | Oracle CDC 使用者的可选启动模式,有效枚举值为 `initial`、`earliest`、`latest`、`timestamp` 和 `specific`。
    `initial`:启动时同步历史数据,然后同步增量数据。
    `earliest`:从尽可能早的偏移量启动。
    `latest`:从最新的偏移量启动。
    `specific`:从用户提供的特定偏移量启动。 | | startup.timestamp | Long | 否 | - | 从指定的时间戳(自 Unix 纪元以来的毫秒数)启动。当 `startup.mode = timestamp` 时,该时间戳会按 `server-time-zone` 转换。**注意,当 `startup.mode` 选项使用 `timestamp` 时,此选项是必需的。** | | startup.specific-offset.file | String | 否 | - | 从指定的 binlog 文件名启动。**注意,当 `startup.mode` 选项使用 `specific` 时,此选项是必需的。** | | startup.specific-offset.pos | Long | 否 | - | 从指定的 binlog 文件位置启动。**注意,当 `startup.mode` 选项使用 `specific` 时,此选项是必需的。** | | stop.mode | Enum | 否 | NEVER | Oracle CDC 使用者的可选停止模式,有效枚举值为 `never`、`latest` 或 `specific`。
    `never`:实时任务不停止源。
    `latest`:从最新的偏移量停止。
    `specific`:从用户提供的特定偏移量停止。 | | stop.specific-offset.file | String | 否 | - | 从指定的 binlog 文件名停止。**注意,当 `stop.mode` 选项使用 `specific` 时,此选项是必需的。** | | stop.specific-offset.pos | Long | 否 | - | 从指定的 binlog 文件位置停止。**注意,当 `stop.mode` 选项使用 `specific` 时,此选项是必需的。** | | snapshot.split.size | Integer | 否 | 8096 | 表快照的拆分大小(行数),在读取表快照时,捕获的表将被拆分为多个拆分块。 | | snapshot.fetch.size | Integer | 否 | 1024 | 读取表快照时每次轮询的最大获取大小。 | | server-time-zone | String | 否 | UTC | 数据库服务器中的会话时区。如果未设置,则使用 ZoneId.systemDefault() 来确定服务器时区。该参数也用于将 `startup.timestamp` 转换为 SCN。若数据库时区与 JVM 时区不同,建议显式配置。 | | connect.timeout.ms | Duration | 否 | 30000 | 连接器在尝试连接数据库服务器后超时的最大等待时间。 | | connect.max-retries | Integer | 否 | 3 | 连接器尝试建立数据库服务器连接的最大重试次数。 | | connection.pool.size | Integer | 否 | 20 | JDBC 连接池大小。 | | chunk-key.even-distribution.factor.upper-bound | Double | 否 | 100 | 分块键分布因子的上限。此因子用于确定表数据是否均匀分布。如果计算出的分布因子小于或等于此上限(即 (MAX(id) - MIN(id) + 1) / 行数),则表分块将针对均匀分布进行优化。否则,如果分布因子较大,则表将被视为分布不均,如果估计的分片数超过 `sample-sharding.threshold` 指定的值,则将使用基于采样的分片策略。默认值为 100.0。 | | chunk-key.even-distribution.factor.lower-bound | Double | 否 | 0.05 | 分块键分布因子的下限。此因子用于确定表数据是否均匀分布。如果计算出的分布因子大于或等于此下限(即 (MAX(id) - MIN(id) + 1) / 行数),则表分块将针对均匀分布进行优化。否则,如果分布因子较小,则表将被视为分布不均,如果估计的分片数超过 `sample-sharding.threshold` 指定的值,则将使用基于采样的分片策略。默认值为 0.05。 | | sample-sharding.threshold | Integer | 否 | 1000 | 此配置指定触发采样分片策略的预估分片数阈值。当分布因子超出 `chunk-key.even-distribution.factor.upper-bound` 和 `chunk-key.even-distribution.factor.lower-bound` 指定的范围,并且预估的分片数(计算为近似行数 / 分块大小)超过此阈值时,将使用采样分片策略。这有助于更有效地处理大型数据集。默认值为 1000 个分片。 | | inverse-sampling.rate | Integer | 否 | 1000 | 采样分片策略中使用的采样率的倒数。例如,如果此值设置为 1000,则意味着在采样过程中应用 1/1000 的采样率。此选项提供了控制采样粒度的灵活性,从而影响最终的分片数量。在处理首选较低采样率的极大型数据集时,它特别有用。默认值为 1000。 | | exactly_once | Boolean | 否 | false | 启用精确一次语义。 | | use_select_count | Boolean | 否 | false | 使用 `select count` 统计表行数,而不是在全量阶段使用其他方法。在这种情况下,当通过分析表使用 SQL 更新统计信息更快时,直接使用 `select count`。 | | skip_analyze | Boolean | 否 | false | 在全量阶段跳过表行数的分析。在这种情况下,您需要定期调度分析表 SQL 
以更新相关表统计信息,或者您的表数据更改不频繁。 | | format | Enum | 否 | DEFAULT | Oracle CDC 的可选输出格式,有效枚举值为 `DEFAULT`、`COMPATIBLE_DEBEZIUM_JSON`。 | | schema-changes.enabled | Boolean | 否 | false | Schema 演进默认禁用。目前我们仅支持 `add column`、`drop column`、`rename column` 和 `modify column`。 | | debezium | Config | 否 | - | 透传 [Debezium 属性](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/oracle.adoc#connector-properties) 给 Debezium Embedded Engine,该引擎用于捕获 Oracle 服务器的数据更改。 | | common-options | | 否 | - | 源端插件常用参数,详情请参阅 [源端常用选项](../common-options/source-common-options.md)。 | | decimal_type_narrowing | Boolean | 否 | true | 数值类型收缩,如果为 true,则在不损失精度的情况下,将 decimal 类型收缩为 int 或 long 类型。目前仅支持 Oracle。请参阅下文的 `decimal_type_narrowing`。 | ### decimal_type_narrowing 数值类型收缩,如果为 true,则在不损失精度的情况下,将 decimal 类型收缩为 int 或 long 类型。目前仅支持 Oracle。 例如: decimal_type_narrowing = true | Oracle | SeaTunnel | |---------------|-----------| | NUMBER(1, 0) | Boolean | | NUMBER(6, 0) | INT | | NUMBER(10, 0) | BIGINT | decimal_type_narrowing = false | Oracle | SeaTunnel | |---------------|----------------| | NUMBER(1, 0) | Decimal(1, 0) | | NUMBER(6, 0) | Decimal(6, 0) | | NUMBER(10, 0) | Decimal(10, 0) | ## 任务示例 ### 简单示例 > 支持多表读取 ```conf source { # 这是一个示例源端插件,**仅用于测试和演示源端插件功能** Oracle-CDC { plugin_output = "customers" username = "system" password = "oracle" database-names = ["XE"] schema-names = ["DEBEZIUM"] table-names = ["XE.DEBEZIUM.FULL_TYPES", "XE.DEBEZIUM.FULL_TYPES2"] url = "jdbc:oracle:thin:@oracle-host:1521:xe" source.reader.close.timeout = 120000 } } ``` > 在全量阶段使用 select count(*) 代替 analysis table 来统计表行数 ```conf source { # 这是一个示例源端插件,**仅用于测试和演示源端插件功能** Oracle-CDC { plugin_output = "customers" use_select_count = true username = "system" password = "oracle" database-names = ["XE"] schema-names = ["DEBEZIUM"] table-names = ["XE.DEBEZIUM.FULL_TYPES"] url = "jdbc:oracle:thin:system/oracle@oracle-host:1521:xe" source.reader.close.timeout = 120000 } } ``` > 使用 select NUM_ROWS from 
all_tables 获取表行数,但跳过 analyze table 操作。 ```conf source { # 这是一个示例源端插件,**仅用于测试和演示源端插件功能** Oracle-CDC { plugin_output = "customers" skip_analyze = true username = "system" password = "oracle" database-names = ["XE"] schema-names = ["DEBEZIUM"] table-names = ["XE.DEBEZIUM.FULL_TYPES"] url = "jdbc:oracle:thin:system/oracle@oracle-host:1521:xe" source.reader.close.timeout = 120000 } } ``` ### 支持表的自定义主键 ```conf source { Oracle-CDC { plugin_output = "customers" url = "jdbc:oracle:thin:system/oracle@oracle-host:1521:xe" source.reader.close.timeout = 120000 username = "system" password = "oracle" database-names = ["XE"] schema-names = ["DEBEZIUM"] table-names = ["XE.DEBEZIUM.FULL_TYPES"] table-names-config = [ { table = "XE.DEBEZIUM.FULL_TYPES" primaryKeys = ["ID"] } ] } } ``` ### 支持以兼容 debezium 的格式发送到 kafka > 必须与 kafka 连接器 sink 配合使用,详情请参阅 [兼容 debezium 格式](../formats/cdc-compatible-debezium-json.md) ## 更新日志 ================================================ FILE: docs/zh/connectors/source/Oracle.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Oracle > JDBC Oracle 源连接器 ## 描述 通过 JDBC 读取外部数据源数据。 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) > 支持查询 SQL 并可以实现投影效果。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动 | 连接串 | Maven | |--------|-----------|------|--------|-------| | Oracle | 不同的依赖版本有不同的驱动类 | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@datasource01:1523:xe | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | ## 数据库依赖 ### 对于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 > 2. 要支持 i18n 字符集,请将 `orai18n.jar` 复制到 `$SEATUNNEL_HOME/plugins/` 目录。 ### 对于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 > 2. 要支持 i18n 字符集,请将 `orai18n.jar` 复制到 `$SEATUNNEL_HOME/lib/` 目录。 ## 数据类型映射 | Oracle 数据类型 | SeaTunnel 数据类型 | |-----------------|------------------| | INTEGER | DECIMAL(38,0) | | FLOAT | DECIMAL(38, 18) | | NUMBER(precision <= 9, scale == 0) | INT | | NUMBER(9 < precision <= 18, scale == 0) | BIGINT | | NUMBER(18 < precision, scale == 0) | DECIMAL(38, 0) | | NUMBER(scale != 0) | DECIMAL(38, 18) | | BINARY_DOUBLE | DOUBLE | | BINARY_FLOAT
    REAL | FLOAT | | CHAR
    NCHAR
    VARCHAR
    NVARCHAR2
    VARCHAR2
    LONG
    ROWID
    NCLOB
    CLOB
    XML | STRING | | DATE | TIMESTAMP | | TIMESTAMP
    TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | | BLOB
    RAW
    LONG RAW
    BFILE | BYTES | ## 源选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |--------|------|------|--------|------| | url | String | 是 | - | JDBC 连接的 URL。参考示例:jdbc:oracle:thin:@datasource01:1523:xe | | driver | String | 是 | - | 用于连接到远程数据源的 jdbc 类名,如果您使用 Oracle,值为 `oracle.jdbc.OracleDriver`。 | | username | String | 否 | - | 连接实例用户名 | | password | String | 否 | - | 连接实例密码 | | query | String | 是 | - | 查询语句 | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(秒) | | partition_column | String | 否 | - | 用于并行性分割的列名,仅支持数值类型,仅支持数值类型主键,只能配置一列。 | | partition_lower_bound | BigDecimal | 否 | - | partition_column 的最小值用于扫描,如果未设置,SeaTunnel 将查询数据库获取最小值。 | | partition_upper_bound | BigDecimal | 否 | - | partition_column 的最大值用于扫描,如果未设置,SeaTunnel 将查询数据库获取最大值。 | | partition_num | Int | 否 | job parallelism | 分割数量,仅支持正整数。默认值是任务并行度。 | | fetch_size | Int | 否 | 0 | 对于返回大量对象的查询,您可以配置查询中使用的行提取大小,以通过减少满足选择条件所需的数据库命中次数来提高性能。零表示使用 jdbc 默认值。 | | properties | Map | 否 | - | 其他连接配置参数,当 properties 和 URL 具有相同参数时,优先级由驱动程序的具体实现确定。例如,在 Oracle 中,properties 优先于 URL。 | | use_regex | Boolean | 否 | false | 控制 table_path 的正则表达式匹配。设置为 `true` 时,table_path 将被视为正则表达式模式。设置为 `false` 或未指定时,table_path 将被视为精确路径(无正则表达式匹配)。 | | table_path | String | 否 | - | 表的完整路径,您可以使用此配置代替 `query`。
    示例:
    "test_schema.table1" | | table_list | Array | 否 | - | 要读取的表列表,您可以使用此配置代替 `table_path`。 | | where_condition | String | 否 | - | 所有表/查询的通用行过滤条件,必须以 `where` 开头。 | | split.size | Int | 否 | 8096 | 一个分割中有多少行。 | | common-options | | 否 | - | 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 | ## 变更日志 ================================================ FILE: docs/zh/connectors/source/OssFile.md ================================================ import ChangeLog from '../changelog/connector-file-oss.md'; # OssFile > Oss文件数据源连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 使用依赖 ### 对于Spark/Flink引擎 1. 您必须确保您的spark/flink集群已经集成了hadoop。测试过的hadoop版本是2.x。 2. 您必须确保`hadoop-aliyun-xx.jar`、`aliyun-sdk-oss-xx.jar`和`jdom-xx.jar`在`${SEATUNNEL_HOME}/plugins/`目录中,并且`hadoop-aliyun` jar的版本需要与您在spark/flink中使用的hadoop版本相等,`aliyun-sdk-oss-xx.jar`和`jdom-xx.jar`版本需要是与`hadoop-aliyun`版本对应的版本。例如:`hadoop-aliyun-3.1.4.jar`依赖`aliyun-sdk-oss-3.4.1.jar`和`jdom-1.1.jar`。 ### 对于SeaTunnel Zeta引擎 1. 您必须确保`seatunnel-hadoop3-3.1.4-uber.jar`、`aliyun-sdk-oss-3.4.1.jar`、`hadoop-aliyun-3.1.4.jar`和`jdom-1.1.jar`在`${SEATUNNEL_HOME}/lib/`目录中。 ## 主要特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 在一次pollNext调用中读取分片中的所有数据。将读取的分片保存在快照中。 - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户定义的分片](../../introduction/concepts/connector-v2-features.md) - [x] 文件格式类型 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## 数据类型映射 数据类型映射与正在读取的文件类型相关,我们支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `markdown` ### JSON文件类型 如果您将文件类型指定为`json`,您还应该指定schema选项来告诉连接器如何将数据解析为您想要的行。 例如: 上游数据如下: ```json {"code": 200, "data": "get success", "success": true} ``` 您也可以在一个文件中保存多条数据,并用换行符分隔: ```json lines {"code": 200, "data": "get success", "success": true} {"code": 300, "data": "get failed", "success": false} ``` 您应该按如下方式指定schema: ```hocon schema { fields { code = int data = string success = boolean } } ``` 连接器将生成如下数据: | code | data | success | |------|-------------|---------| | 200 | get success | true | ### 文本或CSV文件类型 如果您将`file_format_type`设置为`text`、`excel`、`csv`、`xml`。那么需要设置`schema`字段来告诉连接器如何将数据解析为行。 
如果您设置了`schema`字段,您还应该设置选项`field_delimiter`,除非`file_format_type`是`csv`、`xml`、`excel` 您可以按如下方式设置schema和分隔符: ```hocon field_delimiter = "#" schema { fields { name = string age = int gender = string } } ``` 连接器将生成如下数据: | name | age | gender | |---------------|-----|--------| | tyrantlucifer | 26 | male | ### Orc文件类型 如果您将文件类型指定为`parquet` `orc`,则不需要schema选项,连接器可以自动找到上游数据的schema。 | Orc数据类型 | SeaTunnel数据类型 | |----------------------------------|-------------------------------| | BOOLEAN | BOOLEAN | | INT | INT | | BYTE | BYTE | | SHORT | SHORT | | LONG | LONG | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | BINARY | BINARY | | STRING
    VARCHAR
    CHAR
    | STRING | | DATE | LOCAL_DATE_TYPE | | TIMESTAMP | LOCAL_DATE_TIME_TYPE | | DECIMAL | DECIMAL | | LIST(STRING) | STRING_ARRAY_TYPE | | LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE | | LIST(TINYINT) | BYTE_ARRAY_TYPE | | LIST(SMALLINT) | SHORT_ARRAY_TYPE | | LIST(INT) | INT_ARRAY_TYPE | | LIST(BIGINT) | LONG_ARRAY_TYPE | | LIST(FLOAT) | FLOAT_ARRAY_TYPE | | LIST(DOUBLE) | DOUBLE_ARRAY_TYPE | | Map | MapType,K和V的类型将转换为SeaTunnel类型 | | STRUCT | SeaTunnelRowType | ### Parquet文件类型 如果您将文件类型指定为`parquet` `orc`,则不需要schema选项,连接器可以自动找到上游数据的schema。 | Parquet数据类型 | SeaTunnel数据类型 | |----------------------|-------------------------------| | INT_8 | BYTE | | INT_16 | SHORT | | DATE | DATE | | TIMESTAMP_MILLIS | TIMESTAMP | | INT64 | LONG | | INT96 | TIMESTAMP | | BINARY | BYTES | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | BOOLEAN | BOOLEAN | | FIXED_LEN_BYTE_ARRAY | TIMESTAMP
    DECIMAL | | DECIMAL | DECIMAL | | LIST(STRING) | STRING_ARRAY_TYPE | | LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE | | LIST(TINYINT) | BYTE_ARRAY_TYPE | | LIST(SMALLINT) | SHORT_ARRAY_TYPE | | LIST(INT) | INT_ARRAY_TYPE | | LIST(BIGINT) | LONG_ARRAY_TYPE | | LIST(FLOAT) | FLOAT_ARRAY_TYPE | | LIST(DOUBLE) | DOUBLE_ARRAY_TYPE | | Map | MapType,K和V的类型将转换为SeaTunnel类型 | | STRUCT | SeaTunnelRowType | ## 选项 | 名称 | 类型 | 是否必需 | 默认值 | 描述 | |----------------------------|---------|------|--------------------|------------------------------------------------------------------------------------------------------------------------------------------------------| | path | string | 是 | - | 需要读取的Oss路径,可以有子路径,但子路径需要满足一定的格式要求。具体要求可以参考"parse_partition_from_path"选项 | | file_format_type | string | 是 | - | 文件类型,支持以下文件类型:`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` | | bucket | string | 是 | - | oss文件系统的bucket地址,例如:`oss://seatunnel-test`。 | | endpoint | string | 是 | - | fs oss端点 | | read_columns | list | 否 | - | 数据源的读取列列表,用户可以使用它来实现字段投影。支持列投影的文件类型如下所示:`text` `csv` `parquet` `orc` `json` `excel` `xml`。如果用户想在读取`text` `json` `csv`文件时使用此功能,必须配置"schema"选项。 | | access_key | string | 否 | - | | | access_secret | string | 否 | - | | | delimiter | string | 否 | \001 | 字段分隔符,用于告诉连接器在读取文本文件时如何切分字段。默认`\001`,与hive的默认分隔符相同。 | | row_delimiter | string | 否 | \n | 行分隔符,用于告诉连接器在读取文本文件时如何切分行。默认`\n`。 | | parse_partition_from_path | boolean | 否 | true | 控制是否从文件路径解析分区键和值。例如,如果您从路径`oss://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`读取文件。文件中的每条记录数据都将添加这两个字段:name="tyrantlucifer",age=26 | | date_format | string | 否 | yyyy-MM-dd | 日期类型格式,用于告诉连接器如何将字符串转换为日期,支持以下格式:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`。默认`yyyy-MM-dd` | | datetime_format | string | 否 | yyyy-MM-dd HH:mm:ss | 日期时间类型格式,用于告诉连接器如何将字符串转换为日期时间,支持以下格式:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` | | time_format | string | 否 | HH:mm:ss | 时间类型格式,用于告诉连接器如何将字符串转换为时间,支持以下格式:`HH:mm:ss`
`HH:mm:ss.SSS` | | filename_extension | string | 否 | - | 过滤文件名扩展名,用于过滤具有特定扩展名的文件。例如:`csv` `.txt` `json` `.xml`。 | | skip_header_row_number | long | 否 | 0 | 跳过前几行,但仅适用于txt和csv。例如,设置如下:`skip_header_row_number = 2`。然后SeaTunnel将跳过源文件的前2行 | | csv_use_header_line | boolean | 否 | false | 是否使用标题行来解析文件,仅在file_format为`csv`且文件包含符合RFC 4180的标题行时使用 | | schema | config | 否 | - | 上游数据的schema。 | | sheet_name | string | 否 | - | 读取工作簿的工作表,仅在file_format为excel时使用。 | | xml_row_tag | string | 否 | - | 指定XML文件中数据行的标签名称,仅在file_format为xml时使用。 | | xml_use_attr_format | boolean | 否 | - | 指定是否使用标签属性格式处理数据,仅在file_format为xml时使用。 | | compress_codec | string | 否 | none | 文件使用的压缩编解码器。 | | encoding | string | 否 | UTF-8 | 仅在file_format_type为json、text、csv、xml时使用。要读取的文件的编码。 | | null_format | string | 否 | - | 仅在file_format_type为text时使用。null_format用于定义哪些字符串可以表示为null。例如:`\N` | | binary_chunk_size | int | 否 | 1024 | 仅在file_format_type为binary时使用。读取二进制文件的块大小(以字节为单位)。默认为1024字节。较大的值可能会提高大文件的性能,但会使用更多内存。 | | binary_complete_file_mode | boolean | 否 | false | 仅在file_format_type为binary时使用。是否将完整文件作为单个块读取,而不是分割成块。启用时,整个文件内容将一次性读入内存。默认为false。 | | file_filter_pattern | string | 否 | - | 过滤模式,用于过滤文件。 | | common-options | config | 否 | - | 数据源插件通用参数,请参考[数据源通用选项](../common-options/source-common-options.md)了解详情。 | | file_filter_modified_start | string | 否 | - | 按照最后修改时间过滤文件。 要过滤的开始时间(包括该时间),时间格式是:`yyyy-MM-dd HH:mm:ss` | | file_filter_modified_end | string | 否 | - | 按照最后修改时间过滤文件。 要过滤的结束时间(不包括该时间),时间格式是:`yyyy-MM-dd HH:mm:ss` | | quote_char | string | 否 | " | 用于包裹 CSV 字段的单字符,可保证包含逗号、换行符或引号的字段被正确解析。 | | escape_char | string | 否 | - | 用于在 CSV 字段内转义引号或其他特殊字符,使其不会结束字段。 | | metalake_type | string | 否 | gravitino | Metalake 服务类型,目前支持 `gravitino`。 | ### compress_codec [string] 文件的压缩编解码器,支持的详细信息如下所示: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: 自动识别压缩类型,无需额外设置。 ### encoding [string] 仅在file_format_type为json、text、csv、xml时使用。 要读取的文件的编码。此参数将由`Charset.forName(encoding)`解析。 ### binary_chunk_size [int] 仅在file_format_type为binary时使用。
读取二进制文件的块大小(以字节为单位)。默认为1024字节。较大的值可能会提高大文件的性能,但会使用更多内存。 ### binary_complete_file_mode [boolean] 仅在file_format_type为binary时使用。 是否将完整文件作为单个块读取,而不是分割成块。启用时,整个文件内容将一次性读入内存。默认为false。 ### file_format_type [string] 文件类型,支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` 如果您将文件类型指定为 `markdown`,SeaTunnel 可以解析 markdown 文件并提取结构化数据。 markdown 解析器提取各种元素,包括标题、段落、列表、代码块、表格等。 每个元素都转换为具有以下架构的行: - `element_id`:元素的唯一标识符 - `element_type`:元素类型(Heading、Paragraph、ListItem 等) - `heading_level`:标题级别(1-6,非标题元素为 null) - `text`:元素的文本内容 - `page_number`:页码(默认:1) - `position_index`:文档中的位置索引 - `parent_id`:父元素的 ID - `child_ids`:子元素 ID 的逗号分隔列表 注意:Markdown 格式仅支持读取,不支持写入。 ### quote_char [string] 用于包裹 CSV 字段的单字符,可保证包含逗号、换行符或引号的字段被正确解析。 ### escape_char [string] 用于在 CSV 字段内转义引号或其他特殊字符,使其不会结束字段。 ### file_filter_pattern [string] 文件过滤模式,用于过滤文件。若只想根据文件名称筛选,则直接写文件名称的正则;若同时想根据文件目录进行过滤,则表达式以`path`起始。 该模式遵循标准正则表达式。详情请参考 https://en.wikipedia.org/wiki/Regular_expression。 以下是一些示例。 若`path`为`/data/seatunnel`,且文件结构示例: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv /data/seatunnel/20241012/logo.png ``` 匹配规则示例: **示例1**:*匹配所有.txt文件*,正则表达式: ``` .*.txt ``` 此示例匹配的结果是: ``` /data/seatunnel/20241001/report.txt ``` **示例2**:*匹配所有以abc开头的文件*,正则表达式: ``` abc.* ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **示例3**:*匹配20241007文件夹下所有以 abc 开头的文件,且第四个字符为 h 或 g*,正则表达式: ``` /data/seatunnel/20241007/abc[h,g].* ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv ``` **示例4**:*匹配以202410开头的第三级文件夹和以.csv结尾的文件*,正则表达式: ``` /data/seatunnel/202410\d*/.*.csv ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### schema [config] 仅在file_format_type为text、json、excel、xml或csv时需要配置(或其他我们无法从元数据读取schema的格式)。 #### fields [Config] 上游数据的schema。更多详情请参考 [Schema 
特性](../../introduction/concepts/schema-feature.md)。 #### schema_url [string] 通过 restApi 获取元数据信息的 http url,例如:`http://localhost:8090/api/metalakes/laowang_test/catalogs/221-pgsql/schemas/ykw/tables/all_type` > 当使用 Gravitino 作为元数据源时,Gravitino 的列类型会自动转换为 SeaTunnel 数据类型。详细的类型映射信息请参考 [Gravitino 类型映射](../../introduction/concepts/gravitino-type-mapping.md)。 ### metalake_type [string] Metalake 服务类型,目前仅支持 `gravitino`。当使用 `schema_url` 从 Gravitino 获取元数据时,可以指定此参数(默认为 `gravitino`)。 有关 Metalake 的更多信息,请参考 [Metalake](../../introduction/concepts/metalake.md)。 ## 如何创建Oss数据同步作业 以下示例演示如何创建从Oss读取数据并在本地客户端打印的数据同步作业: ```bash # 设置要执行的任务的基本配置 env { parallelism = 1 job.mode = "BATCH" } # 创建连接到Oss的数据源 source { OssFile { path = "/seatunnel/orc" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "orc" } } # 控制台打印读取的Oss数据 sink { Console { } } ``` ```bash # 设置要执行的任务的基本配置 env { parallelism = 1 job.mode = "BATCH" } # 创建连接到Oss的数据源 source { OssFile { path = "/seatunnel/json" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "json" schema { fields { id = int name = string } } } } # 控制台打印读取的Oss数据 sink { Console { } } ``` ### 多表 无需配置schema文件类型,例如:`orc`。 ``` env { parallelism = 1 spark.app.name = "SeaTunnel" spark.executor.instances = 2 spark.executor.cores = 1 spark.executor.memory = "1g" spark.master = local job.mode = "BATCH" } source { OssFile { tables_configs = [ { schema = { table = "fake01" } bucket = "oss://whale-ops" access_key = "xxxxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxx" endpoint = "https://oss-accelerate.aliyuncs.com" path = "/test/seatunnel/read/orc" file_format_type = "orc" }, { schema = { table = "fake02" } bucket = "oss://whale-ops" access_key = "xxxxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxx" endpoint = 
"https://oss-accelerate.aliyuncs.com" path = "/test/seatunnel/read/orc" file_format_type = "orc" } ] plugin_output = "fake" } } sink { Assert { rules { table-names = ["fake01", "fake02"] } } } ``` 需要配置schema文件类型,例如:`json` ``` env { execution.parallelism = 1 spark.app.name = "SeaTunnel" spark.executor.instances = 2 spark.executor.cores = 1 spark.executor.memory = "1g" spark.master = local job.mode = "BATCH" } source { OssFile { tables_configs = [ { bucket = "oss://whale-ops" access_key = "xxxxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxx" endpoint = "https://oss-accelerate.aliyuncs.com" path = "/test/seatunnel/read/json" file_format_type = "json" schema = { table = "fake01" fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { C_MAP = "map" C_ARRAY = "array" C_STRING = string C_BOOLEAN = boolean C_TINYINT = tinyint C_SMALLINT = smallint C_INT = int C_BIGINT = bigint C_FLOAT = float C_DOUBLE = double C_BYTES = bytes C_DATE = date C_DECIMAL = "decimal(38, 18)" C_TIMESTAMP = timestamp } } } }, { bucket = "oss://whale-ops" access_key = "xxxxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxx" endpoint = "https://oss-accelerate.aliyuncs.com" path = "/test/seatunnel/read/json" file_format_type = "json" schema = { table = "fake02" fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { C_MAP = "map" C_ARRAY = "array" C_STRING = string C_BOOLEAN = boolean C_TINYINT = tinyint C_SMALLINT = smallint C_INT = int C_BIGINT = bigint C_FLOAT = float C_DOUBLE = double C_BYTES = bytes C_DATE = date C_DECIMAL = "decimal(38, 18)" 
C_TIMESTAMP = timestamp } } } } ] plugin_output = "fake" } } sink { Assert { rules { table-names = ["fake01", "fake02"] } } } ``` ### 过滤文件 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { OssFile { path = "/seatunnel/orc" bucket = "oss://tyrantlucifer-image-bed" access_key = "xxxxxxxxxxxxxxxxx" access_secret = "xxxxxxxxxxxxxxxxxxxxxx" endpoint = "oss-cn-beijing.aliyuncs.com" file_format_type = "orc" // 文件示例 abcD2024.csv file_filter_pattern = "abc[DX]*.*" // 筛选最后修改日期在 20240101 和 20240105 (不包括该日期) 之间的文件 file_filter_modified_start = "2024-01-01 00:00:00" file_filter_modified_end = "2024-01-05 00:00:00" } } sink { Console { } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/OssJindoFile.md ================================================ import ChangeLog from '../changelog/connector-file-oss-jindo.md'; # OssJindoFile > OssJindo 文件源连接器 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [x] [多模态](../../introduction/concepts/connector-v2-features.md#multimodal) 使用二进制文件格式读写任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 在一次 pollNext 调用中读取分割中的所有数据。读取哪些分割将保存在快照中。 - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) - [x] 文件格式类型 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## 描述 使用 Jindo API 从阿里云 OSS 文件系统读取数据。 :::tip 您需要下载 [jindosdk-4.6.1.tar.gz](https://jindodata-binary.oss-cn-shanghai.aliyuncs.com/release/4.6.1/jindosdk-4.6.1.tar.gz) 然后解压缩,从 lib 中复制 jindo-sdk-4.6.1.jar 和 jindo-core-4.6.1.jar 到 ${SEATUNNEL_HOME}/lib。 如果您使用 spark/flink,为了使用此连接器,您必须确保您的 spark/flink 集群已集成 hadoop。测试的 hadoop 版本是 2.x。 如果您使用 SeaTunnel 引擎,它会在您下载和安装 SeaTunnel 引擎时自动集成 hadoop jar。您可以检查 ${SEATUNNEL_HOME}/lib 下的 jar 包来确认这一点。 我们为了支持更多文件类型做了一些权衡,所以我们使用 HDFS 协议来内部访问 OSS,此连接器需要一些 hadoop 依赖项。 它仅支持 hadoop 版本 **2.9.X+**。 ::: ## 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |---------------------------|---------|----|-----------------------------|-------------------------------------------------------------------------------| | path | string | 是 | - | 目标目录路径 | | file_format_type | string | 是 | - | 文件类型 | | bucket | string | 是 | - | OSS 文件系统的桶地址 | | access_key | string | 是 | - | OSS 文件系统的访问密钥 | | access_secret | string | 是 | - | OSS 文件系统的访问密钥 | | endpoint | string | 是 | - | OSS 文件系统的端点 | | read_columns | list | 否 | - | 数据源的读取列列表 | | delimiter/field_delimiter | string | 否 | \001 for text and , for csv | 字段分隔符 | | row_delimiter | string | 否 | \n | 行分隔符 | | parse_partition_from_path | boolean | 否 | true | 控制是否从文件路径解析分区键和值 | | date_format | string | 否 | yyyy-MM-dd | 日期类型格式 | | datetime_format | string 
| 否 | yyyy-MM-dd HH:mm:ss | 日期时间类型格式 | | time_format | string | 否 | HH:mm:ss | 时间类型格式 | | skip_header_row_number | long | 否 | 0 | 跳过前几行 | | schema | config | 否 | - | 上游数据的模式信息。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | sheet_name | string | 否 | - | Excel 工作表名称 | | xml_row_tag | string | 否 | - | XML 行标签 | | xml_use_attr_format | boolean | 否 | - | 是否使用 XML 属性格式 | | csv_use_header_line | boolean | 否 | false | 是否使用 CSV 标题行 | | file_filter_pattern | string | 否 | - | 文件过滤模式 | | quote_char | string | 否 | " | 用于包裹 CSV 字段的单字符,可保证包含逗号、换行符或引号的字段被正确解析。 | | escape_char | string | 否 | - | 用于在 CSV 字段内转义引号或其他特殊字符,使其不会结束字段。 | ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Paimon.md ================================================ import ChangeLog from '../changelog/connector-paimon.md'; # Paimon > Paimon 源连接器 ## 描述 用于从 `Apache Paimon` 读取数据 ### SeaTunnel与Paimon版本对照 | Seatunnel Version | Paimon Version | |-------------------|------------------| | 2.3.2 - 2.3.3 | 0.4-SNAPSHOT | | 2.3.4 | 0.6-SNAPSHOT | | 2.3.5 - 2.3.11 | 0.7.0-incubating | | 2.3.12 - 2.3.13 | 1.1.1 | ### 从 0.7 版本升级到 1.1.1 版本的注意事项 1. **备份建议** 尽管存在兼容性保障,但在从 0.7 版本开始升级前,仍强烈建议备份关键数据,尤其是元数据目录。 2. **逐步升级流程** - **测试环境验证**:首先在测试环境中验证(从 0.7 版本开始的)升级过程。 - **更新 JAR 文件**:将 Paimon 的 JAR 文件替换为 1.1.1 版本。 - **自动格式升级**:系统会自动识别并升级 0.7 版本中使用的文件格式。 3. **配置检查** 检查配置以确认是否存在 0.7 版本适用的已弃用选项。尽管大多数配置保持向后兼容,但已弃用的设置可能需要更新以适配 1.1.1 版本。 4. 
**升级后验证** 从 0.7 版本升级到 1.1.1 版本后,需验证以下内容: - **读写操作**:确保基于 0.7 版本继承的数据结构,数据写入和读取流程正常运行。 - **查询性能**:考虑到 0.7 与 1.1.1 版本间底层机制(如分桶管理)的变化,确认查询响应时间符合预期。 - **新功能验证**:测试所有新增功能(如增强的压实机制、时间旅行等),确保其与从 0.7 版本迁移的数据兼容并正常工作。 **注意**:遵循这些步骤有助于降低风险,确保从 0.7 版本平稳过渡到稳定版本 1.1.1。 ## 主要功能 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) ## 配置选项 | 名称 | 类型 | 是否必须 | 默认值 | |-------------------------|----------|--------|---------------| | warehouse | String | 是 | - | | catalog_type | String | 否 | filesystem | | catalog_uri | String | 否 | - | | database | String | 是 | - | | table | String | 否 | - | | table_list | array | 否 | - | | user | String | 否 | - | | password | String | 否 | - | | hdfs_site_path | String | 否 | - | | query | String | 否 | - | | paimon.hadoop.conf | Map | 否 | - | | paimon.hadoop.conf-path | String | 否 | - | ### warehouse [string] Paimon warehouse 路径 ### catalog_type [string] Paimon Catalog 类型,支持 filesystem 和 hive ### catalog_uri [string] Paimon 的 catalog uri,仅当 catalog_type 为 hive 时需要 ### database [string] 需要访问的数据库 ### table [string] 需要访问的表 ### table_list [array] `Paimon` 表名列表,当需要同时读取多表时使用此配置代替 table ### hdfs_site_path [string] `hdfs-site.xml` 文件地址 ### query [string] 读取表格的筛选条件,例如:`select * from st_test where id > 100`。如果未指定,则将读取所有记录。 目前,`where` 支持`<, <=, >, >=, =, !=, or, and,is null, is not null, between...and, in , not in, like`,其他暂不支持。 Projection 已支持,你可以选择特定的列,例如:select id, name from st_test where id > 100。 由于 Paimon 限制,目前不支持 `Having`, `Group By` 和 `Order By`。 query 参数支持动态参数设置: ```sql SELECT * FROM table /*+ OPTIONS('incremental-between' = 'test-tag1,test-tag2') */; ``` 注意:当 `where` 后的字段为字符串或布尔值时,其值必须使用单引号,否则将会报错。例如 
`name='abc'` 或 `tag='true'`。 当前 `where` 支持的字段数据类型如下: * string * boolean * tinyint * smallint * int * bigint * float * double * date * timestamp * time ### paimon.hadoop.conf [string] hadoop conf 属性 ### paimon.hadoop.conf-path [string] 指定 'core-site.xml', 'hdfs-site.xml', 'hive-site.xml' 文件加载路径。 ## Filesystems Paimon 连接器支持向多个文件系统写入数据。目前,支持的文件系统有 `hdfs` 和 `s3`。 如果使用 `s3` 文件系统,可以在 `paimon.hadoop.conf` 中配置`fs.s3a.access-key`、`fs.s3a.secret-key`、`fs.s3a.endpoint`、`fs.s3a.path.style.access`、`fs.s3a.aws.credentials.provider` 属性,数仓地址应该以 `s3a://` 开头。 ## 示例 ### 简单示例 ```hocon source { Paimon { warehouse = "/tmp/paimon" database = "default" table = "st_test" } } ``` ### 读取多表 ```hocon source { Paimon { warehouse = "/tmp/paimon" database = "default" table_list = [ { table = "table1" query = "select * from table1 where id > 100" }, { table = "table2" query = "select * from table2 where id > 100" } ] } } ``` ### Filter 示例 ```hocon source { Paimon { warehouse = "/tmp/paimon" database = "full_type" table = "st_test" query = "select c_boolean, c_tinyint from st_test where c_boolean= 'true' and c_tinyint > 116 and c_smallint = 15987 or c_decimal='2924137191386439303744.39292213'" } } ``` ### S3 示例 ```hocon env { execution.parallelism = 1 job.mode = "BATCH" } source { Paimon { warehouse = "s3a://test/" database = "seatunnel_namespace11" table = "st_test" paimon.hadoop.conf = { fs.s3a.access-key=G52pnxg67819khOZ9ezX fs.s3a.secret-key=SHJuAQqHsLrgZWikvMa3lJf5T0NfM5LMFliJh9HF fs.s3a.endpoint="http://minio4:9000" fs.s3a.path.style.access=true fs.s3a.aws.credentials.provider=org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider } } } sink { Console{} } ``` ### Hadoop 配置示例 ```hocon source { Paimon { catalog_name="seatunnel_test" warehouse="hdfs:///tmp/paimon" database="seatunnel_namespace1" table="st_test" query = "select * from st_test where pk_id is not null and pk_id < 3" paimon.hadoop.conf = { hadoop_user_name = "hdfs" fs.defaultFS = "hdfs://nameservice1" dfs.nameservices = 
"nameservice1" dfs.ha.namenodes.nameservice1 = "nn1,nn2" dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" dfs.client.use.datanode.hostname = "true" } } } ``` ### Hive catalog 示例 ```hocon source { Paimon { catalog_name="seatunnel_test" catalog_type="hive" catalog_uri="thrift://hadoop04:9083" warehouse="hdfs:///tmp/seatunnel" database="seatunnel_test" table="st_test3" paimon.hadoop.conf = { fs.defaultFS = "hdfs://nameservice1" dfs.nameservices = "nameservice1" dfs.ha.namenodes.nameservice1 = "nn1,nn2" dfs.namenode.rpc-address.nameservice1.nn1 = "hadoop03:8020" dfs.namenode.rpc-address.nameservice1.nn2 = "hadoop04:8020" dfs.client.failover.proxy.provider.nameservice1 = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" dfs.client.use.datanode.hostname = "true" } } } ``` ### paimon开启权限示例 ```hocon source { Paimon { warehouse = "/tmp/paimon" database = "default" table = "st_test" user = "paimon" password = "******" } } ``` ## Changelog 如果要读取 paimon 表的 changelog,首先要为 Paimon 源表设置 `changelog-producer`,然后使用 SeaTunnel 流任务读取。 ### Note 目前,批读取总是读取最新的快照,如需读取更完整的 changelog 数据,需使用流读取,并在将数据写入 Paimon 表之前开始流读取,为了确保顺序,流读取任务并行度应该设置为 1。 ### Streaming read 示例 ```hocon env { parallelism = 1 job.mode = "Streaming" } source { Paimon { warehouse = "/tmp/paimon" database = "full_type" table = "st_test" } } sink { Paimon { warehouse = "/tmp/paimon" database = "full_type" table = "st_test_sink" paimon.table.primary-keys = "c_tinyint" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Persistiq.md ================================================ import ChangeLog from '../changelog/connector-http-persistiq.md'; # Persistiq > Persistiq 源连接器 ## 描述 用于从 Persistiq 读取数据。 ## 关键特性 - [x] 
[批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [模式投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |-----------------------------|---------|----|-------|---------------------------------------------------------------------------------------------| | url | String | 是 | - | HTTP 请求 URL | | password | String | 是 | - | API 密钥用于登录 | | method | String | 否 | get | HTTP 请求方法,仅支持 GET、POST 方法 | | schema | Config | 否 | - | HTTP 和 SeaTunnel 数据结构映射。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | schema.fields | Config | 否 | - | 上游数据的模式字段 | | format | String | 否 | json | 上游数据的格式,现在仅支持 `json` `text`,默认 `json`。 | | params | Map | 否 | - | HTTP 参数 | | body | String | 否 | - | HTTP 请求体 | | json_field | Config | 否 | - | JSON 字段配置 | | content_json | String | 否 | - | 内容 JSON 配置 | | poll_interval_millis | int | 否 | - | 流模式下请求 HTTP API 的间隔(毫秒) | | retry | int | 否 | - | 如果 HTTP 请求返回 `IOException` 的最大重试次数 | | retry_backoff_multiplier_ms | int | 否 | 100 | HTTP 请求失败时的重试退避倍数(毫秒) | | retry_backoff_max_ms | int | 否 | 10000 | HTTP 请求失败时的最大重试退避时间(毫秒) | | enable_multi_lines | boolean | 否 | false | 是否启用多行模式 | | common-options | config | 否 | - | 源插件通用参数 | ### url [String] HTTP 请求 URL ### password [String] API 密钥用于登录,您可以在 Persistiq 网站获取 ### method [String] HTTP 请求方法,仅支持 GET、POST 方法 ### params [Map] HTTP 参数 ### body [String] HTTP 请求体 ### poll_interval_millis [int] 流模式下请求 HTTP API 的间隔(毫秒) ### retry [int] 如果 HTTP 请求返回 `IOException` 的最大重试次数 ### retry_backoff_multiplier_ms [int] HTTP 请求失败时的重试退避倍数(毫秒) ### retry_backoff_max_ms [int] HTTP 请求失败时的最大重试退避时间(毫秒) ### format [String] 上游数据的格式,现在仅支持 `json` `text`,默认 `json`。 ### schema [Config] #### fields [Config] 
上游数据的模式字段。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 ### content_json [String] 此参数可以获取一些 JSON 数据。 ### json_field [Config] 此参数帮助您配置模式,因此此参数必须与 schema 一起使用。 ### 通用选项 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 了解详情。 ## 示例 ```hocon source { Persistiq{ url = "https://api.persistiq.com/v1/users" password = "Your password" content_field = "$.users.*" schema = { fields { id = string name = string email = string activated = boolean default_mailbox_id = string salesforce_id = string } } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Phoenix.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Phoenix > Phoenix 源连接器 ## 描述 通过[Jdbc连接器](Jdbc.md)读取Phoenix数据。支持批处理模式和流模式。测试的Phoenix版本是4.xx和5.xx。 在底层实现上,通过Phoenix的jdbc驱动程序,执行upsert语句将数据写入HBase。 使用Java JDBC连接Phoenix有两种方法:一种是通过JDBC连接到zookeeper,另一种是使用JDBC thin 客户端连接到 queryserver。 > 提示:默认情况下,使用(thin)驱动程序jar。如果要使用(thick)驱动程序或Phoenix(thin)驱动程序的其他版本,则需要重新编译jdbc连接器模块 ## 关键特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) 支持查询SQL,可以实现投影效果。
- [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) ## 选项 ### driver [string] 如果使用phoenix(thick)驱动程序,则值为`org.apache.phoenix.jdbc.PhoenixDriver` 或您使用的(thin)驱动程序的值是 `org.apache.phoenix.queryserver.client.Driver` ### url [string] 如果您使用phoenix(thick)驱动程序,则值为 `jdbc:phoenix:localhost:2182/hbase` ,或者您使用(thin)驱动程序时,值为 `jdbc:phoenix:thin:url=http://localhost:8765;serialization=PROTOBUF` ### common options 源插件常用参数,详见 [Source Common Options](../common-options/source-common-options.md) ## 示例 使用 thick 客户端驱动器 ``` Jdbc { driver = org.apache.phoenix.jdbc.PhoenixDriver url = "jdbc:phoenix:localhost:2182/hbase" query = "select age, name from test.source" } ``` 使用 thin 客户端驱动器 ``` Jdbc { driver = org.apache.phoenix.queryserver.client.Driver url = "jdbc:phoenix:thin:url=http://spark_e2e_phoenix_sink:8765;serialization=PROTOBUF" query = "select age, name from test.source" } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/PostgreSQL-CDC.md ================================================ import ChangeLog from '../changelog/connector-cdc-postgres.md'; # PostgreSQL CDC > PostgreSQL CDC 源连接器 ## 支持的引擎 > SeaTunnel Zeta
    > Flink
    ## 主要特性 - [ ] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户定义的拆分](../../introduction/concepts/connector-v2-features.md) ## 描述 Postgre CDC 连接器允许从 Postgre 数据库读取快照数据和增量数据。本文件描述了如何设置 Postgre CDC 连接器,以便对 Postgre 数据库执行 SQL 查询。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动 | Url | Maven | |------------|-----------------------------------------------------|---------------------|---------------------------------------|--------------------------------------------------------------------------| | PostgreSQL | 不同的依赖版本有不同的驱动类。 | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [下载](https://mvnrepository.com/artifact/org.postgresql/postgresql) | | PostgreSQL | 如果您想在 PostgreSQL 中操作 GEOMETRY/GEOGRAPHY 类型。 | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [下载](https://mvnrepository.com/artifact/net.postgis/postgis-jdbc) | ## 使用依赖 ### 安装 Jdbc 驱动 #### 对于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/org.postgresql/postgresql) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 #### 对于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/org.postgresql/postgresql) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 请下载并将 PostgreSQL 驱动放入 `${SEATUNNEL_HOME}/lib/` 目录。例如:cp postgresql-xxx.jar `$SEATUNNEL_HOME/lib/` > 以下是启用 PostgreSQL 中的 CDC(变化数据捕获)的步骤: 1. 确保 wal_level 设置为 logical:通过在 postgresql.conf 配置文件中添加 "wal_level = logical" 来修改,重启 PostgreSQL 服务器以使更改生效。 或者,您可以使用 SQL 命令直接修改配置: ```sql ALTER SYSTEM SET wal_level TO 'logical'; SELECT pg_reload_conf(); ``` 2. 
将指定表的 REPLICA 策略更改为 FULL ```sql ALTER TABLE your_table_name REPLICA IDENTITY FULL; ``` ## 数据类型映射 | PostgreSQL 数据类型 | SeaTunnel 数据类型 | |-----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| | BOOL
    | BOOLEAN | | _BOOL
    | ARRAY<BOOLEAN> | | BYTEA
    | BYTES | | _BYTEA
    | ARRAY<TINYINT> | | INT2
    SMALLSERIAL
    INT4
    SERIAL
    | INT | | _INT2
    _INT4
    | ARRAY<INT> | | INT8
    BIGSERIAL
    | BIGINT | | _INT8
    | ARRAY<BIGINT> | | FLOAT4
    | FLOAT | | _FLOAT4
    | ARRAY<FLOAT> | | FLOAT8
    | DOUBLE | | _FLOAT8
    | ARRAY<DOUBLE> | | NUMERIC(指定列的列大小>0) | DECIMAL(指定列的列大小, 获取指定列小数点右侧的位数) | | NUMERIC(指定列的列大小<0) | DECIMAL(38, 18) | | BPCHAR
    CHARACTER
    VARCHAR
    TEXT
    GEOMETRY
    GEOGRAPHY
    JSON
    JSONB | STRING | | _BPCHAR
    _CHARACTER
    _VARCHAR
    _TEXT | ARRAY<STRING> | | TIMESTAMP
    | TIMESTAMP | | TIME
    | TIME | | DATE
    | DATE | | 其他数据类型 | 尚不支持 | ## 源选项 | 名称 | 类型 | 必需 | 默认 | 描述 | |-------------------------------------------|----------|------|----------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC 连接的 URL。参考案例:`jdbc:postgresql://localhost:5432/postgres_cdc?loggerLevel=OFF`。 | | username | String | 是 | - | 连接到数据库服务器时使用的数据库名称。 | | password | String | 是 | - | 连接到数据库服务器时使用的密码。 | | database-names | List | 否 | - | 需要监控的数据库名称。 | | table-names | List | 是 | - | 需要监控的数据库表名称。表名称需要包含数据库名称,例如:`database_name.table_name`。 | | table-names-config | List | 否 | - | 表配置列表。例如: [{"table": "db1.schema1.table1","primaryKeys": ["key1"],"snapshotSplitColumn": "key2"}] | | startup.mode | List | 否 | INITIAL | PostgreSQL CDC 消费者的可选启动模式,有效枚举为 `initial`、`earliest` 和 `latest`。
    `initial`: 启动时同步历史数据,然后同步增量数据。
    `earliest`: 从可能的最早偏移量启动。
    `latest`: 从最新偏移量启动。 | | snapshot.split.size | Integer | 否 | 8096 | 表快照的拆分大小(行数),捕获的表在读取表快照时被拆分成多个拆分。 | | snapshot.fetch.size | Integer | 否 | 1024 | 读取表快照时每次轮询的最大获取大小。 | | slot.name | String | 否 | - | 为特定数据库/模式创建的用于流式传输更改的 PostgreSQL 逻辑解码槽的名称。服务器使用此槽将事件流式传输到您正在配置的连接器。默认值为 seatunnel。 | | decoding.plugin.name | String | 否 | pgoutput | 安装在服务器上的 Postgres 逻辑解码插件的名称,支持的值有 decoderbufs、wal2json、wal2json_rds、wal2json_streaming、wal2json_rds_streaming 和 pgoutput。 | | server-time-zone | String | 否 | UTC | 数据库服务器中的会话时区。如果未设置,则使用 ZoneId.systemDefault() 来确定服务器时区。 | | connect.timeout.ms | Duration | 否 | 30000 | 连接器在尝试连接到数据库服务器后应等待的最大时间,以防超时。 | | connect.max-retries | Integer | 否 | 3 | 连接器应重试建立数据库服务器连接的最大重试次数。 | | connection.pool.size | Integer | 否 | 20 | JDBC 连接池大小。 | | chunk-key.even-distribution.factor.upper-bound | Double | 否 | 100 | 块键分布因子的上限。此因子用于确定表数据是否均匀分布。如果计算出的分布因子小于或等于此上限(即 (MAX(id) - MIN(id) + 1) / 行数),则将优化表块以实现均匀分布。否则,如果分布因子更大,则将认为该表分布不均匀,并且如果估计的分片数量超过 `sample-sharding.threshold` 指定的值,则将使用基于采样的分片策略。默认值为 100.0。 | | chunk-key.even-distribution.factor.lower-bound | Double | 否 | 0.05 | 块键分布因子的下限。此因子用于确定表数据是否均匀分布。如果计算出的分布因子大于或等于此下限(即 (MAX(id) - MIN(id) + 1) / 行数),则将优化表块以实现均匀分布。否则,如果分布因子更小,则将认为该表分布不均匀,并且如果估计的分片数量超过 `sample-sharding.threshold` 指定的值,则将使用基于采样的分片策略。默认值为 0.05。 | | sample-sharding.threshold | Integer | 否 | 1000 | 此配置指定触发采样分片策略的估计分片数量阈值。当分布因子超出由 `chunk-key.even-distribution.factor.upper-bound` 和 `chunk-key.even-distribution.factor.lower-bound` 指定的范围,且估计的分片数量(计算为近似行数 / 块大小)超过此阈值时,将使用采样分片策略。这可以帮助更有效地处理大数据集。默认值为 1000 个分片。 | | inverse-sampling.rate | Integer | 否 | 1000 | 在采样分片策略中使用的采样率的倒数。例如,如果此值设置为 1000,则意味着在采样过程中应用 1/1000 的采样率。此选项提供了控制采样粒度的灵活性,从而影响最终的分片数量。在处理非常大数据集时,较低的采样率尤为有用。默认值为 1000。 | | exactly_once | Boolean | 否 | false | 启用精确一次语义。 | | format | Enum | 否 | DEFAULT | PostgreSQL CDC 的可选输出格式,有效枚举为 `DEFAULT`、`COMPATIBLE_DEBEZIUM_JSON`。 | | debezium | Config | 否 | - | 将 [Debezium 
的属性](https://github.com/debezium/debezium/blob/v1.9.8.Final/documentation/modules/ROOT/pages/connectors/postgresql.adoc#connector-configuration-properties) 传递给用于捕获 PostgreSQL 服务器数据更改的 Debezium 嵌入式引擎。 | | common-options | | 否 | - | 源插件的公共参数,请参阅 [源公共选项](../common-options/source-common-options.md) 获取详细信息。 | ## 任务示例 ### 简单 > 支持多表读取 ``` env { # You can set engine configuration here execution.parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { Postgres-CDC { plugin_output = "customers_Postgre_cdc" username = "postgres" password = "postgres" database-names = ["postgres_cdc"] schema-names = ["inventory"] table-names = ["postgres_cdc.inventory.postgres_cdc_table_1,postgres_cdc.inventory.postgres_cdc_table_2"] url = "jdbc:postgresql://postgres_cdc_e2e:5432/postgres_cdc?loggerLevel=OFF" } } transform { } sink { jdbc { plugin_input = "customers_Postgre_cdc" url = "jdbc:postgresql://postgres_cdc_e2e:5432/postgres_cdc?loggerLevel=OFF" driver = "org.postgresql.Driver" username = "postgres" password = "postgres" generate_sink_sql = true # You need to configure both database and table database = postgres_cdc schema = "inventory" tablePrefix = "sink_" primary_keys = ["id"] } } ``` ### 支持自定义表的主键 ``` source { Postgres-CDC { plugin_output = "customers_mysql_cdc" username = "postgres" password = "postgres" database-names = ["postgres_cdc"] schema-names = ["inventory"] table-names = ["postgres_cdc.inventory.full_types_no_primary_key"] url = "jdbc:postgresql://postgres_cdc_e2e:5432/postgres_cdc?loggerLevel=OFF" decoding.plugin.name = "decoderbufs" exactly_once = false table-names-config = [ { table = "postgres_cdc.inventory.full_types_no_primary_key" primaryKeys = ["id"] } ] } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/PostgreSQL.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # 
PostgreSQL > JDBC PostgreSQL 源连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 使用依赖 ### 对于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc 驱动的jar 包](https://mvnrepository.com/artifact/org.postgresql/postgresql) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 ### 对于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/org.postgresql/postgresql) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 ## 主要特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [严格一次性](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户定义的拆分](../../introduction/concepts/connector-v2-features.md) > 支持查询 SQL,并可以实现投影效果。 ## 描述 通过 JDBC 读取外部数据源数据。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动 | URL | Maven | |----------------|----------------------------------------------------|---------------------|---------------------------------------|--------------------------------------------------------------------------| | PostgreSQL | 不同的依赖版本有不同的驱动类。 | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [下载](https://mvnrepository.com/artifact/org.postgresql/postgresql) | | PostgreSQL | 如果您想在 PostgreSQL 中操作 GEOMETRY 类型。 | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [下载](https://mvnrepository.com/artifact/net.postgis/postgis-jdbc) | ## 数据库依赖 > 请下载与 'Maven' 对应的支持列表,并将其复制到 '$SEATUNNEL_HOME/plugins/jdbc/lib/' 工作目录中
    > 例如,对于 PostgreSQL 数据源: cp postgresql-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/
    > 如果您想在 PostgreSQL 中操作 GEOMETRY 类型,请将 postgresql-xxx.jar 和 postgis-jdbc-xxx.jar 添加到 $SEATUNNEL_HOME/plugins/jdbc/lib/ ## 数据类型映射 | PostgreSQL 数据类型 | SeaTunnel 数据类型 | |--------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| | BOOL
    | BOOLEAN | | _BOOL
    | ARRAY<BOOLEAN> | | BYTEA
    | BYTES | | _BYTEA
    | ARRAY<TINYINT> | | INT2
    SMALLSERIAL | SMALLINT | | _INT2 | ARRAY<SMALLINT> | | INT4
    SERIAL
    | INT | | _INT4
    | ARRAY<INT> | | INT8
    BIGSERIAL
    | BIGINT | | _INT8
    | ARRAY<BIGINT> | | FLOAT4
    | FLOAT | | _FLOAT4
    | ARRAY<FLOAT> | | FLOAT8
    | DOUBLE | | _FLOAT8
    | ARRAY<DOUBLE> | | NUMERIC(指定列的列大小>0) | DECIMAL(指定列的列大小,获取指定列小数点右侧的数字位数) | | NUMERIC(指定列的列大小<0) | DECIMAL(38, 18) | | BPCHAR
    CHARACTER
    VARCHAR
    TEXT
    GEOMETRY
    GEOGRAPHY
    JSON
    JSONB
    UUID | STRING | | _BPCHAR
    _CHARACTER
    _VARCHAR
    _TEXT | ARRAY<STRING> | | TIMESTAMP(s)
    TIMESTAMPTZ(s) | TIMESTAMP(s) | | TIME(s)
    TIMETZ(s) | TIME(s) | | DATE
    | DATE | ## 选项 | 名称 | 类型 | 必需 | 默认 | 描述 | |--------------------------------------------|------------|------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC 连接的 URL。参考示例:jdbc:postgresql://localhost:5432/test | | driver | String | 是 | - | 用于连接到远程数据源的 JDBC 类名,
    如果您使用 PostgreSQL,则值为 `org.postgresql.Driver`。 | | username | String | 否 | - | 连接实例的用户名 | | password | String | 否 | - | 连接实例的密码 | | query | String | 是 | - | 查询语句 | | connection_check_timeout_sec | Int | 否 | 30 | 用于验证连接的数据库操作完成的等待时间(秒) | | partition_column | String | 否 | - | 用于并行化的分区列名,仅支持数字类型,
    仅支持数字类型主键,并且只能配置一列。 | | partition_lower_bound | BigDecimal | 否 | - | 扫描的 partition_column 的最小值,如果未设置,SeaTunnel 将查询数据库获取最小值。 | | partition_upper_bound | BigDecimal | 否 | - | 扫描的 partition_column 的最大值,如果未设置,SeaTunnel 将查询数据库获取最大值。 | | partition_num | Int | 否 | 作业并行性 | 分区数量,仅支持正整数。默认值为作业并行性 | | fetch_size | Int | 否 | 0 | 对于返回大量对象的查询,您可以配置
    用于查询的行抓取大小,以通过减少所需的数据库访问次数来提高性能。
    0 表示使用 JDBC 默认值。 | | properties | Map | 否 | - | 其他连接配置参数,当属性和 URL 具有相同参数时,
    优先级由驱动程序的具体实现决定。在 MySQL 中,属性优先于 URL。 | | use_regex | Boolean | 否 | false | 控制表路径的正则表达式匹配。当设置为true时,table_path 将被视为正则表达式模式。当设置为false或未指定时,table_path 将被视为精确路径(不进行正则匹配)。 | | table_path | String | 否 | - | 表的完整路径,您可以使用此配置替代 `query`。
    示例:
    "testdb.test_schema.table1" | | table_list | Array | 否 | - | 要读取的表列表,您可以使用此配置替代 `table_path`。示例:```[{ table_path = "testdb.table1"}, {table_path = "testdb.table2", query = "select id, name from testdb.table2"}]``` | | where_condition | String | 否 | - | 所有表/查询的通用行过滤条件,必须以 `where` 开头。 例如 `where id > 100` | | split.size | Int | 否 | 8096 | 表的拆分大小(行数),被捕获的表在读取时被拆分为多个拆分。 | | split.even-distribution.factor.lower-bound | Double | 否 | 0.05 | 块键分布因子的下限。此因子用于确定表数据是否均匀分布。
    如果计算出的分布因子大于或等于此下限(即 (MAX(id) - MIN(id) + 1) / 行数),则表块将优化为均匀分布。否则,如果分布因子较小,则将视为不均匀分布,当估计的分片数超过 `sample-sharding.threshold` 指定的值时,将使用基于采样的分片策略。默认值为 0.05。 | | split.even-distribution.factor.upper-bound | Double | 否 | 100 | 块键分布因子的上限。此因子用于确定表数据是否均匀分布。
    如果计算出的分布因子小于或等于此上限(即 (MAX(id) - MIN(id) + 1) / 行数),则表块将优化为均匀分布。否则,如果分布因子较大,则将视为不均匀分布,当估计的分片数超过 `sample-sharding.threshold` 指定的值时,将使用基于采样的分片策略。默认值为 100.0。 | | split.sample-sharding.threshold | Int | 否 | 10000 | 此配置指定触发样本分片策略的估计分片数阈值。
    当分布因子超出 `split.even-distribution.factor.upper-bound` 和 `split.even-distribution.factor.lower-bound` 指定的范围时,且估计的分片数(计算为近似行数 / 块大小)超过此阈值,将使用样本分片策略。这可以帮助更高效地处理大数据集。默认值为 1000 个分片。 | | split.inverse-sampling.rate | Int | 否 | 1000 | 在样本分片策略中使用的采样率的逆数。例如,如果此值设置为 1000,表示在采样过程中应用 1/1000 的采样率。此选项提供了控制采样粒度的灵活性,从而影响最终的分片数量。在处理非常大的数据集时,较低的采样率尤其有用。默认值为 1000。 | | ## 并行读取器 JDBC 源连接器支持从表中并行读取数据。SeaTunnel 将使用某些规则来拆分表中的数据,这些数据将交给读取器进行读取。读取器的数量由 `parallelism` 选项确定。 **拆分键规则:** 1. 如果 `partition_column` 不为 null,将用于计算拆分。该列必须属于 **支持的拆分数据类型**。 2. 如果 `partition_column` 为 null,SeaTunnel 将从表中读取模式并获取主键和唯一索引。如果主键和唯一索引中有多列,则使用第一个属于 **支持的拆分数据类型** 的列来拆分数据。例如,表有主键(nn guid, name varchar),因为 `guid` 不在 **支持的拆分数据类型** 中,因此将使用列 `name` 来拆分数据。 **支持的拆分数据类型:** * 字符串 * 数字(int, bigint, decimal, ...) * 日期 ### 与拆分相关的选项 #### split.size 每个拆分中有多少行,当读取表时,被捕获的表将拆分为多个拆分。 #### split.even-distribution.factor.lower-bound > 不推荐使用 块键分布因子的下限。此因子用于确定表数据是否均匀分布。如果计算出的分布因子大于或等于此下限(即 (MAX(id) - MIN(id) + 1) / 行数),则表块将优化为均匀分布。否则,如果分布因子较小,则将视为不均匀分布,当估计的分片数超过 `split.sample-sharding.threshold` 指定的值时,将使用基于采样的分片策略。默认值为 0.05。 #### split.even-distribution.factor.upper-bound > 不推荐使用 块键分布因子的上限。此因子用于确定表数据是否均匀分布。如果计算出的分布因子小于或等于此上限(即 (MAX(id) - MIN(id) + 1) / 行数),则表块将优化为均匀分布。否则,如果分布因子较大,则将视为不均匀分布,当估计的分片数超过 `split.sample-sharding.threshold` 指定的值时,将使用基于采样的分片策略。默认值为 100.0。 #### split.sample-sharding.threshold 此配置指定触发样本分片策略的估计分片数阈值。当分布因子超出 `split.even-distribution.factor.upper-bound` 和 `split.even-distribution.factor.lower-bound` 指定的范围时,且估计的分片数(计算为近似行数 / 块大小)超过此阈值,将使用样本分片策略。这可以帮助更高效地处理大数据集。默认值为 1000 个分片。 #### split.inverse-sampling.rate 在样本分片策略中使用的采样率的逆数。例如,如果此值设置为 1000,表示在采样过程中应用 1/1000 的采样率。此选项提供了控制采样粒度的灵活性,从而影响最终的分片数量。在处理非常大的数据集时,较低的采样率尤其有用。默认值为 1000。 #### partition_column [字符串] 用于拆分数据的列名。 #### partition_upper_bound [BigDecimal] 扫描的 partition_column 最大值,如果未设置,SeaTunnel 将查询数据库获取最大值。 #### partition_lower_bound [BigDecimal] 扫描的 partition_column 最小值,如果未设置,SeaTunnel 将查询数据库获取最小值。 #### partition_num [整数] > 不推荐使用,正确的方法是通过 `split.size` 控制拆分数量 
我们需要拆分成多少个拆分,仅支持正整数。默认值为作业并行性。 ## 提示 > 如果表无法拆分(例如,表没有主键或唯一索引,并且未设置 `partition_column`),将以单一并发运行。 > > 使用 `table_path` 替代 `query` 进行单表读取。如果需要读取多个表,请使用 `table_list`。 ## 任务示例 ### 简单示例 > 此示例查询您测试 "database" 中 type_bin 为 'table' 的 16 条数据,并以单并行方式查询其所有字段。您还可以指定要查询的字段,以便最终输出到控制台。 ``` # Defining the runtime environment env { parallelism = 4 job.mode = "BATCH" } source{ Jdbc { url = "jdbc:postgresql://localhost:5432/test" driver = "org.postgresql.Driver" username = "root" password = "test" query = "select * from source limit 16" } } transform { # please go to https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} } ``` ### 按 partition_column 并行读取 > 使用您配置的分片字段和分片数据并行读取查询表。如果您想要读取整个表,可以这样做。 ``` env { parallelism = 4 job.mode = "BATCH" } source{ jdbc{ url = "jdbc:postgresql://localhost:5432/test" driver = "org.postgresql.Driver" username = "root" password = "test" query = "select * from source" partition_column= "id" partition_num = 5 } } sink { Console {} } ``` ### 按主键或唯一索引并行读取 > 配置 `table_path` 将启用自动拆分,您可以配置 `split.*` 来调整拆分策略。 ``` env { parallelism = 4 job.mode = "BATCH" } source { Jdbc { url = "jdbc:postgresql://localhost:5432/test" driver = "org.postgresql.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" table_path = "test.public.AllDataType_1" query = "select * from public.AllDataType_1" split.size = 10000 } } sink { Console {} } ``` ### 并行的同时指定边界 > 在查询中指定上下边界内的数据更为高效。根据您配置的上下边界读取数据源将更为高效。 ``` source{ jdbc{ url = "jdbc:postgresql://localhost:5432/test" driver = "org.postgresql.Driver" username = "root" password = "test" query = "select * from source" partition_column= "id" # The name of the table returned plugin_output = "jdbc" partition_lower_bound = 1 partition_upper_bound = 50 partition_num = 5 } } ``` ### 多表读取 ***配置 `table_list` 将启用自动拆分,您可以配置 `split.*` 来调整拆分策略*** ```hocon env { job.mode = "BATCH" parallelism = 4 } source { Jdbc { url="jdbc:postgresql://datasource01:5432/demo" username="iDm82k6Q0Tq+wUprWnPsLQ==" 
driver="org.postgresql.Driver" password="iDm82k6Q0Tq+wUprWnPsLQ==" "table_list"=[ { "table_path"="demo.public.AllDataType_1" }, { "table_path"="demo.public.alldatatype" } ] #where_condition= "where id > 100" split.size = 10000 #split.even-distribution.factor.upper-bound = 100 #split.even-distribution.factor.lower-bound = 0.05 #split.sample-sharding.threshold = 1000 #split.inverse-sampling.rate = 1000 } } sink { Console {} } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Prometheus.md ================================================ import ChangeLog from '../changelog/connector-prometheus.md'; # Prometheus > Prometheus 数据源连接器 ## 描述 用于读取prometheus数据。 ## 主要特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [并行](../../introduction/concepts/connector-v2-features.md) ## 源选项 | 名称 | 类型 | 是否必填 | 默认值 | |-----------------------------|---------|------|-----------------| | url | String | Yes | - | | query | String | Yes | - | | query_type | String | Yes | Instant | | content_field | String | Yes | $.data.result.* | | schema.fields | Config | Yes | - | | format | String | No | json | | params | Map | Yes | - | | poll_interval_millis | int | No | - | | retry | int | No | - | | retry_backoff_multiplier_ms | int | No | 100 | | retry_backoff_max_ms | int | No | 10000 | | enable_multi_lines | boolean | No | false | | common-options | config | No | | ### url [String] http 请求路径。 ### query [String] Prometheus 表达式查询字符串 ### query_type [String] Instant/Range 1. Instant : 简单指标的即时查询。 2. 
Range : 一段时间内指标数据。 https://prometheus.io/docs/prometheus/latest/querying/api/ ### params [Map] http 请求参数 ### poll_interval_millis [int] 流模式下请求HTTP API间隔(毫秒) ### retry [int] 请求http返回 `IOException` 时的最大重试次数 ### retry_backoff_multiplier_ms [int] http请求失败,重试回退时间的乘数(毫秒) ### retry_backoff_max_ms [int] http请求失败,最大重试回退时间(毫秒) ### format [String] 上游数据的格式,默认为json。 ### schema [Config] 按照如下填写一个固定值 ```hocon schema = { fields { metric = "map" value = double time = long } } ``` #### fields [Config] 上游数据的模式字段 ### common options 源插件常用参数,请参考[Source Common Options](../common-options/source-common-options.md) 了解详细信息 ## 示例 ### Instant ```hocon source { Prometheus { plugin_output = "http" url = "http://mockserver:1080" query = "up" query_type = "Instant" content_field = "$.data.result.*" format = "json" schema = { fields { metric = "map" value = double time = long } } } } ``` ### Range ```hocon source { Prometheus { plugin_output = "http" url = "http://mockserver:1080" query = "up" query_type = "Range" content_field = "$.data.result.*" format = "json" start = "2024-07-22T20:10:30.781Z" end = "2024-07-22T20:11:00.781Z" step = "15s" schema = { fields { metric = "map" value = double time = long } } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Pulsar.md ================================================ import ChangeLog from '../changelog/connector-pulsar.md'; # Apache Pulsar > Apache Pulsar 源连接器 ## 描述 Apache Pulsar 的源连接器。 ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [x] [流](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | 
|--------------------------|---------|----|--------|--------------------------------------------------------------------------------------| | topic | String | 否 | - | 主题名称 | | topic-pattern | String | 否 | - | 主题名称的正则表达式模式 | | topic-discovery.interval | Long | 否 | -1 | 发现新主题分区的间隔(毫秒) | | subscription.name | String | 是 | - | 订阅名称 | | client.service-url | String | 是 | - | Pulsar 服务 URL | | admin.service-url | String | 是 | - | Pulsar 管理端点的 HTTP URL | | auth.plugin-class | String | 否 | - | 认证插件的名称 | | auth.params | String | 否 | - | 认证插件的参数 | | poll.timeout | Integer | 否 | 100 | 获取记录时的最大等待时间(毫秒) | | poll.interval | Long | 否 | 50 | 获取记录时的间隔时间(毫秒) | | poll.batch.size | Integer | 否 | 500 | 轮询时要获取的最大记录数 | | cursor.startup.mode | Enum | 否 | LATEST | 启动模式 | | cursor.startup.timestamp | Long | 否 | - | 启动时间戳(毫秒) | | cursor.reset.mode | Enum | 否 | LATEST | 游标重置策略 | | cursor.stop.mode | Enum | 否 | NEVER | 停止模式 | | cursor.stop.timestamp | Long | 否 | - | 停止时间戳(毫秒) | | schema | config | 否 | - | 数据结构,包括字段名称和字段类型。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | common-options | | 否 | - | 源插件通用参数 | | format | String | 否 | json | 数据格式 | ### topic [String] 当表用作源时要读取数据的主题名称。它也支持通过分号分隔的主题列表,如 'topic-1;topic-2'。 **注意,只能为源指定 "topic-pattern" 和 "topic" 中的一个。** ### topic-pattern [String] 主题名称模式的正则表达式。当作业开始运行时,所有名称与指定正则表达式匹配的主题都将被消费者订阅。 **注意,只能为源指定 "topic-pattern" 和 "topic" 中的一个。** ### topic-discovery.interval [Long] Pulsar 源发现新主题分区的间隔(毫秒)。非正值禁用主题分区发现。 **注意,此选项仅在使用 'topic-pattern' 选项时有效。** ### subscription.name [String] 为此消费者指定订阅名称。构造消费者时需要此参数。 ### client.service-url [String] Pulsar 服务的服务 URL 提供程序。要使用客户端库连接到 Pulsar,需要指定 Pulsar 协议 URL。 例如,`localhost`: `pulsar://localhost:6650,localhost:6651`。 ### admin.service-url [String] Pulsar 服务管理端点的 HTTP URL。 例如,`http://my-broker.example.com:8080`,或 `https://my-broker.example.com:8443`(用于 TLS)。 ### auth.plugin-class [String] 认证插件的名称。 ### auth.params [String] 认证插件的参数。 例如,`key1:val1,key2:val2` ### poll.timeout [Integer] 
获取记录时的最大等待时间(毫秒)。更长的时间会增加吞吐量但也会增加延迟。 ### poll.interval [Long] 获取记录时的间隔时间(毫秒)。更短的时间会增加吞吐量,但也会增加 CPU 负载。 ### poll.batch.size [Integer] 轮询时要获取的最大记录数。更长的时间会增加吞吐量但也会增加延迟。 ### cursor.startup.mode [Enum] Pulsar 消费者的启动模式,有效值为 `'EARLIEST'`、`'LATEST'`、`'SUBSCRIPTION'`、`'TIMESTAMP'`。 ### cursor.startup.timestamp [Long] 从指定的纪元时间戳(毫秒)开始。 **注意,当 "cursor.startup.mode" 选项使用 `'TIMESTAMP'` 时,此选项是必需的。** ### cursor.reset.mode [Enum] Pulsar 消费者的游标重置策略,有效值为 `'EARLIEST'`、`'LATEST'`。 **注意,此选项仅在 "cursor.startup.mode" 选项使用 `'SUBSCRIPTION'` 时有效。** ### cursor.stop.mode [String] Pulsar 消费者的停止模式,有效值为 `'NEVER'`、`'LATEST'` 和 `'TIMESTAMP'`。 **注意,当指定 `'NEVER'` 时,这是一个实时作业,其他模式是离线作业。** ### cursor.stop.timestamp [Long] 从指定的纪元时间戳(毫秒)停止。 **注意,当 "cursor.stop.mode" 选项使用 `'TIMESTAMP'` 时,此选项是必需的。** ### schema [Config] 数据的结构,包括字段名称和字段类型。参考 [Schema-Feature](../../introduction/concepts/schema-feature.md) ## format [String] 数据格式。默认格式是 json,参考 [formats](../formats)。 ### 通用选项 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 ## 示例 ``` source { Pulsar { topic = "example" subscription.name = "seatunnel" client.service-url = "pulsar://localhost:6650" admin.service-url = "http://my-broker.example.com:8080" plugin_output = "test" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Qdrant.md ================================================ import ChangeLog from '../changelog/connector-qdrant.md'; # Qdrant > Qdrant 数据源连接器 [Qdrant](https://qdrant.tech/) 是一个高性能的向量搜索引擎和向量数据库。 该连接器可用于从 Qdrant 集合中读取数据。 ## 选项 | 名称 | 类型 | 必填 | 默认值 | |-----------------|--------|----|-----------| | collection_name | string | 是 | - | | schema | config | 是 | - | | host | string | 否 | localhost | | port | int | 否 | 6334 | | api_key | string | 否 | - | | use_tls | bool | 否 | false | | common-options | | 否 | - | ### collection_name [string] 要从中读取数据的 Qdrant 集合的名称。 ### schema [config] 要将数据读取到的表的模式。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 例如: ```hocon schema = { 
fields { age = int address = string some_vector = float_vector } } ``` Qdrant 中的每个条目称为一个点。 `float_vector` 类型的列从每个点的向量中读取,其他列从与该点关联的 JSON 有效负载中读取。 如果列被标记为主键,Qdrant 点的 ID 将写入其中。它可以是 `"string"` 或 `"int"` 类型。因为 Qdrant 仅[允许](https://qdrant.tech/documentation/concepts/points/#point-ids)使用正整数和 UUID 作为点 ID。 如果集合是用单个默认/未命名向量创建的,请使用 `default_vector` 作为向量名称。 ```hocon schema = { fields { age = int address = string default_vector = float_vector } } ``` Qdrant 中点的 ID 将写入标记为主键的列中。它可以是 `int` 或 `string` 类型。 ### host [string] Qdrant 实例的主机名。默认为 "localhost"。 ### port [int] Qdrant 实例的 gRPC 端口。 ### api_key [string] 用于身份验证的 API 密钥(如果设置)。 ### use_tls [bool] 是否使用 TLS(SSL)连接。如果使用 Qdrant 云(https),则需要。 ### 通用选项 源插件的通用参数,请参考[源通用选项](../common-options/source-common-options.md)了解详情。**** ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Rabbitmq.md ================================================ import ChangeLog from '../changelog/connector-rabbitmq.md'; # Rabbitmq > Rabbitmq 源连接器 ## 描述 用于从 Rabbitmq 读取数据。 ## 关键特性 - [ ] [批](../../introduction/concepts/connector-v2-features.md) - [x] [流](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) :::tip 为了实现精确一次,源必须是非并行的(并行度设置为 1)。这个限制主要是由于 RabbitMQ 从单个队列向多个消费者分派消息的方式。 ::: ## 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |----------------------------|---------|----|-------|-----------------------------------------------------------------------------| | host | string | 是 | - | 连接的默认主机 | | port | int | 是 | - | 连接的默认端口 | | virtual_host | string | 是 | - | 虚拟主机 – 连接到代理时使用的虚拟主机 | | username | string | 是 | - | 连接到代理时使用的 AMQP 用户名 | | password | string | 是 | - | 连接到代理时使用的密码 | | queue_name | string | 是 | - | 要发布消息的队列 | | schema | config | 是 | - | 上游数据的模式。更多详情请参考 [Schema 
特性](../../introduction/concepts/schema-feature.md)。 | | url | string | 否 | - | 便捷方法,用于设置 AMQP URI 中的字段:主机、端口、用户名、密码和虚拟主机 | | routing_key | string | 否 | - | 要发布消息的路由密钥 | | exchange | string | 否 | - | 要发布消息的交换机 | | network_recovery_interval | int | 否 | - | 自动恢复在尝试重新连接之前等待多长时间(毫秒) | | topology_recovery_enabled | boolean | 否 | - | 如果为 true,启用拓扑恢复 | | automatic_recovery_enabled | boolean | 否 | - | 如果为 true,启用连接恢复 | | connection_timeout | int | 否 | - | 连接 tcp 建立超时(毫秒);零表示无限 | | requested_channel_max | int | 否 | - | 最初请求的最大通道数;零表示无限制。**注意:值必须在 0 到 65535 之间(AMQP 0-9-1 中的无符号短整数)。 | | requested_frame_max | int | 否 | - | 请求的最大帧大小 | | requested_heartbeat | int | 否 | - | 设置请求的心跳超时。**注意:值必须在 0 到 65535 之间(AMQP 0-9-1 中的无符号短整数)。 | | prefetch_count | int | 否 | - | 预取计数,无需确认即可接收的最大消息数 | | delivery_timeout | long | 否 | - | 交付超时,等待下一条消息交付的最大时间(毫秒) | | durable | boolean | 否 | true | 队列是否在服务器重启时保留 | | exclusive | boolean | 否 | false | 队列是否仅由当前连接使用 | | auto_delete | boolean | 否 | false | 队列是否在最后一个消费者取消订阅时自动删除 | | common-options | | 否 | - | 源插件通用参数 | ### host [string] 连接的默认主机 ### port [int] 连接的默认端口 ### virtual_host [string] 虚拟主机 – 连接到代理时使用的虚拟主机 ### username [string] 连接到代理时使用的 AMQP 用户名 ### password [string] 连接到代理时使用的密码 ### url [string] 便捷方法,用于设置 AMQP URI 中的字段:主机、端口、用户名、密码和虚拟主机 ### queue_name [string] 要发布消息的队列 ### routing_key [string] 要发布消息的路由密钥 ### exchange [string] 要发布消息的交换机 ### schema [Config] #### fields [Config] 上游数据的模式字段。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 ### network_recovery_interval [int] 自动恢复在尝试重新连接之前等待多长时间(毫秒) ### topology_recovery_enabled [string] 如果为 true,启用拓扑恢复 ### automatic_recovery_enabled [string] 如果为 true,启用连接恢复 ### connection_timeout [int] 连接 tcp 建立超时(毫秒);零表示无限 ### requested_channel_max [int] 最初请求的最大通道数;零表示无限制。**注意:值必须在 0 到 65535 之间(AMQP 0-9-1 中的无符号短整数)。 ### requested_frame_max [int] 请求的最大帧大小 ### requested_heartbeat [int] 设置请求的心跳超时。**注意:值必须在 0 到 65535 之间(AMQP 0-9-1 中的无符号短整数)。 ### prefetch_count [int] 预取计数,无需确认即可接收的最大消息数 ### delivery_timeout 
[long] 交付超时,等待下一条消息交付的最大时间(毫秒) ### common options 源插件通用参数,详情请参考 [源通用选项](../common-options/source-common-options.md)。 ### durable - true:队列将在服务器重启时保留。 - false:队列将在服务器重启时删除。 ### exclusive - true:队列仅由当前连接使用,连接关闭时将删除。 - false:队列可以由多个连接使用。 ### auto_delete - true:队列将在最后一个消费者取消订阅时自动删除。 - false:队列不会自动删除。 ## 示例 简单: ```hocon source { RabbitMQ { host = "rabbitmq-e2e" port = 5672 virtual_host = "/" username = "guest" password = "guest" queue_name = "test" schema = { fields { id = bigint c_map = "map" c_array = "array" } } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Redis.md ================================================ import ChangeLog from '../changelog/connector-redis.md'; # Redis > Redis 源连接器 ## 描述 用于从 `Redis` 读取数据 ## 主要功能 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) ## 配置选项 | 名称 | 类型 | 是否必须 | 默认值 | |---------------------|--------|--------------------|--------| | host | string | `mode=single`时必须 | - | | port | int | 否 | 6379 | | keys | string | 是 | - | | batch_size | int | 是 | 10 | | data_type | string | 是 | - | | user | string | 否 | - | | auth | string | 否 | - | | db_num | int | 否 | 0 | | mode | string | 否 | single | | hash_key_parse_mode | string | 否 | all | | nodes | list | `mode=cluster` 时必须 | - | | schema | config | `format=json` 时必须 | - | | format | string | 否 | json | | field_delimiter | string | 否 | ',' | | common-options | | 否 | - | ### host [string] redis 主机地址 ### port [int] redis 端口号 ### hash_key_parse_mode [string] 指定 hash key 解析模式, 支持 `all` `kv` 模式, 用于设定连接器如何解析 hash key。 当设定为 `all` 时,连接器会将 hash key 的值视为一行并根据 schema config 配置进行解析,当设定为 `kv` 时,连接器会将 hash 
key 的每个 kv 视为一行,并根据 schema config 进行解析。 例如,如果 hash key 的值如下设置: ```text { "001": { "name": "tyrantlucifer", "age": 26 }, "002": { "name": "Zongwen", "age": 26 } } ``` 如果 `hash_key_parse_mode` 设置为 `all` 模式,且 schema config 如下所示,将会生成下表数据: ```hocon schema { fields { 001 { name = string age = int } 002 { name = string age = int } } } ``` | 001 | 002 | | ------------------------------- | ------------------------- | | Row(name=tyrantlucifer, age=26) | Row(name=Zongwen, age=26) | 如果 `hash_key_parse_mode` 设置为 `kv` 模式,且 schema config 如下所示,将会生成下表数据: ```hocon schema { fields { hash_key = string name = string age = int } } ``` | hash_key | name | age | | -------- | ------------- | ---- | | 001 | tyrantlucifer | 26 | | 002 | Zongwen | 26 | hash key 中的每个 kv 将会被视为一行并被发送给上游。 **提示:连接器将使用 scheme config 的第一个字段信息作为每个 kv 中每个 k 的字段名称** ### keys [string] keys 模式 ### batch_size [int] 表示每次迭代尝试返回的键的数量,默认值为 10。 **提示:Redis 连接器支持模糊键匹配,用户需要确保匹配的键类型相同** ### data_type [string] redis 数据类型, 支持 `key` `hash` `list` `set` `zset`。 - key > 将每个 key 的值将作为单行数据发送给下游。 > 例如,key 对应的值为 `SeaTunnel test message`,则下游接收到的数据为 `SeaTunnel test message`,并且仅会收到一条信息。 - hash > hash 键值对将会被格式化为 json,并以单行数据的形式发送给下游。 > 例如,hash 值为 `name:tyrantlucifer age:26`,则下游接收到的数据为 `{"name":"tyrantlucifer", "age":"26"}`,并且仅会收到一条信息。 - list > list 中的每个元素都将作为单行数据向下游发送。 > 例如,list 值为 `[tyrantlucier, CalvinKirs]`,则下游接收到的数据为 `tyrantlucifer` 和 `CalvinKirs`,并且仅会收到两条信息。 - set > set 中的每个元素都将作为单行数据向下游发送。 > 例如,set 值为 `[tyrantlucier, CalvinKirs]`,则下游接收到的数据为 `tyrantlucifer` 和 `CalvinKirs`,并且仅会收到两条信息。 - zset > zset 中的每个元素都将作为单行数据向下游发送。 > 例如,zset 值为 `[tyrantlucier, CalvinKirs]`,则下游接收到的数据为 `tyrantlucifer` 和 `CalvinKirs`,并且仅会收到两条信息。 ### user [string] Redis 认证身份用户,当连接到加密集群时需要使用 ### auth [string] Redis 认证密钥,当连接到加密集群时需要使用 ### db_num [int] Redis 数据库索引 ID,默认将连接到 db 0 ### mode [string] Redis 模式,`single` 或 `cluster`,默认值为 `single` ### nodes [list] Redis 节点信息,在 cluster 模式下使用,必须设置为以下格式: ["host1:port1", "host2:port2"] ### format [string] 上游数据格式,目前仅支持 `json` `text`,默认为 
`json` 当指定格式为 `json` 时,还需要指定 scheme option,例如: 当上游数据如下时: ```json {"code": 200, "data": "get success", "success": true} ``` 需要指定 schema 为如下配置: ```hocon schema { fields { code = int data = string success = boolean } } ``` 连接器将会生成如下格式数据: | code | data | success | | ---- | ----------- | ------- | | 200 | get success | true | 当指定格式为 `text` 时,可以选择是否指定schema参数。 例如, 当上游数据如下时: ```text 200#get success#true ``` 如果不指定schema参数,连接器将按照以下方式处理上游数据: | content | | -------------------------------------------------------- | | 200#get success#true | 如果指定schema参数,此时需要同时配置`schema`和`field_delimiter`,如下所示: ```hocon field_delimiter = "#" schema { fields { code = int data = string success = boolean } } ``` 连接器将生成如下数据: | content | | -------------------------------------------------------- | | {"code": 200, "data": "get success", "success": true} | ### field_delimiter [string] 字段分隔符,用于告诉连接器如何分割字段。 目前仅当格式为text时需要配置。默认为","。 ### schema [config] #### fields [config] Redis 数据的 schema 字段列表。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 ### common options 源连接器插件通用参数,详情请参见 [Source Common Options](../common-options/source-common-options.md) ## 示例 简单使用示例: ```hocon Redis { host = localhost port = 6379 keys = "key_test*" data_type = key format = text } ``` ```hocon Redis { host = localhost port = 6379 keys = "key_test*" data_type = key format = json schema { fields { name = string age = int } } } ``` 读取 string 类型并附加到 list 示例: ```hocon source { Redis { host = "redis-e2e" port = 6379 auth = "U2VhVHVubmVs" keys = "string_test*" data_type = string batch_size = 33 } } sink { Redis { host = "redis-e2e" port = 6379 auth = "U2VhVHVubmVs" key = "string_test_list" data_type = list batch_size = 33 } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Redshift.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Redshift > JDBC Redshift 源连接器 ## 描述 通过 JDBC 读取外部数据源数据。 ## 支持这些引擎 > Spark
    > Flink
    > Seatunnel Zeta
    ### 对于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 ### 对于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) > 支持查询 SQL 并可以实现投影效果。 ## 支持的数据源列表 | 数据源 | 支持的版本 | 驱动 | 连接串 | Maven | |--------|-----------|------|--------|-------| | redshift | 不同的依赖版本有不同的驱动类 | com.amazon.redshift.jdbc.Driver | jdbc:redshift://localhost:5439/database | [下载](https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42) | ## 数据库依赖 > 请下载对应 'Maven' 的支持列表,并将其复制到 '$SEATUNNEL_HOME/plugins/jdbc/lib/' 工作目录
    > 例如 Redshift 数据源:cp RedshiftJDBC42-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## 数据类型映射 | Redshift 数据类型 | SeaTunnel 数据类型 | |------------------|------------------| | SMALLINT
    INT2 | SHORT | | INTEGER
    INT
    INT4 | INT | | BIGINT
    INT8
    OID | LONG | | DECIMAL
    NUMERIC | DECIMAL | | REAL
    FLOAT4 | FLOAT | | DOUBLE_PRECISION
    FLOAT8
    FLOAT | DOUBLE | | BOOLEAN
    BOOL | BOOLEAN | | CHAR
    CHARACTER
    NCHAR
    BPCHAR
    VARCHAR
    CHARACTER_VARYING
    NVARCHAR
    TEXT
    SUPER | STRING | | VARBYTE
    BINARY_VARYING | BYTES | | TIME
    TIME_WITH_TIME_ZONE
    TIMETZ | LOCALTIME | | TIMESTAMP
    TIMESTAMP_WITH_OUT_TIME_ZONE
    TIMESTAMPTZ | LOCALDATETIME | ## 示例 ### 简单 > 此示例在单个并行中查询您的测试"数据库"中的 type_bin 表的 16 条数据,并查询其所有字段。您也可以指定要查询的字段以最终输出到控制台。 ``` env { parallelism = 2 job.mode = "BATCH" } source{ Jdbc { url = "jdbc:redshift://localhost:5439/dev" driver = "com.amazon.redshift.jdbc.Driver" username = "root" password = "123456" table_path = "public.table2" # 使用查询过滤行和列 query = "select id, name from public.table2 where id > 100" #split.size = 8096 #split.even-distribution.factor.upper-bound = 100 #split.even-distribution.factor.lower-bound = 0.05 #split.sample-sharding.threshold = 1000 #split.inverse-sampling.rate = 1000 } } sink { Console {} } ``` ### 多表读取 ***配置 `table_list` 将打开自动分割,您可以配置 `split.*` 来调整分割策略*** ```hocon env { job.mode = "BATCH" parallelism = 2 } source { Jdbc { url = "jdbc:redshift://localhost:5439/dev" driver = "com.amazon.redshift.jdbc.Driver" username = "root" password = "123456" table_list = [ { table_path = "public.table1" }, { table_path = "public.table2" # 使用查询过滤行和列 query = "select id, name from public.table2 where id > 100" } ] #split.size = 8096 #split.even-distribution.factor.upper-bound = 100 #split.even-distribution.factor.lower-bound = 0.05 #split.sample-sharding.threshold = 1000 #split.inverse-sampling.rate = 1000 } } sink { Console {} } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/RocketMQ.md ================================================ import ChangeLog from '../changelog/connector-rocketmq.md'; # RocketMQ > RocketMQ 源连接器 ## 支持的 Apache RocketMQ 版本 - 4.9.0(或更新版本,供参考) ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [x] [流](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 描述 Apache RocketMQ 的源连接器。 ## 源选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |-------------------------------------|---------|----|----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------| | topics | String | 是 | - | RocketMQ 主题名称。如果有多个主题,使用 `,` 分隔,例如:`"tpc1,tpc2"`。 | | name.srv.addr | String | 是 | - | RocketMQ 名称服务器集群地址。 | | tags | String | 否 | - | RocketMQ 标签名称。如果有多个标签,使用 `,` 分隔,例如:`"tag1,tag2"`。 | | acl.enabled | Boolean | 否 | false | 如果为 true,启用访问控制,需要配置访问密钥和秘密密钥。 | | access.key | String | 否 | | 访问密钥 | | secret.key | String | 否 | | 当 ACL_ENABLED 为 true 时,秘密密钥不能为空。 | | batch.size | int | 否 | 100 | RocketMQ 消费者拉取批大小 | | consumer.group | String | 否 | SeaTunnel-Consumer-Group | RocketMQ 消费者组 ID,用于区分不同的消费者组。 | | commit.on.checkpoint | Boolean | 否 | true | 如果为 true,消费者的偏移量将在后台定期提交。 | | schema | | 否 | - | 数据的结构,包括字段名称和字段类型。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | format | String | 否 | json | 数据格式。默认格式是 json。可选 text 格式。默认字段分隔符是 ","。如果自定义分隔符,添加 "field.delimiter" 选项。 | | field.delimiter | String | 否 | , | 自定义数据格式的字段分隔符 | | start.mode | String | 否 | CONSUME_FROM_GROUP_OFFSETS | 消费者的初始消费模式,有几种类型:[CONSUME_FROM_LAST_OFFSET],[CONSUME_FROM_FIRST_OFFSET],[CONSUME_FROM_GROUP_OFFSETS],[CONSUME_FROM_TIMESTAMP],[CONSUME_FROM_SPECIFIC_OFFSETS] | | start.mode.offsets | | 否 | | 消费模式为 "CONSUME_FROM_SPECIFIC_OFFSETS" 所需的偏移量 | | start.mode.timestamp | Long | 否 | | 消费模式为 "CONSUME_FROM_TIMESTAMP" 所需的时间。 | | 
partition.discovery.interval.millis | long | 否 | -1 | 动态发现主题和分区的间隔。 | | ignore_parse_errors | Boolean | 否 | false | 可选标志,跳过解析错误而不是失败。 | | common-options | config | 否 | - | 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 | ### start.mode.offsets 消费模式为 "CONSUME_FROM_SPECIFIC_OFFSETS" 所需的偏移量。 例如: ```hocon start.mode.offsets = { topic1-0 = 70 topic1-1 = 10 topic1-2 = 10 } ``` ## 任务示例 ### 简单 > 消费者读取 Rocketmq 数据并将其打印到控制台 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Rocketmq { name.srv.addr = "rocketmq-e2e:9876" topics = "test_topic_json" plugin_output = "rocketmq_table" schema = { fields { id = bigint c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(2, 1)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } transform { # 如果您想了解有关如何配置 seatunnel 的更多信息并查看完整的转换插件列表, # 请访问 https://seatunnel.apache.org/docs/category/transform } sink { Console { } } ``` ### 指定格式消费简单 > 当我以 json 格式消费主题数据并解析,每次拉取的条数是 400,消费从原始位置开始 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Rocketmq { name.srv.addr = "localhost:9876" topics = "test_topic" plugin_output = "rocketmq_table" start.mode = "CONSUME_FROM_FIRST_OFFSET" batch.size = "400" consumer.group = "test_topic_group" format = json schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } transform { # 如果您想了解有关如何配置 seatunnel 的更多信息并查看完整的转换插件列表, # 请访问 https://seatunnel.apache.org/docs/category/transform } sink { Console { } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/S3File.md ================================================ import ChangeLog from 
'../changelog/connector-file-s3.md'; # S3File > S3文件数据源连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) 在一次pollNext调用中读取分片中的所有数据。将读取的分片保存在快照中。 - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户定义的分片](../../introduction/concepts/connector-v2-features.md) - [x] 文件格式类型 - [x] text - [x] csv - [x] parquet - [x] orc - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## 描述 从aws s3文件系统读取数据。 ## 支持的数据源信息 | 数据源 | 支持的版本 | |------------|--------------------| | S3 | current | ## 依赖 > 如果您使用spark/flink,为了使用此连接器,您必须确保您的spark/flink集群已经集成了hadoop。测试过的hadoop版本是2.x。
    > > 如果您使用SeaTunnel Zeta,它在您下载和安装SeaTunnel Zeta时会自动集成hadoop jar。您可以检查${SEATUNNEL_HOME}/lib下的jar包来确认这一点。
    > 要使用此连接器,您需要将hadoop-aws-3.1.4.jar和aws-java-sdk-bundle-1.12.692.jar放在${SEATUNNEL_HOME}/lib目录中。 ## 数据类型映射 数据类型映射与正在读取的文件类型相关,我们支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` ### JSON文件类型 如果您将文件类型指定为`json`,您还应该指定schema选项来告诉连接器如何将数据解析为您想要的行。 例如: 上游数据如下: ```json {"code": 200, "data": "get success", "success": true} ``` 您也可以在一个文件中保存多条数据,并用换行符分隔: ```json lines {"code": 200, "data": "get success", "success": true} {"code": 300, "data": "get failed", "success": false} ``` 您应该按如下方式指定schema: ```hocon schema { fields { code = int data = string success = boolean } } ``` 连接器将生成如下数据: | code | data | success | |------|-------------|---------| | 200 | get success | true | ### 文本或CSV文件类型 如果您将`file_format_type`设置为`text`、`excel`、`csv`、`xml`。那么需要设置`schema`字段来告诉连接器如何将数据解析为行。 如果您设置了`schema`字段,您还应该设置选项`field_delimiter`,除非`file_format_type`是`csv`、`xml`、`excel` 您可以按如下方式设置schema和分隔符: ```hocon field_delimiter = "#" schema { fields { name = string age = int gender = string } } ``` 连接器将生成如下数据: | name | age | gender | |---------------|-----|--------| | tyrantlucifer | 26 | male | ### Orc文件类型 如果您将文件类型指定为`parquet` `orc`,则不需要schema选项,连接器可以自动找到上游数据的schema。 | Orc数据类型 | SeaTunnel数据类型 | |----------------------------------|-------------------------------| | BOOLEAN | BOOLEAN | | INT | INT | | BYTE | BYTE | | SHORT | SHORT | | LONG | LONG | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | BINARY | BINARY | | STRING
    VARCHAR
    CHAR
    | STRING | | DATE | LOCAL_DATE_TYPE | | TIMESTAMP | LOCAL_DATE_TIME_TYPE | | DECIMAL | DECIMAL | | LIST(STRING) | STRING_ARRAY_TYPE | | LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE | | LIST(TINYINT) | BYTE_ARRAY_TYPE | | LIST(SMALLINT) | SHORT_ARRAY_TYPE | | LIST(INT) | INT_ARRAY_TYPE | | LIST(BIGINT) | LONG_ARRAY_TYPE | | LIST(FLOAT) | FLOAT_ARRAY_TYPE | | LIST(DOUBLE) | DOUBLE_ARRAY_TYPE | | Map | MapType,K和V的类型将转换为SeaTunnel类型 | | STRUCT | SeaTunnelRowType | ### Parquet文件类型 如果您将文件类型指定为`parquet` `orc`,则不需要schema选项,连接器可以自动找到上游数据的schema。 | Parquet数据类型 | SeaTunnel数据类型 | |----------------------|-------------------------------| | INT_8 | BYTE | | INT_16 | SHORT | | DATE | DATE | | TIMESTAMP_MILLIS | TIMESTAMP | | INT64 | LONG | | INT96 | TIMESTAMP | | BINARY | BYTES | | FLOAT | FLOAT | | DOUBLE | DOUBLE | | BOOLEAN | BOOLEAN | | FIXED_LEN_BYTE_ARRAY | TIMESTAMP
    DECIMAL | | DECIMAL | DECIMAL | | LIST(STRING) | STRING_ARRAY_TYPE | | LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE | | LIST(TINYINT) | BYTE_ARRAY_TYPE | | LIST(SMALLINT) | SHORT_ARRAY_TYPE | | LIST(INT) | INT_ARRAY_TYPE | | LIST(BIGINT) | LONG_ARRAY_TYPE | | LIST(FLOAT) | FLOAT_ARRAY_TYPE | | LIST(DOUBLE) | DOUBLE_ARRAY_TYPE | | Map | MapType,K和V的类型将转换为SeaTunnel类型 | | STRUCT | SeaTunnelRowType | ## 选项 | 名称 | 类型 | 是否必需 | 默认值 | 描述 | |---------------------------------|---------|------|-------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | path | string | 是 | - | 需要读取的s3路径,可以有子路径,但子路径需要满足一定的格式要求。具体要求可以参考"parse_partition_from_path"选项 | | file_format_type | string | 是 | - | 文件类型,支持以下文件类型:`text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` | | bucket | string | 是 | - | s3文件系统的bucket地址,例如:`s3n://seatunnel-test`,如果您使用`s3a`协议,此参数应为`s3a://seatunnel-test`。 | | fs.s3a.endpoint | string | 是 | - | fs s3a端点 | | fs.s3a.aws.credentials.provider | string | 是 | com.amazonaws.auth.InstanceProfileCredentialsProvider | s3a的认证方式。我们目前只支持`org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider`和`com.amazonaws.auth.InstanceProfileCredentialsProvider`。有关凭据提供程序的更多信息,您可以查看[Hadoop AWS文档](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Simple_name.2Fsecret_credentials_with_SimpleAWSCredentialsProvider.2A) | | read_columns | list | 否 | - | 数据源的读取列列表,用户可以使用它来实现字段投影。支持列投影的文件类型如下所示:`text` `csv` `parquet` `orc` `json` `excel` `xml`。如果用户想在读取`text` `json` `csv`文件时使用此功能,必须配置"schema"选项。 | | access_key | string | 否 | - | 仅在`fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider`时使用 | | secret_key | string | 否 | - | 
仅在`fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider`时使用 | | hadoop_s3_properties | map | 否 | - | 如果您需要添加其他选项,可以在此处添加并参考此[链接](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) | | delimiter/field_delimiter | string | 否 | \001 | 字段分隔符,用于告诉连接器在读取文本文件时如何切分字段。默认`\001`,与hive的默认分隔符相同。 | | row_delimiter | string | 否 | \n | 行分隔符,用于告诉连接器在读取文本文件时如何切分行。默认`\n`。 | | | parse_partition_from_path | boolean | 否 | true | 控制是否从文件路径解析分区键和值。例如,如果您从路径`s3n://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`读取文件。文件中的每条记录数据都将添加这两个字段:name="tyrantlucifer",age=26 | | date_format | string | 否 | yyyy-MM-dd | 日期类型格式,用于告诉连接器如何将字符串转换为日期,支持以下格式:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`。默认`yyyy-MM-dd` | | datetime_format | string | 否 | yyyy-MM-dd HH:mm:ss | 日期时间类型格式,用于告诉连接器如何将字符串转换为日期时间,支持以下格式:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` | | time_format | string | 否 | HH:mm:ss | 时间类型格式,用于告诉连接器如何将字符串转换为时间,支持以下格式:`HH:mm:ss` `HH:mm:ss.SSS` | | skip_header_row_number | long | 否 | 0 | 跳过前几行,但仅适用于txt和csv。例如,设置如下:`skip_header_row_number = 2`。然后SeaTunnel将跳过源文件的前2行 | | csv_use_header_line | boolean | 否 | false | 是否使用标题行来解析文件,仅在file_format为`csv`且文件包含符合RFC 4180的标题行时使用 | | schema | config | 否 | - | 上游数据的schema。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | | sheet_name | string | 否 | - | 读取工作簿的工作表,仅在file_format为excel时使用。 | | xml_row_tag | string | 否 | - | 指定XML文件中数据行的标签名称,仅对XML文件有效。 | | xml_use_attr_format | boolean | 否 | - | 指定是否使用标签属性格式处理数据,仅对XML文件有效。 | | compress_codec | string | 否 | none | | | archive_compress_codec | string | 否 | none | | | enable_file_split | boolean | 否 | false | 开启大文件拆分以提升并行度。仅支持 `text`/`csv`/`json`/`parquet` 且非压缩格式(`compress_codec=none` 且 `archive_compress_codec=none`)。 | | file_split_size | long | 否 | 134217728 | `enable_file_split=true` 时生效,单位字节。`text`/`csv`/`json` 按 `file_split_size` 拆分并对齐到下一个 `row_delimiter`;`parquet` 以 RowGroup 为拆分单位,不会切开 
RowGroup。 | | encoding | string | 否 | UTF-8 | | | null_format | string | 否 | - | 仅在file_format_type为text时使用。null_format用于定义哪些字符串可以表示为null。例如:`\N` | | binary_chunk_size | int | 否 | 1024 | 仅在file_format_type为binary时使用。读取二进制文件的块大小(以字节为单位)。默认为1024字节。较大的值可能会提高大文件的性能,但会使用更多内存。 | | binary_complete_file_mode | boolean | 否 | false | 仅在file_format_type为binary时使用。是否将完整文件作为单个块读取,而不是分割成块。启用时,整个文件内容将一次性读入内存。默认为false。 | | file_filter_pattern | string | 否 | | 过滤模式,用于过滤文件。 | | filename_extension | string | 否 | - | 过滤文件名扩展名,用于过滤具有特定扩展名的文件。例如:`csv` `.txt` `json` `.xml`。 | | common-options | | 否 | - | 数据源插件通用参数,请参考[数据源通用选项](../common-options/source-common-options.md)了解详情。 | | quote_char | string | 否 | " | 用于包裹 CSV 字段的单字符,可保证包含逗号、换行符或引号的字段被正确解析。 | | escape_char | string | 否 | - | 用于在 CSV 字段内转义引号或其他特殊字符,使其不会结束字段。 | | metalake_type | string | 否 | gravitino | Metalake 服务类型,目前支持 `gravitino`。 | ### delimiter/field_delimiter [string] **delimiter**参数将在2.3.5版本后弃用,请使用**field_delimiter**代替。 ### row_delimiter [string] 仅在 file_format 为 text 时需要配置。 行分隔符,用于告诉连接器如何分割行。 默认 `\n`。 ### quote_char [string] 用于包裹 CSV 字段的单字符,可保证包含逗号、换行符或引号的字段被正确解析。 ### escape_char [string] 用于在 CSV 字段内转义引号或其他特殊字符,使其不会结束字段。 ### file_filter_pattern [string] 文件过滤模式,用于过滤文件。若只想根据文件名称筛选,则直接写文件名称的正则;若同时想根据文件目录进行过滤,则表达式以`path`起始。 该模式遵循标准正则表达式。详情请参考 https://en.wikipedia.org/wiki/Regular_expression。 以下是一些示例。 若`path`为`/data/seatunnel`,且文件结构示例: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv /data/seatunnel/20241012/logo.png ``` 匹配规则示例: **示例1**:*匹配所有.txt文件*,正则表达式: ``` .*.txt ``` 此示例匹配的结果是: ``` /data/seatunnel/20241001/report.txt ``` **示例2**:*匹配所有以abc开头的文件*,正则表达式: ``` abc.* ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **示例3**:*匹配20241007文件夹下所有以 abc 开头的文件,且第四个字符为 h 或 g*,正则表达式: ``` /data/seatunnel/20241007/abc[h,g].* ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv 
``` **示例4**:*匹配以202410开头的第三级文件夹和以.csv结尾的文件*,正则表达式: ``` /data/seatunnel/202410\d*/.*.csv ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### enable_file_split [boolean] 开启大文件拆分功能,默认 false。仅支持 `csv`/`text`/`json`/`parquet` 且非压缩格式(`compress_codec=none` 且 `archive_compress_codec=none`)。 - `text`/`csv`/`json`:按 `file_split_size` 拆分并对齐到下一个 `row_delimiter`,避免切开一行/一条记录。 - `parquet`:以 RowGroup 为逻辑拆分单位,不会切开 RowGroup。 **使用建议** - 适合:读取少量大文件,并希望通过更高并行度提升吞吐。 - 不建议:读取大量小文件,或并行度较低的场景(拆分会带来额外的枚举/调度开销)。 **限制说明** - 不支持压缩文件(`compress_codec` != `none`)或归档文件(`archive_compress_codec` != `none`),会自动回退为不拆分,并打印 WARN 日志提示。 - 对于 `text`/`csv`/`json`,实际 split 的大小可能略大于 `file_split_size`(因为需要对齐到下一个 `row_delimiter`)。 - 对于 `json`,仅支持 JSON Lines(每行一个 JSON 对象)的切分读取。 - 启用切分后,数据全局顺序不保证(split 可能并行处理导致输出顺序交错)。如需严格有序,请设置 `parallelism=1` 或关闭切分。 ### file_split_size [long] `enable_file_split=true` 时生效,单位字节。默认 128MB(134217728)。 **调优建议** - 建议从默认值(128MB)开始:如果并行度未充分利用可适当调小;如果 split 数量过多可适当调大。 - 经验公式:`file_split_size ≈ file_size / 期望并行度`。 ### compress_codec [string] 文件的压缩编解码器,支持的详细信息如下所示: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: 自动识别压缩类型,无需额外设置。 ### archive_compress_codec [string] 归档文件的压缩编解码器,支持的详细信息如下所示: | archive_compress_codec | file_format | archive_compress_suffix | |------------------------|------------|-------------------------| | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | | GZ | txt,json,excel,xml | .gz | | NONE | all | .* | 注意:gz压缩的excel文件需要压缩原始文件或指定文件后缀,例如e2e.xls ->e2e_test.xls.gz ### encoding [string] 仅在file_format_type为json、text、csv、xml时使用。 要读取的文件的编码。此参数将由`Charset.forName(encoding)`解析。 ### binary_chunk_size [int] 仅在file_format_type为binary时使用。 读取二进制文件的块大小(以字节为单位)。默认为1024字节。较大的值可能会提高大文件的性能,但会使用更多内存。 ### binary_complete_file_mode [boolean] 仅在file_format_type为binary时使用。 
是否将完整文件作为单个块读取,而不是分割成块。启用时,整个文件内容将一次性读入内存。默认为false。 ### file_format_type [string] 文件类型,支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` 如果您将文件类型指定为 `markdown`,SeaTunnel 可以解析 markdown 文件并提取结构化数据。 markdown 解析器提取各种元素,包括标题、段落、列表、代码块、表格等。 每个元素都转换为具有以下架构的行: - `element_id`:元素的唯一标识符 - `element_type`:元素类型(Heading、Paragraph、ListItem 等) - `heading_level`:标题级别(1-6,非标题元素为 null) - `text`:元素的文本内容 - `page_number`:页码(默认:1) - `position_index`:文档中的位置索引 - `parent_id`:父元素的 ID - `child_ids`:子元素 ID 的逗号分隔列表 注意:Markdown 格式仅支持读取,不支持写入。 ### schema [config] 仅在文件格式类型为 text、json、excel、xml 或 csv(或其他无法从元数据中读取 schema 的格式)时需要配置。 上游数据的 schema 信息。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 #### schema_url [string] 通过 restApi 获取元数据信息的 http url,例如:`http://localhost:8090/api/metalakes/laowang_test/catalogs/221-pgsql/schemas/ykw/tables/all_type` > 当使用 Gravitino 作为元数据源时,Gravitino 的列类型会自动转换为 SeaTunnel 数据类型。详细的类型映射信息请参考 [Gravitino 类型映射](../../introduction/concepts/gravitino-type-mapping.md)。 ### metalake_type [string] Metalake 服务类型,目前仅支持 `gravitino`。当使用 `schema_url` 从 Gravitino 获取元数据时,可以指定此参数(默认为 `gravitino`)。 有关 Metalake 的更多信息,请参考 [Metalake](../../introduction/concepts/metalake.md)。 ## 示例 1. 在此示例中,我们从s3路径`s3a://seatunnel-test/seatunnel/text`读取数据,此路径中的文件类型是orc。 我们使用`org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider`进行身份验证,因此需要`access_key`和`secret_key`。 文件中的所有列都将被读取并发送到接收器。 ``` # 定义运行时环境 env { parallelism = 1 job.mode = "BATCH" } source { S3File { path = "/seatunnel/text" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" bucket = "s3a://seatunnel-test" file_format_type = "orc" } } transform { # 如果您想获取有关如何配置seatunnel和查看转换插件完整列表的更多信息, # 请访问 https://seatunnel.apache.org/docs/transforms } sink { Console {} } ``` 2. 
使用`InstanceProfileCredentialsProvider`进行身份验证 S3中的文件类型是json,因此需要配置schema选项。 ```hocon S3File { path = "/seatunnel/json" bucket = "s3a://seatunnel-test" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" file_format_type = "json" schema { fields { id = int name = string } } } ``` 3. 使用`InstanceProfileCredentialsProvider`进行身份验证 S3中的文件类型是json,有五个字段(`id`、`name`、`age`、`sex`、`type`),因此需要配置schema选项。 在此作业中,我们只需要将`id`和`name`列发送到mysql。 ``` # 定义运行时环境 env { parallelism = 1 job.mode = "BATCH" } source { S3File { path = "/seatunnel/json" bucket = "s3a://seatunnel-test" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" file_format_type = "json" read_columns = ["id", "name"] schema { fields { id = int name = string age = int sex = int type = string } } } } transform { # 如果您想获取有关如何配置seatunnel和查看转换插件完整列表的更多信息, # 请访问 https://seatunnel.apache.org/docs/transforms } sink { Console {} } ``` ### 过滤文件 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { S3File { path = "/seatunnel/json" bucket = "s3a://seatunnel-test" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" file_format_type = "json" read_columns = ["id", "name"] // 文件示例 abcD2024.csv file_filter_pattern = "abc[DX]*.*" } } sink { Console { } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/SftpFile.md ================================================ import ChangeLog from '../changelog/connector-file-sftp.md'; # SftpFile > Sftp文件数据源连接器 ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [x] [多模态](../../introduction/concepts/connector-v2-features.md#多模态multimodal) 使用二进制文件格式读取和写入任何格式的文件,例如视频、图片等。简而言之,任何文件都可以同步到目标位置。 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户定义的分片](../../introduction/concepts/connector-v2-features.md) - [x] 文件格式类型 - [x] text - [x] csv - [x] json - [x] excel - [x] xml - [x] binary - [x] markdown ## 描述 从sftp文件服务器读取数据。 ## 支持的数据源信息 为了使用SftpFile连接器,需要以下依赖项。 可以通过install-plugin.sh或从Maven中央仓库下载。 | 数据源 | 支持的版本 | 依赖 | |------------|--------------------|-----------------------------------------------------------------------------------------| | SftpFile | universal | [下载](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-file-sftp) | :::tip 如果您使用spark/flink,为了使用此连接器,您必须确保您的spark/flink集群已经集成了hadoop。测试过的hadoop版本是2.x。 如果您使用SeaTunnel引擎,它在您下载和安装SeaTunnel引擎时会自动集成hadoop jar。您可以检查${SEATUNNEL_HOME}/lib下的jar包来确认这一点。 为了支持更多文件类型,我们做了一些权衡,因此我们使用HDFS协议进行内部访问Sftp,此连接器需要一些hadoop依赖项。 它只支持hadoop版本**2.9.X+**。 ::: ## 数据类型映射 文件没有特定的类型列表,我们可以通过在配置中指定Schema来指示相应的数据需要转换为哪种SeaTunnel数据类型。 | SeaTunnel数据类型 | |---------------------| | STRING | | SHORT | | INT | | BIGINT | | BOOLEAN | | DOUBLE | | DECIMAL | | FLOAT | | DATE | | TIME | | TIMESTAMP | | BYTES | | ARRAY | | MAP | ## 数据源选项 | 名称 | 类型 | 是否必需 | 默认值 | 描述 | |----------------------------|---------|------|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | host | String | 是 | - | 目标sftp主机是必需的 | | port | Int | 是 | - | 目标sftp端口是必需的 | | user | String | 是 | - | 目标sftp用户名是必需的 | | 
password | String | 是 | - | 目标sftp密码是必需的 | | path | String | 是 | - | 源文件路径。 | | file_format_type | String | 是 | - | 请查看下面的#file_format_type | | file_filter_pattern | String | 否 | - | 过滤模式,用于过滤文件。 | | filename_extension | string | 否 | - | 过滤文件名扩展名,用于过滤具有特定扩展名的文件。例如:`csv` `.txt` `json` `.xml`。 | | delimiter/field_delimiter | String | 否 | \001 | **delimiter**参数将在2.3.5版本后弃用,请使用**field_delimiter**代替。
    字段分隔符,用于告诉连接器在读取文本文件时如何切分字段。
    默认`\001`,与hive的默认分隔符相同 | | row_delimiter | string | 否 | \n | 行分隔符,用于告诉连接器在读取文本文件时如何切分行。
    默认`\n`。 | | parse_partition_from_path | Boolean | 否 | true | 控制是否从文件路径解析分区键和值
    例如,如果您从路径`oss://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`读取文件
    文件中的每条记录数据都将添加这两个字段:
    name age
    tyrantlucifer 26
    提示:**不要在schema选项中定义分区字段** | | date_format | String | 否 | yyyy-MM-dd | 日期类型格式,用于告诉连接器如何将字符串转换为日期,支持以下格式:
    `yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`
    默认`yyyy-MM-dd` | | datetime_format | String | 否 | yyyy-MM-dd HH:mm:ss | 日期时间类型格式,用于告诉连接器如何将字符串转换为日期时间,支持以下格式:
    `yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss`
    默认`yyyy-MM-dd HH:mm:ss` | | time_format | String | 否 | HH:mm:ss | 时间类型格式,用于告诉连接器如何将字符串转换为时间,支持以下格式:
    `HH:mm:ss` `HH:mm:ss.SSS`
    默认`HH:mm:ss` | | skip_header_row_number | Long | 否 | 0 | 跳过前几行,但仅适用于txt和csv。
    例如,设置如下:
    `skip_header_row_number = 2`
    然后SeaTunnel将跳过源文件的前2行 | | read_columns | list | 否 | - | 数据源的读取列列表,用户可以使用它来实现字段投影。 | | sheet_name | String | 否 | - | 读取工作簿的工作表,仅在file_format为excel时使用。 | | xml_row_tag | string | 否 | - | 指定XML文件中数据行的标签名称,仅在file_format为xml时使用。 | | xml_use_attr_format | boolean | 否 | - | 指定是否使用标签属性格式处理数据,仅在file_format为xml时使用。 | | csv_use_header_line | boolean | 否 | false | 是否使用标题行来解析文件,仅在file_format为`csv`且文件包含符合RFC 4180的标题行时使用 | | schema | Config | 否 | - | 请查看下面的#schema | | compress_codec | String | 否 | None | 文件的压缩编解码器,支持的详细信息如下所示:
    - txt: `lzo` `None`
    - json: `lzo` `None`
    - csv: `lzo` `None`
    - orc: `lzo` `snappy` `lz4` `zlib` `None`
    - parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `None`
    提示:excel类型不支持任何压缩格式 | | archive_compress_codec | string | 否 | none | | encoding | string | 否 | UTF-8 | | null_format | string | 否 | - | 仅在file_format_type为text时使用。null_format用于定义哪些字符串可以表示为null。例如:`\N` | | binary_chunk_size | int | 否 | 1024 | 仅在file_format_type为binary时使用。读取二进制文件的块大小(以字节为单位)。默认为1024字节。较大的值可能会提高大文件的性能,但会使用更多内存。 | | binary_complete_file_mode | boolean | 否 | false | 仅在file_format_type为binary时使用。是否将完整文件作为单个块读取,而不是分割成块。启用时,整个文件内容将一次性读入内存。默认为false。 | | sync_mode | string | 否 | full | 文件同步模式,支持:`full`(默认)、`update`。当 `update` 时,对源/目标进行对比,只读取新增/变更文件(目前仅支持 `file_format_type=binary`)。 | | target_path | string | 否 | - | 仅在 `sync_mode=update` 时使用。目标端基础路径(通常应与 sink 的 `path` 一致),用于对比同相对路径文件。 | | target_hadoop_conf | map | 否 | - | 仅在 `sync_mode=update` 时使用。目标端 Hadoop 配置(可选),可在其中设置 `fs.defaultFS` 覆盖目标 defaultFS。 | | update_strategy | string | 否 | distcp | 仅在 `sync_mode=update` 时使用。支持:`distcp`(默认)、`strict`。 | | compare_mode | string | 否 | len_mtime | 仅在 `sync_mode=update` 时使用。支持:`len_mtime`(默认)、`checksum`(仅在 `update_strategy=strict` 时可用)。 | | common-options | | 否 | - | 数据源插件通用参数,请参考[数据源通用选项](../common-options/source-common-options.md)了解详情。 | | file_filter_modified_start | string | 否 | - | 按照最后修改时间过滤文件。 要过滤的开始时间(包括该时间),时间格式是:`yyyy-MM-dd HH:mm:ss` | | file_filter_modified_end | string | 否 | - | 按照最后修改时间过滤文件。 要过滤的结束时间(不包括该时间),时间格式是:`yyyy-MM-dd HH:mm:ss` | | quote_char | string | 否 | " | 用于包裹 CSV 字段的单字符,可保证包含逗号、换行符或引号的字段被正确解析。 | | escape_char | string | 否 | - | 用于在 CSV 字段内转义引号或其他特殊字符,使其不会结束字段。 | | metalake_type | string | 否 | gravitino | Metalake 服务类型,目前支持 `gravitino`。 | ### file_filter_pattern [string] 文件过滤模式,用于过滤文件。若只想根据文件名称筛选,则直接写文件名称的正则;若同时想根据文件目录进行过滤,则表达式以`path`起始。 该模式遵循标准正则表达式。详情请参考 https://en.wikipedia.org/wiki/Regular_expression。 以下是一些示例。 若`path`为`/data/seatunnel`,且文件结构示例: ``` /data/seatunnel/20241001/report.txt /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv 
/data/seatunnel/20241012/logo.png ``` 匹配规则示例: **示例1**:*匹配所有.txt文件*,正则表达式: ``` .*.txt ``` 此示例匹配的结果是: ``` /data/seatunnel/20241001/report.txt ``` **示例2**:*匹配所有以abc开头的文件*,正则表达式: ``` abc.* ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv ``` **示例3**:*匹配20241007文件夹下所有以 abc 开头的文件,且第四个字符为 h 或 g*,正则表达式: ``` /data/seatunnel/20241007/abc[h,g].* ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv ``` **示例4**:*匹配以202410开头的第三级文件夹和以.csv结尾的文件*,正则表达式: ``` /data/seatunnel/202410\d*/.*.csv ``` 此示例匹配的结果是: ``` /data/seatunnel/20241007/abch202410.csv /data/seatunnel/20241002/abcg202410.csv /data/seatunnel/20241005/old_data.csv ``` ### file_format_type [string] 文件类型,支持以下文件类型: `text` `csv` `parquet` `orc` `json` `excel` `xml` `binary` `markdown` 如果您将文件类型指定为`json`,您还应该指定schema选项来告诉连接器如何将数据解析为您想要的行。 例如: 上游数据如下: ```json {"code": 200, "data": "get success", "success": true} ``` 您也可以在一个文件中保存多条数据,并用换行符分隔: ```json lines {"code": 200, "data": "get success", "success": true} {"code": 300, "data": "get failed", "success": false} ``` 您应该按如下方式指定schema: ```hocon schema { fields { code = int data = string success = boolean } } ``` 连接器将生成如下数据: | code | data | success | |------|-------------|---------| | 200 | get success | true | 如果您将文件类型指定为`parquet` `orc`,则不需要schema选项,连接器可以自动找到上游数据的schema。 如果您将文件类型指定为`text` `csv`,您可以选择指定schema信息或不指定。 例如,上游数据如下: ```text tyrantlucifer#26#male ``` 如果您不指定数据schema,连接器将把上游数据视为如下: | content | |-----------------------| | tyrantlucifer#26#male | 如果您指定数据schema,除了CSV文件类型外,您还应该指定选项`field_delimiter` 您应该按如下方式指定schema和分隔符: ```hocon field_delimiter = "#" schema { fields { name = string age = int gender = string } } ``` 连接器将生成如下数据: | name | age | gender | |---------------|-----|--------| | tyrantlucifer | 26 | male | 如果您将文件类型指定为`binary`,SeaTunnel可以同步任何格式的文件, 例如压缩包、图片等。简而言之,任何文件都可以同步到目标位置。 如果您将文件类型指定为 `markdown`,SeaTunnel 可以解析 markdown 文件并提取结构化数据。 markdown 解析器提取各种元素,包括标题、段落、列表、代码块、表格等。 每个元素都转换为具有以下架构的行: - `element_id`:元素的唯一标识符 - 
`element_type`:元素类型(Heading、Paragraph、ListItem 等) - `heading_level`:标题级别(1-6,非标题元素为 null) - `text`:元素的文本内容 - `page_number`:页码(默认:1) - `position_index`:文档中的位置索引 - `parent_id`:父元素的 ID - `child_ids`:子元素 ID 的逗号分隔列表 注意:Markdown 格式仅支持读取,不支持写入。 在此要求下,您需要确保源和接收器同时使用`binary`格式进行文件同步。 ### compress_codec [string] 文件的压缩编解码器,支持的详细信息如下所示: - txt: `lzo` `none` - json: `lzo` `none` - csv: `lzo` `none` - orc/parquet: 自动识别压缩类型,无需额外设置。 ### archive_compress_codec [string] 归档文件的压缩编解码器,支持的详细信息如下所示: | archive_compress_codec | file_format | archive_compress_suffix | |--------------------|--------------------|---------------------| | ZIP | txt,json,excel,xml | .zip | | TAR | txt,json,excel,xml | .tar | | TAR_GZ | txt,json,excel,xml | .tar.gz | | GZ | txt,json,excel,xml | .gz | | NONE | all | .* | 注意:gz压缩的excel文件需要压缩原始文件或指定文件后缀,例如e2e.xls ->e2e_test.xls.gz ### encoding [string] 仅在file_format_type为json、text、csv、xml时使用。 要读取的文件的编码。此参数将由`Charset.forName(encoding)`解析。 ### binary_chunk_size [int] 仅在file_format_type为binary时使用。 读取二进制文件的块大小(以字节为单位)。默认为1024字节。较大的值可能会提高大文件的性能,但会使用更多内存。 ### binary_complete_file_mode [boolean] 仅在file_format_type为binary时使用。 是否将完整文件作为单个块读取,而不是分割成块。启用时,整个文件内容将一次性读入内存。默认为false。 ### sync_mode [string] 文件同步模式,支持:`full`(默认)、`update`。 当 `update` 时,对源/目标进行对比,只读取新增/变更文件(目前仅支持 `file_format_type=binary`)。 **性能注意事项** - Update 模式会对每个源文件额外发起一次到目标端的 `getFileStatus` 用于对比。 - 对于远程文件系统(FTP/SFTP),会带来按文件的网络开销,不建议用于海量小文件场景。 **要求 / 限制** - `target_path` 通常应与 sink 的 `path` 一致(同一文件系统且相对路径结构一致)。 - 使用 `update_strategy=distcp` 时,依赖源/目标端时钟同步,否则可能误判。 - 使用 `compare_mode=checksum` 时,需要文件系统支持 checksum;若无法获取 checksum,SeaTunnel 会降级为内容比较(开销更大)并打印告警日志。 示例: ```hocon sync_mode = "update" file_format_type = "binary" target_path = "/path/to/your/sink/path" update_strategy = "distcp" compare_mode = "len_mtime" ``` ### target_path [string] 仅在 `sync_mode=update` 时使用。目标端基础路径(通常应与 sink 的 `path` 一致),用于对比同相对路径文件。 ### target_hadoop_conf [map] 仅在 `sync_mode=update` 时使用。目标端 Hadoop 配置(可选),可在其中设置 `fs.defaultFS` 覆盖目标 
defaultFS。 ### update_strategy [string] 仅在 `sync_mode=update` 时使用。支持:`distcp`(默认)、`strict`。 ### compare_mode [string] 仅在 `sync_mode=update` 时使用。支持:`len_mtime`(默认)、`checksum`(仅在 `update_strategy=strict` 时可用)。 ### schema [config] #### fields [Config] 上游数据的schema。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 #### schema_url [string] 通过 restApi 获取元数据信息的 http url,例如:`http://localhost:8090/api/metalakes/laowang_test/catalogs/221-pgsql/schemas/ykw/tables/all_type` > 当使用 Gravitino 作为元数据源时,Gravitino 的列类型会自动转换为 SeaTunnel 数据类型。详细的类型映射信息请参考 [Gravitino 类型映射](../../introduction/concepts/gravitino-type-mapping.md)。 #### metalake_type [string] Metalake 服务类型,目前仅支持 `gravitino`。当使用 `schema_url` 从 Gravitino 获取元数据时,可以指定此参数(默认为 `gravitino`)。 有关 Metalake 的更多信息,请参考 [Metalake](../../introduction/concepts/metalake.md)。 ## 如何创建Sftp数据同步作业 以下示例演示如何创建从sftp读取数据并在本地客户端打印的数据同步作业: ```bash # 设置要执行的任务的基本配置 env { parallelism = 1 job.mode = "BATCH" } # 创建连接到sftp的数据源 source { SftpFile { host = "sftp" port = 22 user = seatunnel password = pass path = "tmp/seatunnel/read/json" file_format_type = "json" plugin_output = "sftp" schema = { fields { c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(38, 18)" c_timestamp = timestamp c_row = { C_MAP = "map" C_ARRAY = "array" C_STRING = string C_BOOLEAN = boolean C_TINYINT = tinyint C_SMALLINT = smallint C_INT = int C_BIGINT = bigint C_FLOAT = float C_DOUBLE = double C_BYTES = bytes C_DATE = date C_DECIMAL = "decimal(38, 18)" C_TIMESTAMP = timestamp } } } } } # 控制台打印读取的sftp数据 sink { Console { parallelism = 1 } } ``` ### 多表 ```hocon SftpFile { tables_configs = [ { schema { table = "student" fields { name = string age = int } } path = "/tmp/seatunnel/sink/text" host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao file_format_type = "parquet" }, { 
schema { table = "teacher" fields { name = string age = int } } path = "/tmp/seatunnel/sink/text" host = "192.168.31.48" port = 21 user = tyrantlucifer password = tianchao file_format_type = "parquet" } ] } ``` ### 过滤文件 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { SftpFile { host = "sftp" port = 22 user = seatunnel password = pass path = "tmp/seatunnel/read/json" file_format_type = "json" plugin_output = "sftp" // 文件示例 abcD2024.csv file_filter_pattern = "abc[DX]*.*" } } sink { Console { } } ``` ### 增量同步(sync_mode=update,仅 binary) `sync_mode=update` 会对比 source 与 `target_path`,仅读取新增/变更文件。 多数情况下,`target_path` 需要与 sink 的 `path` 对齐(同一文件系统、相同相对路径)。 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { SftpFile { host = "sftp" port = 22 user = seatunnel password = pass path = "tmp/seatunnel/update/src" file_format_type = "binary" sync_mode = "update" target_path = "tmp/seatunnel/update/dst" update_strategy = "distcp" compare_mode = "len_mtime" } } sink { SftpFile { host = "sftp" port = 22 user = seatunnel password = pass path = "tmp/seatunnel/update/dst" tmp_path = "tmp/seatunnel/update/tmp" file_format_type = "binary" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Sls.md ================================================ import ChangeLog from '../changelog/connector-sls.md'; # Sls > Sls source connector ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 主要特性 - [x] [batch](../../introduction/concepts/connector-v2-features.md) - [x] [stream](../../introduction/concepts/connector-v2-features.md) - [x] [exactly-once](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [parallelism](../../introduction/concepts/connector-v2-features.md) - [ ] [support user-defined split](../../introduction/concepts/connector-v2-features.md) ## 描述 从阿里云Sls日志服务中读取数据。 ## 支持的数据源信息 为了使用Sls连接器,需要以下依赖关系。 它们可以通过install-plugin.sh或Maven中央存储库下载。 | 数据源 | 支持的版本 | Maven | |-----|-----------|-----------------------------------------------------------------------------------| | Sls | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/connector-sls) | ## Source Options | Name | Type | Required | Default | Description | |-------------------------------------|---------------------------------------------|----------|--------------------------|------------------------------------------------------------------------------------------------------------------------------------| | project | String | Yes | - | [阿里云 Sls 项目](https://help.aliyun.com/zh/sls/user-guide/manage-a-project?spm=a2c4g.11186623.0.0.6f9755ebyfaYSl) | | logstore | String | Yes | - | [阿里云 Sls 日志库](https://help.aliyun.com/zh/sls/user-guide/manage-a-logstore?spm=a2c4g.11186623.0.0.13137c08nfuiBC) | | endpoint | String | Yes | - | [阿里云访问服务点](https://help.aliyun.com/zh/sls/developer-reference/api-sls-2020-12-30-endpoint?spm=a2c4g.11186623.0.0.548945a8UyJULa) | | access_key_id | String | Yes | - | [阿里云访问用户ID](https://help.aliyun.com/zh/ram/user-guide/create-an-accesskey-pair?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#task-2245479) | | access_key_secret | String | Yes | - | [阿里云访问用户密码](https://help.aliyun.com/zh/ram/user-guide/create-an-accesskey-pair?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#task-2245479) | | start_mode | StartMode[earliest],[group_cursor],[latest] | No | group_cursor | 
消费者的初始消费模式 | | consumer_group | String | No | SeaTunnel-Consumer-Group | Sls消费者组id,用于区分不同的消费者组 | | auto_cursor_reset | CursorMode[begin],[end] | No | end | 当消费者组中没有记录读取游标时,初始化读取游标 | | batch_size | Int | No | 1000 | 每次从SLS中读取的数据量 | | partition-discovery.interval-millis | Long | No | -1 | 动态发现主题和分区的间隔 | ## 任务示例 ### 简单示例 > 此示例读取sls的logstore1的数据并将其打印到客户端。如果您尚未安装和部署SeaTunnel,则需要按照安装SeaTunnel中的说明安装和部署SeaTunnel。然后按照[快速启动SeaTunnel引擎](../../getting-started/locally/quick-start-seatunnel-engine.md)中的说明运行此作业。 [创建RAM用户及授权](https://help.aliyun.com/zh/sls/create-a-ram-user-and-authorize-the-ram-user-to-access-log-service?spm=a2c4g.11186623.0.i4), 请确认RAM用户有足够的权限来读取及管理数据,参考:[RAM自定义授权示例](https://help.aliyun.com/zh/sls/use-custom-policies-to-grant-permissions-to-a-ram-user?spm=a2c4g.11186623.0.0.4a6e4e554CKhSc#reference-s3z-m1l-z2b) ```hocon # Defining the runtime environment env { parallelism = 2 job.mode = "STREAMING" checkpoint.interval = 30000 } source { Sls { endpoint = "cn-hangzhou-intranet.log.aliyuncs.com" project = "project1" logstore = "logstore1" access_key_id = "xxxxxxxxxxxxxxxxxxxxxxxx" access_key_secret = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" schema = { fields = { id = "int" name = "string" description = "string" weight = "string" } } } } sink { Console { } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Snowflake.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Snowflake > JDBC Snowflake 源连接器 > > ## 支持这些引擎 > > Spark
    > Flink
    > SeaTunnel Zeta
    > ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) > 支持查询 SQL 并可以实现投影效果。 > ## 描述 通过 JDBC 读取外部数据源数据。 ## 支持的数据源列表 | 数据源 | 支持的版本 | 驱动 | 连接串 | Maven | |--------|-----------|------|--------|-------| | snowflake | 不同的依赖版本有不同的驱动类 | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | [下载](https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc) | ## 数据库依赖 > 请下载对应 'Maven' 的支持列表,并将其复制到 '$SEATUNNEL_HOME/plugins/jdbc/lib/' 工作目录
    > 例如 Snowflake 数据源:cp snowflake-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ > ## 数据类型映射 | Snowflake 数据类型 | SeaTunnel 数据类型 | |------------------|------------------| | BOOLEAN | BOOLEAN | | TINYINT
    SMALLINT
    BYTEINT | SHORT_TYPE | | INT
    INTEGER | INT | | BIGINT | LONG | | DECIMAL
    NUMERIC
    NUMBER | DECIMAL(x,y) | | DECIMAL(x,y)(>38) | DECIMAL(38,18) | | REAL
    FLOAT4 | FLOAT | | DOUBLE
    DOUBLE PRECISION
    FLOAT8
    FLOAT | DOUBLE | | CHAR
    CHARACTER
    VARCHAR
    STRING
    TEXT
    VARIANT
    OBJECT | STRING | | DATE | DATE | | TIME | TIME | | DATETIME
    TIMESTAMP
    TIMESTAMP_LTZ
    TIMESTAMP_NTZ
    TIMESTAMP_TZ | TIMESTAMP | | BINARY
    VARBINARY | BYTES | | GEOGRAPHY (WKB or EWKB)
    GEOMETRY (WKB or EWKB) | BYTES | | GEOGRAPHY (GeoJSON, WKT or EWKT)
    GEOMETRY (GeoJSON, WKT or EWKT) | STRING | ## 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |--------|------|------|--------|------| | url | String | 是 | - | JDBC 连接的 URL。参考示例:jdbc:snowflake://.snowflakecomputing.com | | driver | String | 是 | - | 用于连接到远程数据源的 jdbc 类名,如果您使用 Snowflake,值为 `net.snowflake.client.jdbc.SnowflakeDriver`。 | | username | String | 否 | - | 连接实例用户名 | | password | String | 否 | - | 连接实例密码 | | query | String | 是 | - | 查询语句 | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(秒) | | partition_column | String | 否 | - | 用于并行性分割的列名,仅支持数值类型,仅支持数值类型主键,只能配置一列。 | | partition_lower_bound | BigDecimal | 否 | - | partition_column 的最小值用于扫描,如果未设置,SeaTunnel 将查询数据库获取最小值。 | | partition_upper_bound | BigDecimal | 否 | - | partition_column 的最大值用于扫描,如果未设置,SeaTunnel 将查询数据库获取最大值。 | | partition_num | Int | 否 | job parallelism | 分割数量,仅支持正整数。默认值是任务并行度。 | | fetch_size | Int | 否 | 0 | 对于返回大量对象的查询,您可以配置查询中使用的行提取大小,以通过减少满足选择条件所需的数据库命中次数来提高性能。零表示使用 jdbc 默认值。 | | properties | Map | 否 | - | 其他连接配置参数,当 properties 和 URL 具有相同参数时,优先级由驱动程序的具体实现确定。例如,在 MySQL 中,properties 优先于 URL。 | | common-options | | 否 | - | 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 | ## 提示 > 如果未设置 partition_column,它将以单并发运行,如果设置了 partition_column,它将根据任务的并发度并行执行。 > > JDBC 驱动程序连接参数在 JDBC 连接字符串中受支持。例如,您可以添加 `?GEOGRAPHY_OUTPUT_FORMAT='EWKT'` 来指定地理空间数据类型。有关可配置参数和地理空间数据类型的更多信息,请访问 Snowflake 官方[文档](https://docs.snowflake.com/en/sql-reference/data-types-geospatial) ## 任务示例 ### 简单 > 此示例在单个并行中查询您的测试"数据库"中的 type_bin 表的 16 条数据,并查询其所有字段。您也可以指定要查询的字段以最终输出到控制台。 ``` # 定义运行时环境 env { parallelism = 2 job.mode = "BATCH" } source { Jdbc { url = "jdbc:snowflake://.snowflakecomputing.com" driver = "net.snowflake.client.jdbc.SnowflakeDriver" connection_check_timeout_sec = 100 username = "root" password = "123456" query = "select * from type_bin limit 16" } } transform { # 如果您想了解有关如何配置 seatunnel 的更多信息并查看完整的转换插件列表, # 请访问 https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} } ``` ### 并行 > 
使用您配置的分片字段和分片数据并行读取查询表。如果您想读取整个表,可以这样做 ``` Jdbc { url = "jdbc:snowflake://.snowflakecomputing.com" driver = "net.snowflake.client.jdbc.SnowflakeDriver" connection_check_timeout_sec = 100 username = "root" password = "123456" # 根据需要定义查询逻辑 query = "select * from type_bin" # 并行分片读取字段 partition_column = "id" # 分片数量 partition_num = 10 } ``` ### 并行边界 > 指定查询的上下边界内的数据更高效。根据您配置的上下边界读取数据源更高效 ``` Jdbc { url = "jdbc:snowflake://.snowflakecomputing.com" driver = "net.snowflake.client.jdbc.SnowflakeDriver" connection_check_timeout_sec = 100 username = "root" password = "123456" # 根据需要定义查询逻辑 query = "select * from type_bin" partition_column = "id" # 读取开始边界 partition_lower_bound = 1 # 读取结束边界 partition_upper_bound = 500 partition_num = 10 } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Socket.md ================================================ import ChangeLog from '../changelog/connector-socket.md'; # Socket > Socket 源连接器 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [x] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 描述 用于从 Socket 读取数据。 ## 数据类型映射 文件没有特定的类型列表,我们可以通过在配置中指定 Schema 来指示相应的数据需要转换为哪种 SeaTunnel 数据类型。 | SeaTunnel 数据类型 | |------------------| | STRING | | SHORT | | INT | | BIGINT | | BOOLEAN | | DOUBLE | | DECIMAL | | FLOAT | | DATE | | TIME | | TIMESTAMP | | BYTES | | ARRAY | | MAP | ## 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |--------|------|------|--------|------| | host | String | 是 | - | socket 服务器主机 | | port | Integer | 是 | - | socket 服务器端口 | | common-options | | 否 | - | 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 | ## 如何创建 Socket 数据同步作业 * 配置 SeaTunnel 配置文件 以下示例演示如何创建从 Socket 读取数据并在本地客户端上打印的数据同步作业: ```bash # 设置要执行的任务的基本配置 env { parallelism = 1 job.mode = "BATCH" } # 创建源以连接到 socket source { Socket { host = "localhost" port = 9999 } } # 控制台打印读取的 socket 数据 sink { Console { parallelism = 1 } } ``` * 启动端口监听 ```shell nc -l 9999 ``` * 启动 SeaTunnel 任务 * Socket 源发送测试数据 ```text ~ nc -l 9999 test hello flink spark ``` * 控制台 Sink 打印数据 ```text [test] [hello] [flink] [spark] ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/SqlServer-CDC.md ================================================ import ChangeLog from '../changelog/connector-cdc-sqlserver.md'; # SQL Server CDC > Sql Server CDC 源连接器 ## 支持 SQL Server 版本 - server:2019(或更高版本,仅供参考) ## 支持的引擎 > SeaTunnel Zeta
    > Flink
    ## 主要功能 - [ ] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户定义分割](../../introduction/concepts/connector-v2-features.md) ## 描述 Sql Server CDC 连接器允许从 SqlServer 数据库读取快照数据和增量数据。本文档描述了如何设置 Sql Server CDC 连接器来对 SqlServer 数据库运行 SQL 查询。 :::tip 在通过 JDBC 元数据发现表列信息时,SeaTunnel 会按精确的 schema/table 标识符对返回结果做二次过滤,以避免混入其他表的列(部分驱动会将 `schemaPattern`/`tableNamePattern` 视为 SQL LIKE 模式匹配)。对于大小写敏感的数据库,请确保配置的标识符大小写与数据库一致。 ::: ## 支持的数据源信息 | 数据源 | 支持版本 | 驱动 | Url | Maven | | --------- | --------------------------------------------- | -------------------------------------------- | ------------------------------------------------------------- | --------------------------------------------------------------------- | | SqlServer |
  • server:2019(或更高版本,仅供参考)
  • | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433;databaseName=column_type_test | https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc | ## 需要的依赖项 ### 安装 Jdbc 驱动 #### 对于 Spark/Flink 引擎 > 1. 你需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) 已经放置在 `${SEATUNNEL_HOME}/plugins/` 目录中。 #### 对于 SeaTunnel Zeta 引擎 > 1. 你需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) 已经放置在 `${SEATUNNEL_HOME}/lib/` 目录中。 ## 数据类型映射 | SQLserver 数据类型 | SeaTunnel 数据类型 | |----------------------------------------------------------------------|---------------------| | CHAR
    VARCHAR
    NCHAR
    NVARCHAR
    TEXT
    NTEXT
    XML | STRING | | BINARY
    VARBINARY
    IMAGE | BYTES | | INTEGER
    INT | INT | | SMALLINT
    TINYINT | SMALLINT | | BIGINT | BIGINT | | FLOAT(1~24)
    REAL | FLOAT | | DOUBLE
    FLOAT(>24) | DOUBLE | | NUMERIC(p,s)
    DECIMAL(p,s)
    MONEY
    SMALLMONEY | DECIMAL(p, s) | | TIMESTAMP | BYTES | | DATE | DATE | | TIME(s) | TIME(s) | | DATETIME(s)
    DATETIME2(s)
    DATETIMEOFFSET(s)
    SMALLDATETIME | TIMESTAMP(s) | | BOOLEAN
    BIT
    | BOOLEAN | ## 数据源参数 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | | ---------------------------------------------- | -------- | -------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | username | String | 是 | - | 连接数据库服务器时使用的数据库名称。 | | password | String | 是 | - | 连接数据库服务器时使用的密码。 | | database-names | List | 是 | - | 要监控的数据库名称。 | | table-names | List | 是 | - | 表名是模式名和表名的组合 (databaseName.schemaName.tableName)。 | | table-names-config | List | 否 | - | 表配置列表。例如:[{"table": "db1.schema1.table1","primaryKeys": ["key1"],"snapshotSplitColumn": "key2"}] | | url | String | 是 | - | URL 必须包含数据库,如 "jdbc:sqlserver://localhost:1433;databaseName=test"。 | | startup.mode | Enum | 否 | INITIAL | SqlServer CDC 消费者的可选启动模式,有效枚举为 "initial"、"earliest"、"latest"、"timestamp" 和 "specific"。 | | startup.timestamp | Long | 否 | - | 从指定的纪元时间戳(以毫秒为单位)开始。当 `startup.mode = timestamp` 时,该时间戳会按 `server-time-zone` 转换。
    **注意,当 "startup.mode" 选项使用 `'timestamp'` 时,此选项是必需的。** | | startup.specific-offset.file | String | 否 | - | 从指定的 binlog 文件名开始。
    **注意,当 "startup.mode" 选项使用 `'specific'` 时,此选项是必需的。** | | startup.specific-offset.pos | Long | 否 | - | 从指定的 binlog 文件位置开始。
    **注意,当 "startup.mode" 选项使用 `'specific'` 时,此选项是必需的。** | | stop.mode | Enum | 否 | NEVER | SqlServer CDC 消费者的可选停止模式,有效枚举为 "never"。 | | stop.timestamp | Long | 否 | - | 在指定的纪元时间戳(以毫秒为单位)停止。
    **注意,当 "stop.mode" 选项使用 `'timestamp'` 时,此选项是必需的。** | | stop.specific-offset.file | String | 否 | - | 在指定的 binlog 文件名停止。
    **注意,当 "stop.mode" 选项使用 `'specific'` 时,此选项是必需的。** | | stop.specific-offset.pos | Long | 否 | - | 在指定的 binlog 文件位置停止。
    **注意,当 "stop.mode" 选项使用 `'specific'` 时,此选项是必需的。** | | incremental.parallelism | Integer | 否 | 1 | 增量阶段中并行读取器的数量。 | | snapshot.split.size | Integer | 否 | 8096 | 表快照的分割大小(行数),读取表快照时,捕获的表会被分割为多个分割。 | | snapshot.fetch.size | Integer | 否 | 1024 | 读取表快照时每次轮询的最大获取大小。 | | server-time-zone | String | 否 | UTC | 数据库服务器中的会话时区。该参数也用于将 `startup.timestamp` 转换为 LSN。若数据库时区与 JVM 时区不同,建议显式配置。 | | connect.timeout | Duration | 否 | 30s | 连接器尝试连接到数据库服务器后在超时之前应该等待的最长时间。 | | connect.max-retries | Integer | 否 | 3 | 连接器应该重试建立数据库服务器连接的最大重试次数。 | | connection.pool.size | Integer | 否 | 20 | 连接池大小。 | | chunk-key.even-distribution.factor.upper-bound | Double | 否 | 100 | 分块键分布因子的上界。此因子用于确定表数据是否均匀分布。如果计算的分布因子小于或等于此上界(即,(MAX(id) - MIN(id) + 1) / 行数),表分块将被优化以实现均匀分布。否则,如果分布因子较大,如果估计的分片数超过 `sample-sharding.threshold` 指定的值,表将被视为不均匀分布并使用基于采样的分片策略。默认值为 100.0。 | | chunk-key.even-distribution.factor.lower-bound | Double | 否 | 0.05 | 分块键分布因子的下界。此因子用于确定表数据是否均匀分布。如果计算的分布因子大于或等于此下界(即,(MAX(id) - MIN(id) + 1) / 行数),表分块将被优化以实现均匀分布。否则,如果分布因子较小,如果估计的分片数超过 `sample-sharding.threshold` 指定的值,表将被视为不均匀分布并使用基于采样的分片策略。默认值为 0.05。 | | sample-sharding.threshold | int | 否 | 1000 | 此配置指定了触发采样分片策略的估计分片数阈值。当分布因子超出 `chunk-key.even-distribution.factor.upper-bound` 和 `chunk-key.even-distribution.factor.lower-bound` 指定的范围,并且估计的分片数(计算为近似行数 / 分块大小)超过此阈值时,将使用采样分片策略。这可以帮助更有效地处理大型数据集。默认值为 1000 分片。 | | inverse-sampling.rate | int | 否 | 1000 | 采样分片策略中使用的采样率的倒数。例如,如果此值设置为 1000,则意味着在采样过程中应用 1/1000 的采样率。此选项提供了控制采样粒度的灵活性,从而影响最终的分片数量。对于非常大的数据集,首选较低的采样率时,此选项特别有用。默认值为 1000。 | | exactly_once | Boolean | 否 | false | 启用精确一次语义。 | | debezium.* | config | 否 | - | 将 Debezium 的属性传递给 Debezium Embedded Engine,用于捕获来自 SqlServer 服务器的数据变更。
    了解更多关于
    [Debezium 的 SqlServer 连接器属性](https://github.com/debezium/debezium/blob/1.6/documentation/modules/ROOT/pages/connectors/sqlserver.adoc#connector-properties) | | format | Enum | 否 | DEFAULT | SqlServer CDC 的可选输出格式,有效枚举为 "DEFAULT"、"COMPATIBLE_DEBEZIUM_JSON"。 | | common-options | | 否 | - | 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 获取详细信息。 | ### 启用 Sql Server CDC 1. 检查 CDC 代理是否启用 > `EXEC xp_servicecontrol N'querystate', N'SQLServerAGENT';`
    > 如果结果是运行中,证明它已经启用。否则,您需要手动启用它 2. 启用 CDC 代理 > /opt/mssql/bin/mssql-conf setup 3. 结果如下 > 1) 评估版(免费,无生产使用权,180天限制) > 2) 开发者版(免费,无生产使用权) > 3) 快速版(免费) > 4) Web 版(付费) > 5) 标准版(付费) > 6) 企业版(付费) > 7) 企业核心版(付费) > 8) 我通过零售销售渠道购买了许可证,并有产品密钥要输入。 4. 在数据库级别设置 CDC 在下面的数据库级别设置以启用 CDC。在此级别,启用 CDC 的数据库下的所有表都会自动启用 CDC > USE TestDB; -- 替换为实际的数据库名称
    > EXEC sys.sp_cdc_enable_db;
    > SELECT name, is_tracked_by_cdc FROM sys.tables WHERE name = 'table'; -- table 替换为您要检查的表名 ## 任务示例 ### 初始读取简单示例 > 这是一个流模式 CDC,初始化读取表数据,成功读取后将进行增量读取。以下 SQL DDL 仅供参考 ``` env { # 您可以在这里设置引擎配置 parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { # 这是一个示例源插件 **仅用于测试和演示源插件功能** SqlServer-CDC { plugin_output = "customers" username = "sa" password = "Y.sa123456" startup.mode="initial" database-names = ["column_type_test"] table-names = ["column_type_test.dbo.full_types"] url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" } } transform { } sink { console { plugin_input = "customers" } } ``` ### 增量读取简单示例 > 这是一个增量读取,读取变更的数据进行打印 ``` env { # 您可以在这里设置引擎配置 parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { # 这是一个示例源插件 **仅用于测试和演示源插件功能** SqlServer-CDC { # 设置精确一次读取 exactly_once=true plugin_output = "customers" username = "sa" password = "Y.sa123456" startup.mode="latest" database-names = ["column_type_test"] table-names = ["column_type_test.dbo.full_types"] url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" } } transform { } sink { console { plugin_input = "customers" } } ``` ### 支持表的自定义主键 ``` env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { SqlServer-CDC { url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = "sa" password = "Y.sa123456" database-names = ["column_type_test"] table-names = ["column_type_test.dbo.simple_types", "column_type_test.dbo.full_types"] table-names-config = [ { table = "column_type_test.dbo.full_types" primaryKeys = ["id"] } ] } } sink { console { } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/SqlServer.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # SQL Server > JDBC SQL Server 源连接器 ## 支持 SQL Server 版本 - server:2008(或更高版本,仅供参考) ## 支持的引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 需要的依赖项 ### 对于 Spark/Flink 引擎 > 1. 你需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) 已经放置在 `${SEATUNNEL_HOME}/plugins/` 目录中。 ### 对于 SeaTunnel Zeta 引擎 > 1. 你需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) 已经放置在 `${SEATUNNEL_HOME}/lib/` 目录中。 ## 主要功能 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户定义分割](../../introduction/concepts/connector-v2-features.md) > 支持查询 SQL 并可以实现投影效果。 ## 描述 通过 JDBC 读取外部数据源数据。 ## 支持的数据源信息 | 数据源 | 支持版本 | 驱动 | url | maven | |------------|-------------------------|----------------------------------------------|---------------------------------|-----------------------------------------------------------------------------------| | SQL Server | 支持版本 >= 2008 | com.microsoft.sqlserver.jdbc.SQLServerDriver | jdbc:sqlserver://localhost:1433 | [下载](https://mvnrepository.com/artifact/com.microsoft.sqlserver/mssql-jdbc) | ## 数据库依赖 > 请下载对应 'Maven' 的支持列表,并将其复制到 '$SEATUNNEL_HOME/plugins/jdbc/lib/' 工作目录
    > 例如 SQL Server 数据源:cp mssql-jdbc-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/ ## 数据类型映射 | SQLserver 数据类型 | SeaTunnel 数据类型 | |----------------------------------------------------------------------|---------------------| | BIT | BOOLEAN | | TINYINT
    SMALLINT | SMALLINT | | INTEGER
    INT | INT | | BIGINT | BIGINT | | NUMERIC(p,s)
    DECIMAL(p,s)
    MONEY
    SMALLMONEY | DECIMAL(p,s) | | FLOAT(1~24)
    REAL | FLOAT | | DOUBLE
    FLOAT(>24) | DOUBLE | | CHAR
    NCHAR
    VARCHAR
    NTEXT
    NVARCHAR
    TEXT
    XML | STRING | | DATE | DATE | | TIME(s) | TIME(s) | | DATETIME(s)
    DATETIME2(s)
    DATETIMEOFFSET(s)
    SMALLDATETIME | TIMESTAMP(s) | | BINARY
    VARBINARY
    IMAGE | BYTES | ## 数据源参数 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | | ------------------------------------------ | ------- | -------- | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | url | String | 是 | - | JDBC 连接的 URL。参见示例:jdbc:sqlserver://127.0.0.1:1434;database=TestDB | | driver | String | 是 | - | 用于连接远程数据源的 jdbc 类名,
    如果使用 SQLserver,值为 `com.microsoft.sqlserver.jdbc.SQLServerDriver`。 | | username | String | 否 | - | 连接实例的用户名 | | password | String | 否 | - | 连接实例的密码 | | query | String | 是 | - | 查询语句 | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(秒) | | partition_column | String | 否 | - | 用于并行度分区的列名,仅支持数值类型。 | | partition_lower_bound | Long | 否 | - | partition_column 扫描的最小值,如果未设置,SeaTunnel 将查询数据库获取最小值。 | | partition_upper_bound | Long | 否 | - | partition_column 扫描的最大值,如果未设置,SeaTunnel 将查询数据库获取最大值。 | | partition_num | Int | 否 | job parallelism | 分区数量,仅支持正整数。默认值为作业并行度 | | fetch_size | Int | 否 | 0 | 对于返回大量对象的查询,你可以配置
    查询中使用的行获取大小来提高性能,
    通过减少满足选择条件所需的数据库命中次数。
    零表示使用 jdbc 默认值。 | | properties | Map | 否 | - | 额外的连接配置参数,当 properties 和 URL 具有相同参数时,优先级由
    驱动的具体实现决定。例如,在 MySQL 中,properties 优先于 URL。 | | use_regex | Boolean | 否 | false | 控制 table_path 的正则表达式匹配。当设置为 `true` 时,table_path 将被视为正则表达式模式。当设置为 `false` 或未指定时,table_path 将被视为精确路径(不进行正则匹配)。 | | table_path | String | 否 | - | 表的完整路径,您可以使用此配置代替 `query`。
    示例:
    "testdb.test_schema.table1" | | table_list | Array | 否 | - | 要读取的表列表,您可以使用此配置代替 `table_path`。示例:```[{ table_path = "testdb.table1"}, {table_path = "testdb.table2", query = "select * id, name from testdb.table2"}]``` | | where_condition | String | 否 | - | 所有表/查询的通用行过滤条件,必须以 `where` 开头。例如 `where id > 100` | | split.size | Int | 否 | 8096 | 表的分割大小(行数),读取表时,捕获的表会被分割为多个分割。 | | split.even-distribution.factor.lower-bound | Double | 否 | 0.05 | 分块键分布因子的下界。此因子用于确定表数据是否均匀分布。如果计算的分布因子大于或等于此下界(即,(MAX(id) - MIN(id) + 1) / 行数),表分块将被优化以实现均匀分布。否则,如果分布因子较小,如果估计的分片数超过 `sample-sharding.threshold` 指定的值,表将被视为不均匀分布并使用基于采样的分片策略。默认值为 0.05。 | | split.even-distribution.factor.upper-bound | Double | 否 | 100 | 分块键分布因子的上界。此因子用于确定表数据是否均匀分布。如果计算的分布因子小于或等于此上界(即,(MAX(id) - MIN(id) + 1) / 行数),表分块将被优化以实现均匀分布。否则,如果分布因子较大,如果估计的分片数超过 `sample-sharding.threshold` 指定的值,表将被视为不均匀分布并使用基于采样的分片策略。默认值为 100.0。 | | split.sample-sharding.threshold | Int | 否 | 10000 | 此配置指定了触发采样分片策略的估计分片数阈值。当分布因子超出 `chunk-key.even-distribution.factor.upper-bound` 和 `chunk-key.even-distribution.factor.lower-bound` 指定的范围,并且估计的分片数(计算为近似行数 / 分块大小)超过此阈值时,将使用采样分片策略。这可以帮助更有效地处理大型数据集。默认值为 1000 分片。 | | split.inverse-sampling.rate | Int | 否 | 1000 | 采样分片策略中使用的采样率的倒数。例如,如果此值设置为 1000,则意味着在采样过程中应用 1/1000 的采样率。此选项提供了控制采样粒度的灵活性,从而影响最终的分片数量。对于非常大的数据集,首选较低的采样率时,此选项特别有用。默认值为 1000。 | | common-options | | 否 | - | 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 获取详细信息 | ## 并行读取器 JDBC 源连接器支持从表中并行读取数据。SeaTunnel 将使用某些规则来分割表中的数据,然后将其交给读取器进行读取。读取器的数量由 `parallelism` 选项决定。 **分割键规则:** 1. 如果 `partition_column` 不为空,将使用它来计算分割。该列必须在 **支持的分割数据类型** 中。 2. 如果 `partition_column` 为空,seatunnel 将从表中读取模式并获取主键和唯一索引。如果主键和唯一索引中有多个列,则将使用 **支持的分割数据类型** 中的第一列来分割数据。例如,表具有主键(nn guid, name varchar),因为 `guid` 不在 **支持的分割数据类型** 中,所以将使用 `name` 列来分割数据。 **支持的分割数据类型:** * String * Number(int, bigint, decimal, ...) 
* Date ### 与分割相关的选项 #### split.size 一个分割中有多少行,读取表时,捕获的表会被分割为多个分割。 #### split.even-distribution.factor.lower-bound > 不推荐使用 分块键分布因子的下界。此因子用于确定表数据是否均匀分布。如果计算的分布因子大于或等于此下界(即,(MAX(id) - MIN(id) + 1) / 行数),表分块将被优化以实现均匀分布。否则,如果分布因子较小,如果估计的分片数超过 `sample-sharding.threshold` 指定的值,表将被视为不均匀分布并使用基于采样的分片策略。默认值为 0.05。 #### split.even-distribution.factor.upper-bound > 不推荐使用 分块键分布因子的上界。此因子用于确定表数据是否均匀分布。如果计算的分布因子小于或等于此上界(即,(MAX(id) - MIN(id) + 1) / 行数),表分块将被优化以实现均匀分布。否则,如果分布因子较大,如果估计的分片数超过 `sample-sharding.threshold` 指定的值,表将被视为不均匀分布并使用基于采样的分片策略。默认值为 100.0。 #### split.sample-sharding.threshold 此配置指定了触发采样分片策略的估计分片数阈值。当分布因子超出 `chunk-key.even-distribution.factor.upper-bound` 和 `chunk-key.even-distribution.factor.lower-bound` 指定的范围,并且估计的分片数(计算为近似行数 / 分块大小)超过此阈值时,将使用采样分片策略。这可以帮助更有效地处理大型数据集。默认值为 1000 分片。 #### split.inverse-sampling.rate 采样分片策略中使用的采样率的倒数。例如,如果此值设置为 1000,则意味着在采样过程中应用 1/1000 的采样率。此选项提供了控制采样粒度的灵活性,从而影响最终的分片数量。对于非常大的数据集,首选较低的采样率时,此选项特别有用。默认值为 1000。 #### partition_column [string] 用于分割数据的列名。 #### partition_upper_bound [BigDecimal] partition_column 扫描的最大值,如果未设置,SeaTunnel 将查询数据库获取最大值。 #### partition_lower_bound [BigDecimal] partition_column 扫描的最小值,如果未设置,SeaTunnel 将查询数据库获取最小值。 #### partition_num [int] > 不推荐使用,正确的方法是通过 `split.size` 控制分割数量 我们需要分割为多少个分割,仅支持正整数。默认值为作业并行度。 ## 提示 > 如果表无法分割(例如,表没有主键或唯一索引,且未设置 `partition_column`),将以单个并发运行。 > > 使用 `table_path` 替代 `query` 进行单表读取。如果需要读取多个表,请使用 `table_list`。 ## 任务示例 ### 简单的例子 > 读取数据表的简单单个任务 ``` # 定义运行时环境 env { parallelism = 1 job.mode = "BATCH" } source{ Jdbc { driver = com.microsoft.sqlserver.jdbc.SQLServerDriver url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = SA password = "Y.sa123456" query = "select * from full_types_jdbc" } } transform { # 如果你想了解更多关于如何配置 seatunnel 的信息,并查看转换插件的完整列表, # 请前往 https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} } ``` ### 并行示例 > 使用您配置的分片字段并行读取查询表和分片数据。如果您想读取整个表,可以这样做 ``` env { parallelism = 10 job.mode = "BATCH" } source { Jdbc { driver = 
com.microsoft.sqlserver.jdbc.SQLServerDriver url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = SA password = "Y.sa123456" # 根据需要定义查询逻辑 query = "select * from full_types_jdbc" # 并行分片读取字段 partition_column = "id" # 分片数量 partition_num = 10 } } transform { # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, # please go to https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} } ``` ### 分片并行读取简单示例 > 这是一个快速并行读取数据的分片 ``` env { # 您可以在这里设置引擎配置 parallelism = 10 } source { # 这是一个示例源插件 **仅用于测试和演示源插件功能** Jdbc { driver = com.microsoft.sqlserver.jdbc.SQLServerDriver url = "jdbc:sqlserver://localhost:1433;databaseName=column_type_test" username = SA password = "Y.sa123456" query = "select * from column_type_test.dbo.full_types_jdbc" # 并行分片读取字段 partition_column = "id" # 分片数量 partition_num = 10 } # 如果你想了解更多关于如何配置 seatunnel 的信息,并查看源插件的完整列表, # 请前往 https://seatunnel.apache.org/docs/connectors/source/Jdbc } transform { # 如果你想了解更多关于如何配置 seatunnel 的信息,并查看转换插件的完整列表, # 请前往 https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} # 如果你想了解更多关于如何配置 seatunnel 的信息,并查看汇插件的完整列表, # 请前往 https://seatunnel.apache.org/docs/connectors/sink/Jdbc } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/StarRocks.md ================================================ import ChangeLog from '../changelog/connector-starrocks.md'; # StarRocks > StarRocks 源连接器 ## 描述 通过`StarRocks`读取外部数据源数据。 `StarRocks`源连接器的内部实现是从`FE`获取查询计划, 将查询计划作为参数传递给`BE`节点,然后从`BE`节点获取数据结果。 ## 主要功能 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [x] 
[支持用户定义拆分](../../introduction/concepts/connector-v2-features.md) ## 配置选项 | 名称 | 类型 | 是否必须 | 默认值 | |-------------------------|-----------|------|-------------------| | nodeUrls | list | 是 | - | | username | string | 是 | - | | password | string | 是 | - | | database | string | 是 | - | | table | string | 否 | - | | scan_filter | string | 否 | - | | schema | config | 是 | - | | table_list | array | 否 | - | | request_tablet_size | int | 否 | Integer.MAX_VALUE | | scan_connect_timeout_ms | int | 否 | 30000 | | scan_query_timeout_sec | int | 否 | 3600 | | scan_keep_alive_min | int | 否 | 10 | | scan_batch_rows | int | 否 | 1024 | | scan_mem_limit | long | 否 | 2147483648 | | max_retries | int | 否 | 3 | | scan.params.* | string | 否 | - | ### nodeUrls [list] `StarRocks` 集群地址配置格式 `["fe_ip:fe_http_port", ...]`。 ### username [string] `StarRocks` 用户名称。 ### password [string] `StarRocks` 用户密码。 ### database [string] `StarRocks` 数据库名。 ### table [string] `StarRocks` 表名。 ### scan_filter [string] 过滤查询的表达式,该表达式透明地传输到`StarRocks` 。`StarRocks` 使用此表达式完成源端数据过滤。 例如 ``` "tinyint_1 = 100" ``` ### schema [config] #### fields [Config] 要生成的`starRocks`的`schema`。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 示例 ``` schema { fields { name = string age = int } } ``` ### table_list [array] `StarRocks` 表名列表,当需要同时读取多表时使用此配置代替 table ### request_tablet_size [int] 与分区对应的`StarRocks tablet`的数量。此值设置得越小,生成的分区就越多。这将增加引擎的平行度,但同时也会给`StarRocks`造成更大的压力。 以下示例,用于解释如何使用`request_tablet_size`来控制分区的生成。 ``` StarRocks 集群中表的 tablet 分布作为 follower be_node_1 tablet[1, 2, 3, 4, 5] be_node_2 tablet[6, 7, 8, 9, 10] be_node_3 tablet[11, 12, 13, 14, 15] 1.如果没有设置 request_tablet_size,则单个分区中的 tablet 数量将没有限制。分区将按以下方式生成: partition[0] 从 be_node_1 读取 tablet 数据:tablet[1, 2, 3, 4, 5] partition[1] 从 be_node_2 读取 tablet 数据:tablet[6, 7, 8, 9, 10] partition[2] 从 be_node_3 读取 tablet 数据:tablet[11, 12, 13, 14, 15] 2.如果设置了 request_tablet_size=3,则每个分区中最多包含 3 个 tablet。分区将按以下方式生成 partition[0] 从 be_node_1 读取 tablet 数据:tablet[1, 2, 3] 
partition[1] 从 be_node_1 读取 tablet 数据:tablet[4, 5] partition[2] 从 be_node_2 读取 tablet 数据:tablet[6, 7, 8] partition[3] 从 be_node_2 读取 tablet 数据:tablet[9, 10] partition[4] 从 be_node_3 读取 tablet 数据:tablet[11, 12, 13] partition[5] 从 be_node_3 读取 tablet 数据:tablet[14,15] ``` ### scan_connect_timeout_ms [int] 发送到 `StarRocks` 的请求连接超时。 ### scan_query_timeout_sec [int] 在 `StarRocks` 中,查询超时时间的默认值为 1 小时,-1 表示没有超时限制。 ### scan_keep_alive_min [int] 查询任务的保持连接时长,单位是分钟,默认值为 10 分钟。我们建议将此参数设置为大于或等于 5 的值。 ### scan_batch_rows [int] 一次从 `BE` 节点读取的最大数据行数。增加此值可以减少引擎与 `StarRocks` 之间建立的连接数量,从而减轻由网络延迟引起的开销。 ### scan_mem_limit [long] 单个查询在 BE 节点上允许的最大内存空间,单位为字节,默认值为 2147483648 字节(即 2 GB)。 ### max_retries [int] 发送到 `StarRocks` 的重试请求次数。 ### scan.params. [string] 从 `BE` 节点扫描数据相关的参数。 ## 示例 1 ``` source { StarRocks { nodeUrls = ["starrocks_e2e:8030"] username = root password = "" database = "test" table = "e2e_table_source" scan_batch_rows = 10 max_retries = 3 schema { fields { BIGINT_COL = BIGINT LARGEINT_COL = STRING SMALLINT_COL = SMALLINT TINYINT_COL = TINYINT BOOLEAN_COL = BOOLEAN DECIMAL_COL = "DECIMAL(20, 1)" DOUBLE_COL = DOUBLE FLOAT_COL = FLOAT INT_COL = INT CHAR_COL = STRING VARCHAR_11_COL = STRING STRING_COL = STRING DATETIME_COL = TIMESTAMP DATE_COL = DATE } } scan.params.scanner_thread_pool_thread_num = "3" } } ``` ## 示例 2: 读取多表 ``` source { StarRocks { nodeUrls = ["starrocks_e2e:8030"] username = root password = "" database = "test" table_list = [ { table = "e2e_table_source" schema = { fields { BIGINT_COL = BIGINT LARGEINT_COL = STRING SMALLINT_COL = SMALLINT TINYINT_COL = TINYINT BOOLEAN_COL = BOOLEAN DECIMAL_COL = "DECIMAL(20, 1)" DOUBLE_COL = DOUBLE FLOAT_COL = FLOAT INT_COL = INT CHAR_COL = STRING VARCHAR_11_COL = STRING STRING_COL = STRING DATETIME_COL = TIMESTAMP DATE_COL = DATE } } }, { table = "e2e_table_source_2" schema = { fields { BIGINT_COL_2 = BIGINT LARGEINT_COL_2 = STRING SMALLINT_COL_2 = SMALLINT TINYINT_COL_2 = TINYINT BOOLEAN_COL_2 = BOOLEAN DECIMAL_COL_2 = 
"DECIMAL(20, 1)" DOUBLE_COL_2 = DOUBLE FLOAT_COL_2 = FLOAT INT_COL_2 = INT CHAR_COL_2 = STRING VARCHAR_11_COL_2 = STRING STRING_COL_2 = STRING DATETIME_COL_2 = TIMESTAMP DATE_COL_2 = DATE } } }] scan_batch_rows = 10 max_retries = 3 scan.params.scanner_thread_pool_thread_num = "3" } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/TDengine.md ================================================ import ChangeLog from '../changelog/connector-tdengine.md'; # TDengine > TDengine 源端连接器 ## 描述 通过 TDengine 读取外部数据源的数据。 ## 主要特性 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流式](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) 支持查询 SQL,并可实现投影效果。 - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义分片](../../introduction/concepts/connector-v2-features.md) ## 配置项 | 名称 | 类型 | 必填 | 默认值 | |----------------|--------|------|----------------| | url | string | 是 | - | | username | string | 是 | - | | password | string | 是 | - | | database | string | 是 | | | stable | string | 是 | - | | sub_tables | list | 否 | - | | lower_bound | long | 是 | - | | upper_bound | long | 是 | - | | read_columns | list | 否 | - | ### url [string] 选择 TDengine 时的连接 URL 例如: ``` jdbc:TAOS-RS://localhost:6041/ ``` ### username [string] 选择 TDengine 时的用户名 ### password [string] 选择 TDengine 时的密码 ### database [string] 选择 TDengine 时的数据库名 ### stable [string] 选择 TDengine 时的超级表名 ### sub_tables [list] TDengine 的子表名。如果不指定,则会选择所有子表;如果指定,则只选择指定的子表。 ### lower_bound [long] 迁移时间段的下界 ### upper_bound [long] 迁移时间段的上界 ### read_columns [list] 选择 TDengine 时的列名。如果不指定,则选择所有字段;如果指定,则只选择指定的字段。读取超级表时,请包含TAGS 字段,并放在末尾。 ## 示例 ### source 配置示例 ```hocon source { TDengine { url : "jdbc:TAOS-RS://localhost:6041/" username : "root" password : "taosdata" database : "power" stable : "meters" sub_tables : 
["meter_1","meter_2"] lower_bound : "2018-10-03 14:38:05.000" upper_bound : "2018-10-03 14:38:16.800" plugin_output = "tdengine_result" read_columns : ["ts","voltage","current","power"] } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Tablestore.md ================================================ import ChangeLog from '../changelog/connector-tablestore.md'; # Tablestore > Tablestore 源连接器 ## 描述 从阿里云 Tablestore 读取数据,支持全量和 CDC。 ## 关键特性 - [ ] [批](../../introduction/concepts/connector-v2-features.md) - [X] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |-------------------|--------|----|-----|---------------------------------------------------------------------------| | end_point | string | 是 | - | Tablestore 的端点 | | instance_name | string | 是 | - | Tablestore 的实例名称 | | access_key_id | string | 是 | - | Tablestore 的访问 ID | | access_key_secret | string | 是 | - | Tablestore 的访问密钥 | | table | string | 是 | - | Tablestore 的表名 | | primary_keys | array | 是 | - | 表的主键,只需添加一个唯一的主键 | | schema | config | 是 | - | 数据的结构。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 | ### end_point [string] Tablestore 的端点。 ### instance_name [string] Tablestore 的实例名称。 ### access_key_id [string] Tablestore 的访问 ID。 ### access_key_secret [string] Tablestore 的访问密钥。 ### table [string] Tablestore 的表名。 ### primary_keys [array] 表的主键,只需添加一个唯一的主键。 ### schema [Config] 数据的结构。更多详情请参考 [Schema 特性](../../introduction/concepts/schema-feature.md)。 ## 示例 ```bash env { parallelism = 1 job.mode = "STREAMING" } source { # 这是一个示例源插件 **仅用于测试和演示源插件功能** Tablestore { end_point = "https://****.cn-zhangjiakou.tablestore.aliyuncs.com" instance_name 
= "****" access_key_id="***************2Ag5" access_key_secret="***********2Dok" table="test" primary_keys=["id"] schema={ fields { id = string name = string } } } } sink { MongoDB{ uri = "mongodb://localhost:27017" database = "test" collection = "test" primary-key = ["id"] schema = { fields { id = string name = string } } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/TiDB-CDC.md ================================================ import ChangeLog from '../changelog/connector-cdc-tidb.md'; # TiDB CDC > TiDB CDC模式的连接器 ## 支持的引擎 > SeaTunnel Zeta
    > Flink
    ## 主要功能 - [ ] [批处理](../../introduction/concepts/connector-v2-features.md) - [x] [流处理](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [column projection](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户定义的拆分](../../introduction/concepts/connector-v2-features.md) ## 描述 TiDB-CDC连接器允许从 TiDB 数据库读取快照数据和增量数据。本文将介绍如何设置 TiDB-CDC 连接器,在 TiDB 数据库中对数据进行快照和捕获流事件。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动 | Maven | |------------------|------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------|----------------------------------------------------------------------| | MySQL |
  • [MySQL](https://dev.mysql.com/doc): 5.5, 5.6, 5.7, 8.0.x
  • [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x
  • | com.mysql.cj.jdbc.Driver | https://mvnrepository.com/artifact/mysql/mysql-connector-java/8.0.28 | | tikv-client-java | 3.2.0 | - | https://mvnrepository.com/artifact/org.tikv/tikv-client-java/3.2.0 | ## Using Dependency ### 安装驱动 #### 在 Flink 引擎下 > 1. 你需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 和 [tikv-client-java jar 包](https://mvnrepository.com/artifact/org.tikv/tikv-client-java/3.2.0) 已经放在目录 `${SEATUNNEL_HOME}/plugins/`。 #### 在 SeaTunnel Zeta 引擎下 > 1. 你需要确保 [jdbc 驱动 jar 包](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 和 [tikv-client-java jar 包](https://mvnrepository.com/artifact/org.tikv/tikv-client-java/3.2.0) 已经放在目录 `${SEATUNNEL_HOME}/lib/` 。 请下载 Mysql 驱动和 tikv-java-client 并将其放在 `${SEATUNNEL_HOME}/lib/` 目录中。例如: ```bash cp mysql-connector-java-xxx.jar ${SEATUNNEL_HOME}/lib/ ``` ## 数据类型映射 | Mysql 数据类型 | SeaTunnel 数据类型 | |------------------------------------------------------------------------------------------------|----------------| | BIT(1)
    TINYINT(1) | BOOLEAN | | TINYINT | TINYINT | | TINYINT UNSIGNED
    SMALLINT | SMALLINT | | SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | | INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | BIGINT | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(p, s)
    DECIMAL(p, s) UNSIGNED
    NUMERIC(p, s)
    NUMERIC(p, s) UNSIGNED | DECIMAL(p,s) | | FLOAT
    FLOAT UNSIGNED | FLOAT | | DOUBLE
    DOUBLE UNSIGNED
    REAL
    REAL UNSIGNED | DOUBLE | | CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    ENUM
    JSON
    ENUM | STRING | | DATE | DATE | | TIME(s) | TIME(s) | | DATETIME
    TIMESTAMP(s) | TIMESTAMP(s) | | BINARY
    VARBINARY
    BIT(p)
    TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    GEOMETRY | BYTES | ## 源选项 | 名称 | 类型 | 必需 | 默认 | 描述 | |-------------------------|---------|----|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | url | String | 是 | - | JDBC 连接的 URL,例如:`jdbc:mysql://tidb0:4000/inventory`。 | | username | String | 是 | - | 连接数据库服务器时使用的用户名。 | | password | String | 是 | - | 连接数据库服务器时使用的密码。 | | pd-addresses | String | 是 | - | TiKV 集群的 PD 地址。 | | database-name | String | 是 | - | 要监控的数据库名称。 | | table-name | String | 是 | - | 要监控的表名称。表名称需要包含数据库名称。 | | startup.mode | Enum | 否 | INITIAL | TiDB CDC 消费器的可选启动模式,可选值有 `initial`、`earliest`、`latest` 和 `specific`。
    `initial`:启动时同步历史数据,然后同步增量数据。
    `earliest`:从最早的可用偏移量开始启动。
    `latest`:从最新的偏移量开始启动。
    `specific`:从用户提供的特定偏移量开始启动。 | | batch-size-per-scan | Int | 否 | 1000 | 每次扫描的大小。 | | tikv.grpc.timeout_in_ms | Long | 否 | - | TiKV GRPC 超时时间(毫秒)。 | | tikv.grpc.scan_timeout_in_ms | Long | 否 | - | TiKV GRPC 扫描超时时间(毫秒)。 | | tikv.batch_get_concurrency | Integer | 否 | - | TiKV GRPC 批量获取并发度。 | | tikv.batch_scan_concurrency | Integer | 否 | - | TiKV GRPC 批量扫描并发度。 | ## 任务示例 ### 简单示例 ``` env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { TiDB-CDC { plugin_output = "products_tidb_cdc" url = "jdbc:mysql://tidb0:4000/inventory" driver = "com.mysql.cj.jdbc.Driver" tikv.grpc.timeout_in_ms = 20000 pd-addresses = "pd0:2379" username = "root" password = "" database-name = "inventory" table-name = "products" } } transform { } sink { jdbc { plugin_input = "products_tidb_cdc" url = "jdbc:mysql://tidb0:4000/inventory" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "" database = "inventory" table = "products_sink" generate_sink_sql = true primary_keys = ["id"] } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Typesense.md ================================================ import ChangeLog from '../changelog/connector-typesense.md'; # Typesense > Typesense 源连接器 ## 描述 从 Typesense 读取数据。 ## 主要功能 - [x] [批处理](../../introduction/concepts/connector-v2-features.md) - [ ] [流处理](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [Schema](../../introduction/concepts/connector-v2-features.md) - [x] [并行度](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户定义的拆分](../../introduction/concepts/connector-v2-features.md) ## 选项 | 名称 | 类型 | 必填 | 默认值 | |------------|--------|----|-----| | hosts | array | 是 | - | | collection | string | 是 | - | | schema | config | 是 | - | | api_key | string | 否 | - | | query | string | 否 | - | | batch_size | int | 否 | 100 | ### hosts [array] Typesense的访问地址,格式为 `host:port`,例如:["typesense-01:8108"] 
### collection [string] 要读取的集合名,例如:“seatunnel” ### schema [config] typesense 需要读取的列。有关更多信息,请参阅:[guide](../../introduction/concepts/schema-feature.md#how-to-declare-type-supported)。 ### api_key [string] typesense 安全认证的 api_key。 ### batch_size 读取数据时,每批次查询数量 ### 常用选项 Source 插件常用参数,具体请参考 [Source 常用选项](../common-options/source-common-options.md) ## 示例 ```bash source { Typesense { hosts = ["localhost:8108"] collection = "companies" api_key = "xyz" query = "q=*&filter_by=num_employees:>9000" schema = { fields { company_name_list = array company_name = string num_employees = long country = string id = string c_row = { c_int = int c_string = string c_array_int = array } } } } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Vertica.md ================================================ import ChangeLog from '../changelog/connector-jdbc.md'; # Vertica > JDBC Vertica 源连接器 ## 描述 通过 JDBC 读取外部数据源数据。 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 使用依赖 ### 对于 Spark/Flink 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://www.vertica.com/download/vertica/client-drivers/) 已放置在目录 `${SEATUNNEL_HOME}/plugins/` 中。 ### 对于 SeaTunnel Zeta 引擎 > 1. 您需要确保 [jdbc 驱动程序 jar 包](https://www.vertica.com/download/vertica/client-drivers/) 已放置在目录 `${SEATUNNEL_HOME}/lib/` 中。 ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [ ] [流](../../introduction/concepts/connector-v2-features.md) - [x] [精确一次](../../introduction/concepts/connector-v2-features.md) - [x] [列投影](../../introduction/concepts/connector-v2-features.md) - [x] [并行性](../../introduction/concepts/connector-v2-features.md) - [x] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) > 支持查询 SQL 并可以实现投影效果。 ## 支持的数据源信息 | 数据源 | 支持的版本 | 驱动 | 连接串 | Maven | |--------|-----------|------|--------|-------| | Vertica | 不同的依赖版本有不同的驱动类 | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433/vertica | [下载](https://www.vertica.com/download/vertica/client-drivers/) | ## 数据类型映射 | Vertica 数据类型 | SeaTunnel 数据类型 | |-----------------|------------------| | BIT | BOOLEAN | | TINYINT
    TINYINT UNSIGNED
    SMALLINT
    SMALLINT UNSIGNED
    MEDIUMINT
    MEDIUMINT UNSIGNED
    INT
    INTEGER
    YEAR | INT | | INT UNSIGNED
    INTEGER UNSIGNED
    BIGINT | LONG | | BIGINT UNSIGNED | DECIMAL(20,0) | | DECIMAL(x,y)(<38) | DECIMAL(x,y) | | DECIMAL(x,y)(>38) | DECIMAL(38,18) | | DECIMAL UNSIGNED | DECIMAL | | FLOAT
    FLOAT UNSIGNED | FLOAT | | DOUBLE
    DOUBLE UNSIGNED | DOUBLE | | CHAR
    VARCHAR
    TINYTEXT
    MEDIUMTEXT
    TEXT
    LONGTEXT
    JSON | STRING | | DATE | DATE | | TIME | TIME | | DATETIME
    TIMESTAMP | TIMESTAMP | | TINYBLOB
    MEDIUMBLOB
    BLOB
    LONGBLOB
    BINARY
    VARBINARY
    BIT(n) | BYTES | | GEOMETRY
    UNKNOWN | 暂不支持 | ## 源选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |--------|------|------|--------|------| | url | String | 是 | - | JDBC 连接的 URL。参考示例:jdbc:vertica://localhost:5433/vertica | | driver | String | 是 | - | 用于连接到远程数据源的 jdbc 类名,如果您使用 Vertica,值为 `com.vertica.jdbc.Driver`。 | | username | String | 否 | - | 连接实例用户名 | | password | String | 否 | - | 连接实例密码 | | query | String | 是 | - | 查询语句 | | connection_check_timeout_sec | Int | 否 | 30 | 等待用于验证连接的数据库操作完成的时间(秒) | | partition_column | String | 否 | - | 用于并行性分割的列名,仅支持数值类型,仅支持数值类型主键,只能配置一列。 | | partition_lower_bound | BigDecimal | 否 | - | partition_column 的最小值用于扫描,如果未设置,SeaTunnel 将查询数据库获取最小值。 | | partition_upper_bound | BigDecimal | 否 | - | partition_column 的最大值用于扫描,如果未设置,SeaTunnel 将查询数据库获取最大值。 | | partition_num | Int | 否 | job parallelism | 分割数量,仅支持正整数。默认值是任务并行度。 | | fetch_size | Int | 否 | 0 | 对于返回大量对象的查询,您可以配置查询中使用的行提取大小,以通过减少满足选择条件所需的数据库命中次数来提高性能。零表示使用 jdbc 默认值。 | | properties | Map | 否 | - | 其他连接配置参数,当 properties 和 URL 具有相同参数时,优先级由驱动程序的具体实现确定。例如,在 MySQL 中,properties 优先于 URL。 | | common-options | | 否 | - | 源插件通用参数,请参考 [源通用选项](../common-options/source-common-options.md) 详见。 | ### 提示 > 如果未设置 partition_column,它将以单并发运行,如果设置了 partition_column,它将根据任务的并发度并行执行。 ## 任务示例 ### 简单 > 此示例在单个并行中查询您的测试"数据库"中的 type_bin 表的 16 条数据,并查询其所有字段。您也可以指定要查询的字段以最终输出到控制台。 ``` # 定义运行时环境 env { parallelism = 2 job.mode = "BATCH" } source{ Jdbc { url = "jdbc:vertica://localhost:5433/vertica" driver = "com.vertica.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" query = "select * from type_bin limit 16" } } transform { # 如果您想了解有关如何配置 seatunnel 的更多信息并查看完整的转换插件列表, # 请访问 https://seatunnel.apache.org/docs/transforms/sql } sink { Console {} } ``` ### 并行 > 使用您配置的分片字段和分片数据并行读取查询表。如果您想读取整个表,可以这样做 ``` source { Jdbc { url = "jdbc:vertica://localhost:5433/vertica" driver = "com.vertica.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" # 根据需要定义查询逻辑 query = "select * from type_bin" # 并行分片读取字段 
partition_column = "id" # 分片数量 partition_num = 10 } } ``` ### 并行边界 > 指定查询的上下边界内的数据更高效。根据您配置的上下边界读取数据源更高效 ``` source { Jdbc { url = "jdbc:vertica://localhost:5433/vertica" driver = "com.vertica.jdbc.Driver" connection_check_timeout_sec = 100 username = "root" password = "123456" # 根据需要定义查询逻辑 query = "select * from type_bin" partition_column = "id" # 读取开始边界 partition_lower_bound = 1 # 读取结束边界 partition_upper_bound = 500 partition_num = 10 } } ``` ## 变更日志 ================================================ FILE: docs/zh/connectors/source/Web3j.md ================================================ import ChangeLog from '../changelog/connector-web3j.md'; # Web3j > Web3j 源连接器 ## 支持这些引擎 > Spark
    > Flink
    > SeaTunnel Zeta
    ## 关键特性 - [x] [批](../../introduction/concepts/connector-v2-features.md) - [x] [流](../../introduction/concepts/connector-v2-features.md) - [ ] [精确一次](../../introduction/concepts/connector-v2-features.md) - [ ] [列投影](../../introduction/concepts/connector-v2-features.md) - [ ] [并行性](../../introduction/concepts/connector-v2-features.md) - [ ] [支持用户自定义split](../../introduction/concepts/connector-v2-features.md) ## 描述 Web3j 的源连接器。用于从区块链读取数据,例如区块信息、交易、智能合约事件等。目前支持读取区块高度数据。 ## 源选项 | 参数名 | 类型 | 必须 | 默认值 | 描述 | |--------|------|------|--------|------| | url | String | 是 | - | 使用 Infura 作为服务提供商时,URL 用于与以太坊网络通信。 | ## 如何创建 Http 数据同步作业 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { Web3j { url = "https://mainnet.infura.io/v3/xxxxx" } } # 控制台打印读取的 Http 数据 sink { Console { parallelism = 1 } } ``` 然后您将获得以下数据: ```json {"blockNumber":19525949,"timestamp":"2024-03-27T13:28:45.605Z"} ``` ## 变更日志 ================================================ FILE: docs/zh/developer/coding-guide.md ================================================ # 编码指南 本指南整体介绍了当前 Apache SeaTunnel 的模块和提交一个高质量 pull request 的最佳实践。 ## 模块概述 | 模块名 | 介绍 | |----------------------------------------|--------------------------------------------------------------------| | seatunnel-api | SeaTunnel connector V2 API 模块 | | seatunnel-common | SeaTunnel 通用模块 | | seatunnel-connectors-v2 | SeaTunnel connector V2 模块, connector V2 处于社区重点开发中 | | seatunnel-core/seatunnel-spark-starter | SeaTunnel connector V2 的 Spark 引擎核心启动模块 | | seatunnel-core/seatunnel-flink-starter | SeaTunnel connector V2 的 Flink 引擎核心启动模块 | | seatunnel-core/seatunnel-starter | SeaTunnel connector V2 的 SeaTunnel 引擎核心启动模块 | | seatunnel-e2e | SeaTunnel 端到端测试模块 | | seatunnel-examples | SeaTunnel 本地案例模块, 开发者可以用来单元测试和集成测试 | | seatunnel-engine | SeaTunnel 引擎模块, seatunnel-engine 是 SeaTunnel 社区新开发的计算引擎,用来实现数据同步 | | seatunnel-formats | SeaTunnel 格式化模块,用来提供格式化数据的能力 | | seatunnel-plugin-discovery | SeaTunnel 插件发现模块,用来加载类路径中的SPI插件 | | seatunnel-transforms-v2 | 
SeaTunnel transform V2 模块, transform V2 处于社区重点开发中 | | seatunnel-translation | SeaTunnel translation 模块, 用来适配Connector V2 和其他计算引擎, 例如Spark、Flink等 | ## 如何提交一个高质量的Pull Request 1. 创建实体类的时候使用 `lombok` 插件的注解(`@Data` `@Getter` `@Setter` `@NonNull` 等)来减少代码量。在编码过程中优先使用 lombok 插件是一个很好的习惯。 2. 如果你需要在类中使用 log4j 打印日志, 优先使用 `lombok` 中的 `@Slf4j` 注解。 3. SeaTunnel 使用 Github issue 来跟踪代码问题,包括 bugs 和 改进, 并且使用 Github pull request 来管理代码的审查和合并。所以创建一个清晰的 issue 或者 pull request 能让社区更好的理解开发者的意图,最佳实践如下: > [目的] [模块名称] [子模块名称] 描述 1. Pull request 目的包含: `Hotfix`, `Feature`, `Improve`, `Docs`, `WIP`。 请注意如果选择 `WIP`, 你需要使用 github 的 draft pull request。 2. Issue 目的包含: `Feature`, `Bug`, `Docs`, `Discuss`。 3. 模块名称: 当前 pull request 或 issue 所涉及的模块名称, 例如: `Core`, `Connector-V2`, `Connector-V1`等。 4. 子模块名称: 当前 pull request 或 issue 所涉及的子模块名称, 例如:`File` `Redis` `Hbase`等。 5. 描述: 高度概括下当前 pull request 和 issue 要做的事情,尽量见名知意。 提示:**更多内容, 可以参考 [Issue Guide](https://seatunnel.apache.org/community/contribution_guide/contribute#issue) 和 [Pull Request Guide](https://seatunnel.apache.org/community/contribution_guide/contribute#pull-request)** 4. 代码片段不要重复。 如果一段代码被使用多次,定义多次不是好的选择,最佳实践是把它公共独立出来让其他模块使用。 5. 当抛出一个异常时, 需要一起带上提示信息并且使异常的范围尽可能地小。抛出过于广泛的异常会让错误处理变得复杂并且容易包含安全问题。例如,如果你的 connector 在读数据的时候遇到 `IOException`, 合理的做法如下: ```java try { // read logic } catch (IOException e) { throw SeaTunnelORCFormatException("This orc file is corrupted, please check it", e); } ``` 6. Apache 项目的 license 要求很严格, 每个 Apache 项目文件都应该包含一个 license 声明。 在提交 pull request 之前请检查每个新文件都包含 `Apache License Header`。 ```java /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ ``` 7. Apache SeaTunnel 使用 `Spotless` 管理代码风格和格式检查。你可以使用下面的命令来自动修复代码风格问题和格式。 ```shell ./mvnw spotless:apply ``` 8. 提交 pull request 之前,确保修改后项目编译正常,使用下面命令打包整个项目: ```shell # 多线程编译 ./mvnw -T 1C clean package ``` ```shell # 单线程编译 ./mvnw clean package ``` 9. 提交 pull request 之前,在本地用完整的单元测试和集成测试来检查你的功能性是否正确,最佳实践是用 `seatunnel-examples` 模块的例子去检查多引擎是否正确运行并且结果正确。 10. 如果提交的 pull request 是一个新的特性, 请记得更新文档。 11. 提交 connector 相关的 pull request, 可以通过写 e2e 测试保证鲁棒性,e2e 测试需要包含所有的数据类型,并且初始化尽可能小的 docker 镜像,sink 和 source 的测试用例可以写在一起减少资源的损耗。 可以参考这个不错的例子: [MongodbIT.java](https://github.com/apache/seatunnel/blob/dev/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/java/org/apache/seatunnel/e2e/connector/v2/mongodb/MongodbIT.java) 12. 类中默认的权限需要使用 `private`, 不可修改的需要设置 `final`, 特殊场景除外。 13. 类中的属性和方法参数倾向于使用基本数据类型(int boolean double float...), 而不是包装类型(Integer Boolean Double Float...), 特殊情况除外。 14. 开发一个 sink connector 的时候你需要知道 sink 需要被序列化,如果有不能被序列化的属性, 需要包装到一个类中,并且使用单例模式。 15. 如果代码中有多个 `if` 流程判断, 尽量简化为多个 if 而不是 if-else-if。 16. Pull request 具有单一职责的特点, 不允许在 pull request 包含与该功能无关的代码, 如果有这种情况, 需要在提交 pull request 之前单独处理好, 否则 Apache SeaTunnel 社区会主动关闭 pull request。 17. 贡献者需要对自己的 pull request 负责。 如果 pull request 包含新的特性, 或者修改了老的特性,增加测试用例或者 e2e 用例来证明合理性和保护完整性是一个很好的做法。 18. 
如果你认为社区当前某部分代码不合理(尤其是核心的 `core` 和 `api` 模块),有函数需要更新修改,优先使用 `discuss issue` 和 `email` 与社区讨论是否有必要修改,社区同意后再提交 pull request, 请不要不经讨论直接提交 pull request, 社区会认为无效并且关闭。 ================================================ FILE: docs/zh/developer/contribute-plugin.md ================================================ # 贡献 Connector-V2 插件 如果你想要贡献 Connector-V2, 可以参考下面的 Connector-V2 贡献指南。 可以帮你快速进入开发。 [Connector-v2 贡献指南](https://github.com/apache/seatunnel/blob/dev/seatunnel-connectors-v2/README.zh.md) ================================================ FILE: docs/zh/developer/contribute-transform-v2-guide.md ================================================ # 贡献 Transform-V2 插件 如果你想要贡献 Transform-V2, 可以参考下面的 Transform-V2 贡献指南。 可以帮你快速进入开发。 [Transform-v2 贡献指南](https://github.com/apache/seatunnel/blob/dev/seatunnel-transforms-v2/README.zh.md) ================================================ FILE: docs/zh/developer/docs-format-specification.md ================================================ # 文档格式规范 ## 注释说明 注释说明在技术文档中起强调作用。在使用中,需遵循以下规范: - 根据提示内容,可以将注释分为“提示”、“备注”、“注意”三类。注释框标题与使用场景请遵循以下规范: - 提示:主要用于操作技巧提示 - 备注:用于对内容进行补充解释 - 注意:用于操作、注意事项警告 - 提示框内容可以使用有序、无序、代码块 下面是 Markdown 文档中注释说明示例: ```Markdown :::tip 提示 这是一条提示 ::: :::info 备注 这是一条备注 ::: :::caution 注意 这是一条注意事项 ::: ``` ================================================ FILE: docs/zh/developer/how-to-create-your-connector.md ================================================ # 开发自己的Connector 如果你想针对SeaTunnel新的连接器API开发自己的连接器(Connector V2),请查看[这里](https://github.com/apache/seatunnel/blob/dev/seatunnel-connectors-v2/README.zh.md) 。 ## 架构文档参考 如需了解 SeaTunnel 的 API 设计和引擎架构的详细信息,请参阅: - [架构概览](../architecture/overview.md) - 整体架构和设计原则 - [数据源架构](../architecture/api-design/source-architecture.md) - Source API 设计深入剖析 - [数据汇架构](../architecture/api-design/sink-architecture.md) - Sink API 设计深入剖析 - [转换层](../architecture/api-design/translation-layer.md) - 连接器如何在不同引擎上工作 - [检查点机制](../architecture/fault-tolerance/checkpoint-mechanism.md) - 容错和状态管理 这些文档将帮助你理解 SeaTunnel
连接器中使用的底层架构和设计模式。 ================================================ FILE: docs/zh/developer/new-license.md ================================================ # 如何添加新的 License ### ASF 第三方许可政策 如果您打算向SeaTunnel(或其他Apache项目)添加新功能,并且该功能涉及到其他开源软件引用的时候,请注意目前 Apache 项目支持遵从以下协议的开源软件。 [ASF 第三方许可政策](https://apache.org/legal/resolved.html) 如果您所使用的第三方软件并不在以上协议之中,那么很抱歉,您的代码将无法通过审核,建议您找寻其他替代方案。 ### 如何在 SeaTunnel 中合法使用第三方开源软件 当我们想要引入一个新的第三方软件(包含但不限于第三方的 jar、文本、CSS、js、图片、图标、音视频等及在第三方基础上做的修改)至我们的项目中的时候,除了他们所遵从的协议是 Apache 允许的,另外一点很重要,就是合法的使用。您可以参考以下文章 * [COMMUNITY-LED DEVELOPMENT "THE APACHE WAY"](https://apache.org/dev/licensing-howto.html) 举个例子,当我们使用了 ZooKeeper,那么我们项目就必须包含 ZooKeeper 的 NOTICE 文件(每个开源项目都会有 NOTICE 文件,一般位于根目录),用Apache的话来讲,就是 "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work. 关于具体的各个开源协议使用协议,在此不做过多篇幅一一介绍,有兴趣可以自行查询了解。 ### SeaTunnel-License 检测规则 通常情况下, 我们会为项目添加 License-check 脚本。 跟其他开源项目略有不同,SeaTunnel 使用 [SkyWalking](https://github.com/apache/skywalking) 提供的 SeaTunnel-License-Check。 总之,我们试图第一时间避免 License 问题。 当我们需要添加新的 jar 包或者使用外部资源时, 我们需要按照以下步骤进行操作: * 在 known-dependencies.txt 文件中添加 jar 的名称和版本 * 在 'seatunnel-dist/release-docs/LICENSE' 目录下添加相关 maven 仓库地址 * 在 'seatunnel-dist/release-docs/NOTICE' 目录下添加相关的 NOTICE 文件, 并确保他们跟原来的仓库中的文件没有区别 * 在 'seatunnel-dist/release-docs/licenses' 目录下添加相关源码协议文件, 并且文件命令遵守 license-filename.txt 规则。 例:license-zk.txt * 检查依赖的 license 是否出错 ``` --- /dev/fd/63 2020-12-03 03:08:57.191579482 +0000 +++ /dev/fd/62 2020-12-03 03:08:57.191579482 +0000 @@ -1,0 +2 @@ +HikariCP-java6-2.3.13.jar @@ -16,0 +18 @@ +c3p0-0.9.5.2.jar @@ -149,0 +152 @@ +mchange-commons-java-0.2.11.jar - commons-lang-2.1.3.jar Error: Process completed with exit code 1. 
``` 一般来说,添加一个 jar 的工作通常不是很容易,因为 jar 通常依赖其他各种 jar, 我们还需要为这些 jar 添加相应的许可证。 在这种情况下, 我们会收到检查 license 失败的错误信息。像上面的例子,我们缺少 `HikariCP-java6-2.3.13`, `c3p0` 等的 license 声明(`+` 表示新添加,`-` 表示需要删除), 按照步骤添加 jar。 ### 参考 * [COMMUNITY-LED DEVELOPMENT "THE APACHE WAY"](https://apache.org/dev/licensing-howto.html) * [ASF 第三方许可政策](https://apache.org/legal/resolved.html) ================================================ FILE: docs/zh/developer/setup.md ================================================ # 搭建开发环境 在这个章节, 我们会向你展示如何搭建 SeaTunnel 的开发环境, 然后用 JetBrains IntelliJ IDEA 跑一个简单的示例。 > 你可以用任何你喜欢的开发环境进行开发和测试,我们只是用 [JetBrains IDEA](https://www.jetbrains.com/idea/) > 作为示例来展示如何一步步完成设置。 ## 准备 在设置开发环境之前, 需要做一些准备工作, 确保你安装了以下软件: * 安装 [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git)。 * 安装 [Java](https://www.java.com/en/download/) (目前只支持 JDK8/JDK11) 并且设置 `JAVA_HOME` 环境变量。 * 安装 [Scala](https://www.scala-lang.org/download/2.11.12.html) (目前只支持 scala 2.11.12)。 * 安装 [JetBrains IDEA](https://www.jetbrains.com/idea/)。 ## 设置 ### 克隆源码 首先使用以下命令从 [GitHub](https://github.com/apache/seatunnel) 克隆 SeaTunnel 源代码。 ```shell git clone git@github.com:apache/seatunnel.git ``` ### 本地安装子项目 在克隆好源代码以后, 运行 `./mvnw` 命令安装子项目到 maven 本地仓库目录。 否则你的代码无法在 IDEA 中正常启动。 ```shell ./mvnw clean install -DskipTests ``` ### 源码编译 在安装 maven 以后, 可以使用下面命令进行编译和打包。 ``` mvn clean package -pl seatunnel-dist -am -Dmaven.test.skip=true ``` ### 编译子模块 如果要单独编译子模块, 可以使用下面的命令进行编译和打包。 ```ssh # 这是一个单独构建 redis connector 的示例 mvn clean package -pl seatunnel-connectors-v2/connector-redis -am -DskipTests -T 1C ``` ### 安装 JetBrains IDEA Scala 插件 用 JetBrains IntelliJ IDEA 打开你的源码,如果有 Scala 的代码,则需要安装 JetBrains IntelliJ IDEA's [Scala plugin](https://plugins.jetbrains.com/plugin/1347-scala)。 可以参考 [install plugins for IDEA](https://www.jetbrains.com/help/idea/managing-plugins.html#install-plugins) 。 ### 安装 JetBrains IDEA Lombok 插件 在运行示例之前, 安装 JetBrains IntelliJ IDEA 的 [Lombok plugin](https://plugins.jetbrains.com/plugin/6317-lombok)。 可以参考 [install 
plugins for IDEA](https://www.jetbrains.com/help/idea/managing-plugins.html#install-plugins) 。 ### 代码风格 Apache SeaTunnel 使用 `Spotless` 来统一代码风格和格式检查。可以运行下面 `Spotless` 命令自动格式化。 ```shell ./mvnw spotless:apply ``` 拷贝 `pre-commit hook` 文件 `/tools/spotless_check/pre-commit.sh` 到你项目的 `.git/hooks/` 目录, 这样每次你使用 `git commit` 提交代码的时候会自动调用 `Spotless` 修复格式问题。 ## 运行一个简单的示例 完成上面所有的工作后,环境搭建已经完成, 可以直接运行我们的示例了。 所有的示例在 `seatunnel-examples` 模块里, 你可以随意选择进行编译和调试,参考 [running or debugging it in IDEA](https://www.jetbrains.com/help/idea/run-debug-configuration.html)。 我们使用 `seatunnel-examples/seatunnel-engine-examples/src/main/java/org/apache/seatunnel/example/engine/SeaTunnelEngineLocalExample.java` 作为示例, 运行成功后的输出如下: ```log 2024-08-10 11:45:32,839 INFO org.apache.seatunnel.core.starter.seatunnel.command.ClientExecuteCommand - *********************************************** Job Statistic Information *********************************************** Start Time : 2024-08-10 11:45:30 End Time : 2024-08-10 11:45:32 Total Time(s) : 2 Total Read Count : 5 Total Write Count : 5 Total Failed Count : 0 *********************************************** ``` ## 更多信息 所有的实例都用了简单的 source 和 sink, 这样可以使得运行更独立和更简单。 你可以修改 `resources/examples` 中的示例的配置。 例如下面的配置使用 PostgreSQL 作为源,并且输出到控制台。 请注意引用FakeSource 和 Console 以外的连接器时,需要修改seatunnel-example对应子模块下的`pom.xml`文件中的依赖。 ```conf env { parallelism = 1 job.mode = "BATCH" } source { Jdbc { driver = org.postgresql.Driver url = "jdbc:postgresql://host:port/database" user = "postgres" password = "123456" query = "select * from test" table_path = "database.test" } } sink { Console {} } ``` ================================================ FILE: docs/zh/engines/command/connector-check.md ================================================ # 连接器检查命令用法 ## 命令入口 ```shell bin/seatunnel-connector.sh ``` ## 命令选项 ```text Usage: seatunnel-connector.sh [options] Options: -h, --help Show the usage message -l, --list List all supported plugins(sources, sinks, transforms) (default: false) -o, 
--option-rule Get option rule of the plugin by the plugin identifier(connector name or transform name) -pt, --plugin-type SeaTunnel plugin type, support [source, sink, transform] ``` ## 例子 ```shell # List all supported connectors(sources and sinks) and transforms bin/seatunnel-connector.sh -l # List all supported sinks bin/seatunnel-connector.sh -l -pt sink # Get option rule of the connector or transform by the name bin/seatunnel-connector.sh -o Paimon # Get option rule of paimon sink bin/seatunnel-connector.sh -o Paimon -pt sink ``` ================================================ FILE: docs/zh/engines/command/usage.mdx ================================================ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; # 命令使用 ## 命令入口 ```bash bin/start-seatunnel-spark-2-connector-v2.sh ``` ```bash bin/start-seatunnel-spark-3-connector-v2.sh ``` ```bash bin/start-seatunnel-flink-13-connector-v2.sh ``` ```bash bin/start-seatunnel-flink-15-connector-v2.sh ``` ## 选项参数 ```bash 用法: start-seatunnel-spark-2-connector-v2.sh [选项] 选项: --check 是否检查配置 (默认: false) -c, --config 配置文件 -e, --deploy-mode Spark 部署模式,支持 [cluster, client] (默认: client) -h, --help 显示使用说明 -m, --master Spark master,支持 [spark://host:port, mesos://host:port, yarn, k8s://https://host:port, local],默认 local[*] (默认: local[*]) -n, --name SeaTunnel 作业名称 (默认: SeaTunnel) -i, --variable 变量替换,例如 -i city=beijing,或 -i date=20190318 (默认: []) ``` ```bash 用法: start-seatunnel-spark-3-connector-v2.sh [选项] 选项: --check 是否检查配置 (默认: false) -c, --config 配置文件 -e, --deploy-mode Spark 部署模式,支持 [cluster, client] (默认: client) -h, --help 显示使用说明 -m, --master Spark master,支持 [spark://host:port, mesos://host:port, yarn, k8s://https://host:port, local],默认 local[*] (默认: local[*]) -n, --name SeaTunnel 作业名称 (默认: SeaTunnel) -i, --variable 变量替换,例如 -i city=beijing,或 -i date=20190318 (默认: []) ``` ```bash 用法: start-seatunnel-flink-13-connector-v2.sh [选项] 选项: --check 是否检查配置 (默认: false) -c, --config 配置文件 -e, --deploy-mode Flink 
作业部署模式,支持 [run, run-application] (默认: run) -h, --help 显示使用说明 --master, --target Flink 作业提交目标 master,支持 [local, remote, yarn-session, yarn-per-job, kubernetes-session, yarn-application, kubernetes-application] -n, --name SeaTunnel 作业名称 (默认: SeaTunnel) -i, --variable 变量替换,例如 -i city=beijing,或 -i date=20190318 (默认: []) ``` ```bash 用法: start-seatunnel-flink-15-connector-v2.sh [选项] 选项: --check 是否检查配置 (默认: false) -c, --config 配置文件 -e, --deploy-mode Flink 作业部署模式,支持 [run, run-application] (默认: run) -h, --help 显示使用说明 --master, --target Flink 作业提交目标 master,支持 [local, remote, yarn-session, yarn-per-job, kubernetes-session, yarn-application, kubernetes-application] -n, --name SeaTunnel 作业名称 (默认: SeaTunnel) -i, --variable 变量替换,例如 -i city=beijing,或 -i date=20190318 (默认: []) ``` ## 示例 ```bash bin/start-seatunnel-spark-2-connector-v2.sh --config config/v2.batch.config.template -m local -e client ``` ```bash bin/start-seatunnel-spark-3-connector-v2.sh --config config/v2.batch.config.template -m local -e client ``` ```bash bin/start-seatunnel-flink-13-connector-v2.sh --config config/v2.batch.config.template ``` ```bash bin/start-seatunnel-flink-15-connector-v2.sh --config config/v2.batch.config.template ``` ================================================ FILE: docs/zh/engines/event-listener.md ================================================ # 事件监听器 ## 介绍 SeaTunnel提供了丰富的事件监听器功能,用于管理数据同步时的状态。此功能在需要监听任务运行状态时十分重要(`org.apache.seatunnel.api.event`)。本文档将指导您如何使用这些参数并有效地利用他们。 ## 支持的引擎 > SeaTunnel Zeta
    > Flink
    > Spark
    ## API 事件(event)API的定义在 `org.apache.seatunnel.api.event`包中。 ### Event Data API - `org.apache.seatunnel.api.event.Event` - 事件数据的接口。 - `org.apache.seatunnel.api.event.EventType` - 事件数据的枚举值。 #### EventType 枚举说明 `EventType`枚举定义了系统中所有可能的事件类型,主要包括: | 事件类型 | 说明 | 关联事件类 | |--------------------------------|----------|-------------------------------| | `JOB_STATUS` | 作业状态变更事件 | `JobStateEvent` | | `SCHEMA_CHANGE_UPDATE_COLUMNS` | 表结构更新事件 | `AlterTableColumnsEvent` | | `SCHEMA_CHANGE_ADD_COLUMN` | 表添加列事件 | `AlterTableAddColumnEvent` | | `SCHEMA_CHANGE_DROP_COLUMN` | 表删除列事件 | `AlterTableDropColumnEvent` | | `SCHEMA_CHANGE_MODIFY_COLUMN` | 表修改列事件 | `AlterTableModifyColumnEvent` | | `READER_OPEN` | 读取器打开事件 | `ReaderOpenEvent` | | `READER_CLOSE` | 读取器关闭事件 | `ReaderCloseEvent` | | `WRITER_OPEN` | 写入器打开事件 | `WriterOpenEvent` | | `WRITER_CLOSE` | 写入器关闭事件 | `WriterCloseEvent` | > 注意:不同事件类型对应不同的事件数据结构,在自定义事件处理器时需通过`event.getEventType()`进行类型判断,以确保类型安全转换。 ### Event Listener API 您可以自定义事件处理器,例如将事件发送到外部系统。 - `org.apache.seatunnel.api.event.EventHandler` - 事件处理器的接口,SPI将会自动从类路径中加载子类。 ### Event Collect API - `org.apache.seatunnel.api.source.SourceSplitEnumerator` - 在`SourceSplitEnumerator`加载事件监听器。 ```java package org.apache.seatunnel.api.source; public interface SourceSplitEnumerator { interface Context { /** * Get the {@link org.apache.seatunnel.api.event.EventListener} of this enumerator. * * @return */ EventListener getEventListener(); } } ``` - `org.apache.seatunnel.api.source.SourceReader` - 在`SourceReader`加载事件监听器。 ```java package org.apache.seatunnel.api.source; public interface SourceReader { interface Context { /** * Get the {@link org.apache.seatunnel.api.event.EventListener} of this reader. 
* * @return */ EventListener getEventListener(); } } ``` - `org.apache.seatunnel.api.sink.SinkWriter` - 在`SinkWriter`加载事件监听器。 ```java package org.apache.seatunnel.api.sink; public interface SinkWriter { interface Context { /** * Get the {@link org.apache.seatunnel.api.event.EventListener} of this writer. * * @return */ EventListener getEventListener(); } } ``` ## 设置监听器 您需要设置引擎配置以使用事件监听器功能。 ### Zeta 引擎 配置样例(seatunnel.yaml): ``` seatunnel: engine: event-report-http: url: "http://example.com:1024/event/report" headers: Content-Type: application/json ``` ### Flink 引擎 您可以定义 `org.apache.seatunnel.api.event.EventHandler` 接口并添加到类路径,SPI会自动加载。 支持的flink版本: 1.14.0+ 样例: `org.apache.seatunnel.api.event.LoggingEventHandler` ### Spark 引擎 您可以定义 `org.apache.seatunnel.api.event.EventHandler` 接口并添加到类路径,SPI会自动加载。 ## 自定义事件处理器实现步骤 下面以 `JobStateEvent` 为例,介绍如何实现一个自定义事件处理器,您可以根据需要扩展此方法以处理其他类型的事件。 ### 1. 添加依赖 在项目 `pom.xml` 中引入必要依赖: ```xml org.apache.seatunnel seatunnel-api ${seatunnel.version} provided org.apache.seatunnel seatunnel-engine-common ${seatunnel.version} provided ``` > 注意:需将 `${seatunnel.version}` 替换为实际使用的 SeaTunnel 版本。 ### 2. 
实现事件处理器 自定义类实现 `org.apache.seatunnel.api.event.EventHandler` 接口,并重写 `handle` 方法,针对需要处理的事件类型进行业务逻辑处理。 **核心逻辑**:通过 `event.getEventType()` 过滤事件类型——由于 SeaTunnel 引擎会分发多种类型的事件,需显式判断事件类型,以确保仅处理目标事件。 ```java import lombok.extern.slf4j.Slf4j; import org.apache.seatunnel.api.event.Event; import org.apache.seatunnel.api.event.EventHandler; import org.apache.seatunnel.api.event.EventType; import org.apache.seatunnel.engine.common.job.JobStatus; import org.apache.seatunnel.engine.common.job.JobStateEvent; import org.apache.seatunnel.api.event.schema.AlterTableAddColumnEvent; import org.apache.seatunnel.api.event.source.ReaderOpenEvent; import org.apache.seatunnel.api.event.sink.WriterCloseEvent; /** * 自定义多类型事件处理器示例,包含多种事件的处理逻辑 */ @Slf4j public class CustomMultiEventHandler implements EventHandler { @Override public void handle(Event event) { // 根据事件类型进行不同处理 EventType eventType = event.getEventType(); switch (eventType) { case JOB_STATUS: handleJobStateEvent((JobStateEvent) event); break; case SCHEMA_CHANGE_ADD_COLUMN: handleAddColumnEvent((AlterTableAddColumnEvent) event); break; case READER_OPEN: handleReaderOpenEvent((ReaderOpenEvent) event); break; case WRITER_CLOSE: handleWriterCloseEvent((WriterCloseEvent) event); break; // 可根据需要添加其他事件类型的处理 default: // 忽略不处理的事件类型 log.debug("忽略未处理的事件类型: {}", eventType); } } /** * 处理作业状态事件 */ private void handleJobStateEvent(JobStateEvent jobEvent) { String jobId = jobEvent.getJobId(); String jobName = jobEvent.getJobName(); JobStatus status = jobEvent.getJobStatus(); long eventTime = jobEvent.getCreatedTime(); switch (status) { case FAILED: log.error("任务失败 | jobId: {}, jobName: {}, 时间: {}", jobId, jobName, eventTime); // 添加失败告警逻辑 sendAlert("任务失败", "jobId: " + jobId); break; case FINISHED: log.info("任务完成 | jobId: {}, jobName: {}, 时间: {}", jobId, jobName, eventTime); break; // 处理其他状态... 
default: log.info("任务状态变更 | jobId: {}, 状态: {}, 时间: {}", jobId, status, eventTime); } } /** * 处理表添加列事件 */ private void handleAddColumnEvent(AlterTableAddColumnEvent event) { log.info("表添加列 | 表名: {}, 新增列: {}, 时间: {}", event.getTableName(), event.getAddedColumns(), event.getEventTime()); // 处理表结构变更逻辑 } /** * 处理读取器打开事件 */ private void handleReaderOpenEvent(ReaderOpenEvent event) { log.info("读取器打开 | 插件ID: {}, 并行度: {}, 时间: {}", event.getPluginId(), event.getParallelism(), event.getEventTime()); // 处理读取器初始化逻辑 } /** * 处理写入器关闭事件 */ private void handleWriterCloseEvent(WriterCloseEvent event) { log.info("写入器关闭 | 插件ID: {}, 处理记录数: {}, 时间: {}", event.getPluginId(), event.getRecordCount(), event.getEventTime()); // 处理写入器资源清理逻辑 } /** * 发送告警通知 */ private void sendAlert(String title, String content) { // 实现告警逻辑(如调用HTTP接口、发送邮件等) log.info("[告警] {}: {}", title, content); } } ``` ### 3. 配置 SPI 加载 为使引擎自动发现并加载自定义处理器,需在项目资源目录中添加 SPI 配置文件: 1. 创建目录:`src/main/resources/META-INF/services/` 2. 新建文件:`org.apache.seatunnel.api.event.EventHandler` 3. 在文件中添加自定义处理器的全类名: ``` com.example.CustomMultiEventHandler ``` ### 4. 部署与验证 - 将包含自定义处理器的 JAR 包放入 SeaTunnel 引擎的类路径(如 `lib/` 目录) - 启动任务后,当对应事件发生时,处理器会自动触发并执行相应的处理逻辑 - 可通过日志输出验证处理器是否生效 ### 注意事项 - 处理器逻辑应尽量轻量,避免阻塞事件处理线程 - 若需网络调用(如发送告警),建议使用异步方式实现,防止超时影响任务本身 - 不同引擎对事件的支持情况可能不同,例如 `JobStateEvent` 目前仅支持 Zeta 引擎 - 事件类型与事件类是一一对应的,转换时需确保类型匹配,避免 `ClassCastException` - 可以根据业务需求,实现多个事件处理器分别处理不同类型的事件,也可以在一个处理器中处理多种事件类型 通过上述步骤,您可以灵活地监听和处理 SeaTunnel 中的各种事件,实现自定义的业务逻辑,如状态监控、告警通知、数据统计等功能。 ================================================ FILE: docs/zh/engines/flink.md ================================================ # Flink引擎方式运行SeaTunnel Flink是一个强大的高性能分布式流处理引擎。你可以搜索 `Apache Flink`获取更多关于它的信息。 ### 在Job中设置Flink的配置信息 以 `flink.` 开始: 例子: 我对这个项目设置一个精确的检查点 ``` env { parallelism = 1 flink.execution.checkpointing.unaligned.enabled=true } ``` 枚举类型当前还不支持,你需要在Flink的配置文件中指定它们。暂时只有这些类型的设置受支持:
    Integer/Boolean/String/Duration ### 如何设置一个简单的Flink Job 这是一个运行在Flink中随机生成数据打印到控制台的简单job ``` env { # 公共参数 parallelism = 1 checkpoint.interval = 5000 # flink特殊参数 flink.execution.checkpointing.mode = "EXACTLY_ONCE" flink.execution.checkpointing.timeout = 600000 } source { FakeSource { row.num = 16 plugin_output = "fake_table" schema = { fields { c_map = "map<string, string>" c_array = "array<int>" c_string = string c_boolean = boolean c_int = int c_bigint = bigint c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(33, 18)" c_timestamp = timestamp c_row = { c_map = "map<string, string>" c_array = "array<int>" c_string = string c_boolean = boolean c_int = int c_bigint = bigint c_double = double c_bytes = bytes c_date = date c_decimal = "decimal(33, 18)" c_timestamp = timestamp } } } } } transform { # 如果你想知道更多关于如何配置seatunnel的信息和查看完整的transform插件, # 请访问:https://seatunnel.apache.org/docs/transforms/sql } sink{ Console{} } ``` ### 如何在项目中运行Job 当你将代码拉到本地后,转到 `seatunnel-examples/seatunnel-flink-connector-v2-example` 模块,查找 `org.apache.seatunnel.example.flink.v2.SeaTunnelApiExample` 即可完成job的操作。 ================================================ FILE: docs/zh/engines/overview.md ================================================ --- sidebar_position: 1 --- # 引擎概览 SeaTunnel 支持多种执行引擎,您可以根据实际场景选择最合适的引擎。本文档提供全面的对比分析,帮助您做出正确的选择。 ## 支持的引擎 | 引擎 | 描述 | 推荐场景 | |------|------|---------| | **SeaTunnel Engine (Zeta)** | 专为数据集成构建的原生引擎 | 新项目、数据同步 | | **Apache Flink** | 分布式流处理引擎 | 已有 Flink 基础设施 | | **Apache Spark** | 分布式批流处理引擎 | 已有 Spark 基础设施 | ## 快速对比 ### 功能对比 | 功能 | SeaTunnel Engine | Flink | Spark | |------|------------------|-------|-------| | **批处理** | ✅ | ✅ | ✅ | | **流处理** | ✅ | ✅ | ✅ | | **CDC 支持** | ✅ | ✅ | ❌ | | **精确一次** | ✅ | ✅ | ✅ | | **多表同步** | ✅ | ✅ | ✅ | | **Schema 演变** | ✅ | ✅ | ❌ | | **REST API** | ✅ | ✅ | ❌ | | **Web UI** | ✅ | ✅ | ✅ | | **单机模式** | ✅ | ✅ | ✅ | | **集群模式** | ✅ | ✅ | ✅ | ### 性能对比 | 指标 | SeaTunnel Engine | Flink | Spark | |------|------------------|-------|-------| | **吞吐量** | ⭐⭐⭐ 高 | ⭐⭐ 中 
| ⭐⭐ 中 | | **延迟** | ⭐⭐⭐ 低 | ⭐⭐⭐ 低 | ⭐⭐ 中 | | **资源消耗** | ⭐⭐⭐ 低 | ⭐⭐ 中 | ⭐ 高 | | **启动速度** | ⭐⭐⭐ 快 | ⭐⭐ 中 | ⭐ 慢 | ### 易用性对比 | 方面 | SeaTunnel Engine | Flink | Spark | |------|------------------|-------|-------| | **安装部署** | ⭐⭐⭐ 简单 | ⭐⭐ 中等 | ⭐⭐ 中等 | | **配置复杂度** | ⭐⭐⭐ 简单 | ⭐⭐ 中等 | ⭐⭐ 中等 | | **外部依赖** | ⭐⭐⭐ 无 | ⭐⭐ Zookeeper (可选) | ⭐ YARN/Mesos | | **学习曲线** | ⭐⭐⭐ 平缓 | ⭐⭐ 中等 | ⭐⭐ 中等 | ## 引擎选择指南 ### SeaTunnel Engine (Zeta) - 推荐 **适用场景:** - 新的数据集成项目 - 数据同步和 CDC 场景 - 没有现有大数据基础设施的用户 - 需要低资源消耗的场景 - 大量小表的实时同步 **核心优势:** - 无外部依赖(不需要 Zookeeper、HDFS) - 专为数据同步场景优化 - 动态线程共享,高效利用资源 - Pipeline 级别的容错机制 - 内置集群管理和高可用 - JDBC 连接复用 **典型用例:** - MySQL 到 ClickHouse 实时同步 - 多表 CDC 同步 - 数据库迁移项目 ### Apache Flink **适用场景:** - 已有 Flink 基础设施的组织 - 复杂的流处理需求 - 需要与 Flink 生态集成的场景 **核心优势:** - 成熟的流处理能力 - 丰富的生态系统和社区 - 高级状态管理 - 与 Flink SQL 集成 **典型用例:** - 与现有 Flink 管道集成 - 复杂事件处理 - 需要 Flink 特定功能的场景 ### Apache Spark **适用场景:** - 已有 Spark 基础设施的组织 - 大规模批处理 - 需要与 Spark 生态集成(MLlib、GraphX) **核心优势:** - 成熟的批处理能力 - 丰富的生态系统 - 与 Hive、HDFS 集成 - 支持 YARN、Kubernetes **典型用例:** - 大规模 ETL 作业 - 与现有 Spark 工作流集成 - 批量数据仓库加载 ## 决策流程图 ``` 开始 │ ▼ 是否有现有的 Flink/Spark 基础设施? │ ├─ 是 ──► 是否想要复用? 
│ │ │ ├─ 是 (Flink) ──► 使用 Flink 引擎 │ │ │ ├─ 是 (Spark) ──► 使用 Spark 引擎 │ │ │ └─ 否 ──► 使用 SeaTunnel Engine │ └─ 否 ──► 使用 SeaTunnel Engine(推荐) ``` ## 配置示例 ### SeaTunnel Engine ```hocon env { parallelism = 2 job.mode = "STREAMING" checkpoint.interval = 10000 } ``` ### Flink 引擎 ```hocon env { parallelism = 2 job.mode = "STREAMING" checkpoint.interval = 10000 flink.execution.checkpointing.mode = "EXACTLY_ONCE" flink.execution.checkpointing.timeout = 600000 } ``` ### Spark 引擎 ```hocon env { parallelism = 2 job.mode = "BATCH" spark.app.name = "SeaTunnel-Job" spark.executor.memory = "2g" spark.executor.instances = "2" } ``` ## 连接器兼容性 所有 SeaTunnel V2 连接器都与三种引擎兼容。但某些功能在不同引擎上可能有不同的行为: | 连接器功能 | SeaTunnel Engine | Flink | Spark | |-----------|------------------|-------|-------| | CDC 连接器 | ✅ 完全支持 | ✅ 完全支持 | ❌ 不支持 | | 精确一次写入 | ✅ 完全支持 | ✅ 完全支持 | ✅ 部分支持 | | 多表读取 | ✅ 完全支持 | ✅ 完全支持 | ✅ 完全支持 | ## 迁移指南 ### 从 Flink 迁移到 SeaTunnel Engine 1. 移除 Flink 特定配置(以 `flink.` 为前缀的配置) 2. 保留通用配置(`parallelism`、`checkpoint.interval`) 3. 使用 SeaTunnel Engine 测试 ### 从 Spark 迁移到 SeaTunnel Engine 1. 移除 Spark 特定配置(以 `spark.` 为前缀的配置) 2. 保留通用配置(`parallelism`、`job.mode`) 3. 
使用 SeaTunnel Engine 测试 ## 总结 | 场景 | 推荐引擎 | |------|---------| | 没有大数据基础设施的新项目 | **SeaTunnel Engine** | | CDC 和实时同步 | **SeaTunnel Engine** | | 已有 Flink 基础设施 | **Flink** | | 已有 Spark 基础设施 | **Spark** | | 低资源环境 | **SeaTunnel Engine** | | 复杂流处理 | **Flink** | | 大规模批量 ETL | **Spark** | ## 下一步 - [SeaTunnel Engine 快速开始](zeta/about.md) - [Flink 引擎指南](flink.md) - [Spark 引擎指南](spark.md) ================================================ FILE: docs/zh/engines/spark.md ================================================ # SeaTunnel 通过 Spark 引擎运行 Spark 是一个强大的高性能分布式计算处理引擎。有关它的更多信息,您可以搜索"Apache Spark" ### 如何在作业中设置 Spark 配置信息 例: 我为这个任务设置了一些 spark 配置项 ``` env { spark.app.name = "example" spark.sql.catalogImplementation = "hive" spark.executor.memory= "2g" spark.executor.instances = "2" spark.yarn.priority = "100" hive.exec.dynamic.partition.mode = "nonstrict" spark.dynamicAllocation.enabled="false" } ``` ### 命令行示例 #### Spark on Yarn集群 ``` ./bin/start-seatunnel-spark-3-connector-v2.sh --master yarn --deploy-mode cluster --config config/example.conf ``` #### Spark on Yarn客户端 ``` ./bin/start-seatunnel-spark-3-connector-v2.sh --master yarn --deploy-mode client --config config/example.conf ``` ### 如何设置简单的 Spark 作业 这是通过 Spark 运行的一个简单作业。会将随机生成的数据输出到控制台 ``` env { # common parameter parallelism = 1 # spark special parameter spark.app.name = "example" spark.sql.catalogImplementation = "hive" spark.executor.memory= "2g" spark.executor.instances = "1" spark.yarn.priority = "100" hive.exec.dynamic.partition.mode = "nonstrict" spark.dynamicAllocation.enabled="false" } source { FakeSource { schema = { fields { c_map = "map<string, array<int>>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp c_row = { c_map = "map<string, array<int>>" c_array = "array<int>" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint 
c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(30, 8)" c_null = "null" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } } transform { } sink{ Console{} } ``` ### 如何在项目中运行作业 将代码拉取到本地后,进入 seatunnel-examples/seatunnel-spark-connector-v2-example 模块,找到 org.apache.seatunnel.example.spark.v2.SeaTunnelApiExample 来完成作业的运行。 ================================================ FILE: docs/zh/engines/zeta/about.md ================================================ --- sidebar_position: 1 --- # SeaTunnel Engine 简介 SeaTunnel Engine 是一个由社区开发的用于数据同步场景的引擎,作为 SeaTunnel 的默认引擎,它支持高吞吐量、低延迟和强一致性的数据同步作业操作,更快、更稳定、更节省资源且易于使用。 SeaTunnel Engine 的整体设计遵循以下路径: - 更快,SeaTunnel Engine 的执行计划优化器旨在减少数据网络传输,从而减少由于数据序列化和反序列化造成的整体同步性能损失,使用户能够更快地完成数据同步操作。同时,支持速度限制,以合理速度同步数据。 - 更稳定,SeaTunnel Engine 使用 Pipeline 作为数据同步任务的最小粒度的检查点和容错。任务的失败只会影响其上游和下游任务,避免了任务失败导致整个作业失败或回滚的情况。同时,SeaTunnel Engine 还支持数据缓存,用于源数据有存储时间限制的场景。当启用缓存时,从源读取的数据将自动缓存,然后由下游任务读取并写入目标。在这种情况下,即使由于目标失败而无法写入数据,也不会影响源的常规读取,防止源数据过期被删除。 - 节省空间,SeaTunnel Engine 内部使用动态线程共享技术。在实时同步场景中,对于表数量很多但每个表数据量很小的场景,SeaTunnel Engine 将在共享线程中运行这些同步任务,以减少不必要的线程创建并节省系统空间。在读取和写入数据方面,SeaTunnel Engine 的设计目标是最小化 JDBC 连接的数量;在 CDC 场景中,SeaTunnel Engine 将重用日志读取和解析资源。 - 简单易用,SeaTunnel Engine 减少了对第三方服务的依赖,并且可以独立于如 Zookeeper 和 HDFS 等大数据组件实现集群管理、快照存储和集群 HA 功能。这对于目前缺乏大数据平台的用户,或者不愿意依赖大数据平台进行数据同步的用户来说非常有用。 未来,SeaTunnel Engine 将进一步优化其功能,以支持离线批同步的全量同步和增量同步、实时同步和 CDC。 ### 集群管理 - 支持独立运行; - 支持集群运行; - 支持自治集群(去中心化),使用户无需为 SeaTunnel Engine 集群指定主节点,因为它可以在运行过程中自行选择主节点,并且在主节点失败时自动选择新的主节点; - 支持自治集群节点发现,具有相同 cluster_name 的节点将自动形成集群。 ### 核心功能 - 支持在本地模式下运行作业,作业完成后集群自动销毁; - 支持在集群模式下运行作业(单机或集群),通过 SeaTunnel 客户端将作业提交给 SeaTunnel Engine 服务,作业完成后服务继续运行并等待下一个作业提交; - 支持离线批同步; - 支持实时同步; - 批流一体,所有 SeaTunnel V2 Connector 均可在 SeaTunnel Engine 中运行; - 支持分布式快照算法,并支持与 SeaTunnel V2 Connector 的两阶段提交,确保数据只执行一次。 - 支持在 Pipeline 级别调度作业,以确保即使在资源有限的情况下也能启动; - 支持在 Pipeline 级别对作业进行容错。任务失败只影响其所在 Pipeline,只需要回滚 Pipeline 下的任务; - 支持动态线程共享,以实时同步大量小数据集。 ### 快速开始 
https://seatunnel.apache.org/docs/getting-started/locally/quick-start-seatunnel-engine ### 下载安装 [下载安装](download-seatunnel.md) ================================================ FILE: docs/zh/engines/zeta/checkpoint-storage.md ================================================ --- sidebar_position: 7 --- # 检查点存储 ## 简介 检查点是一种容错恢复机制。这种机制确保程序在运行时,即使突然遇到异常,也能自行恢复。 ### 检查点存储 SeaTunnel Engine支持以下检查点存储类型: - HDFS (OSS,COS,S3,HDFS,LocalFile) - LocalFile (本地),(已弃用: 使用HDFS(LocalFile)替代). 我们使用微内核设计模式将检查点存储模块从引擎中分离出来。这允许用户实现他们自己的检查点存储模块。 `checkpoint-storage-api`是检查点存储模块API,它定义了检查点存储模块的接口。 如果你想实现你自己的检查点存储模块,你需要实现`CheckpointStorage`并提供相应的`CheckpointStorageFactory`实现。 ### 检查点存储配置 `seatunnel-server`模块的配置在`seatunnel.yaml`文件中。 ```yaml seatunnel: engine: checkpoint: storage: type: hdfs # 检查点存储的插件名称,支持hdfs(S3, local, hdfs), 默认为localfile (本地文件), 但这种方式已弃用 # 插件配置 plugin-config: namespace: #检查点存储父路径,默认值为/seatunnel/checkpoint/ K1: V1 # 插件其它配置 K2: V2 # 插件其它配置 ``` 注意: namespace必须以"/"结尾。 #### OSS 阿里云OSS是基于hdfs-file,所以你可以参考[Hadoop OSS文档](https://hadoop.apache.org/docs/stable/hadoop-aliyun/tools/hadoop-aliyun/index.html)来配置oss. 除了与公共OSS buckets交互之外,OSS客户端需要与buckets交互所需的凭据。 客户端支持多种身份验证机制,并且可以配置使用哪种机制及其使用顺序。也可以使用org.apache.hadoop.fs.aliyun.oss.AliyunCredentialsProvider的自定义实现。 如果您使用AliyunCredentialsProvider(可以从阿里云访问密钥管理中获得),它们包括一个access key和一个secret key。 你可以这样配置: ```yaml seatunnel: engine: checkpoint: interval: 6000 timeout: 7000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # 检查点存储父路径,默认值为/seatunnel/checkpoint/ storage.type: oss oss.bucket: your-bucket fs.oss.accessKeyId: your-access-key fs.oss.accessKeySecret: your-secret-key fs.oss.endpoint: endpoint address ``` 有关Hadoop Credential Provider API的更多信息,请参见: [Credential Provider API](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CredentialProviderAPI.html). 
阿里云OSS凭证提供程序实现见: [验证凭证提供](https://github.com/aliyun/aliyun-oss-java-sdk/tree/master/src/main/java/com/aliyun/oss/common/auth) #### COS 腾讯云COS基于hdfs-file,所以你可以参考[Hadoop COS文档](https://hadoop.apache.org/docs/stable/hadoop-cos/cloud-storage/)来配置COS. 除了与公共COS buckets交互之外,COS客户端需要与buckets交互所需的凭据。 客户端支持多种身份验证机制,并且可以配置使用哪种机制及其使用顺序。也可以使用com.qcloud.cos.auth.COSCredentialsProvider的自定义实现。 如果您使用SimpleCredentialsProvider(可以从腾讯云API密钥管理中获得),它们包括一个secretId和一个secretKey。 您可以这样配置: ```yaml seatunnel: engine: checkpoint: interval: 6000 timeout: 7000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # 检查点存储父路径,默认值为/seatunnel/checkpoint/ storage.type: cos cos.bucket: cosn://your-bucket fs.cosn.credentials.provider: org.apache.hadoop.fs.cosn.auth.SimpleCredentialsProvider fs.cosn.userinfo.secretId: your-secretId fs.cosn.userinfo.secretKey: your-secretKey fs.cosn.bucket.region: your-region ``` 有关Hadoop Credential Provider API的更多信息,请参见: [Credential Provider API](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CredentialProviderAPI.html). 
腾讯云COS相关配置可参考:[Tencent Hadoop-COS文档](https://doc.fincloud.tencent.cn/tcloud/Storage/COS/846365/hadoop) 使用前请将如下jar添加到lib目录下: - [hadoop-cos-3.4.1.jar](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-cos/3.4.1) - [cos_api-bundle-5.6.69.jar](https://mvnrepository.com/artifact/com.qcloud/cos_api-bundle/5.6.69) - [hadoop-shaded-guava-1.1.1.jar](https://mvnrepository.com/artifact/org.apache.hadoop.thirdparty/hadoop-shaded-guava/1.1.1) #### S3 S3基于hdfs-file,所以你可以参考[Hadoop s3文档](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)来配置s3。 除了与公共S3 buckets交互之外,S3A客户端需要与buckets交互所需的凭据。 客户端支持多种身份验证机制,并且可以配置使用哪种机制及其使用顺序。也可以使用com.amazonaws.auth.AWSCredentialsProvider的自定义实现。 如果您使用SimpleAWSCredentialsProvider(可以从Amazon Security Token服务中获得),它们包括一个access key和一个secret key。 您可以这样配置: ```yaml seatunnel: engine: checkpoint: interval: 6000 timeout: 7000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # 检查点存储父路径,默认值为/seatunnel/checkpoint/ storage.type: s3 s3.bucket: your-bucket fs.s3a.access.key: your-access-key fs.s3a.secret.key: your-secret-key fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider ``` 如果您使用`InstanceProfileCredentialsProvider`,它支持在EC2 VM中运行时使用实例配置文件凭据,您可以检查[iam-roles-for-amazon-ec2](https://docs.aws.amazon.com/zh_cn/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html). 您可以这样配置: ```yaml seatunnel: engine: checkpoint: interval: 6000 timeout: 7000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # 检查点存储父路径,默认值为/seatunnel/checkpoint/ storage.type: s3 s3.bucket: your-bucket fs.s3a.endpoint: your-endpoint fs.s3a.aws.credentials.provider: org.apache.hadoop.fs.s3a.InstanceProfileCredentialsProvider ``` 有关Hadoop Credential Provider API的更多信息,请参见: [Credential Provider API](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CredentialProviderAPI.html). 
#### HDFS 如果您使用HDFS,您可以这样配置: ```yaml seatunnel: engine: checkpoint: storage: type: hdfs max-retained: 3 plugin-config: namespace: # 检查点存储父路径,默认值为/seatunnel/checkpoint/ storage.type: hdfs fs.defaultFS: hdfs://localhost:9000 # 如果您使用kerberos,您可以这样配置: kerberosPrincipal: your-kerberos-principal kerberosKeytabFilePath: your-kerberos-keytab ``` 如果HDFS是HA模式,您可以这样配置: ```yaml seatunnel: engine: checkpoint: storage: type: hdfs max-retained: 3 plugin-config: namespace: # 检查点存储父路径,默认值为/seatunnel/checkpoint/ storage.type: hdfs fs.defaultFS: hdfs://usdp-bing seatunnel.hadoop.dfs.nameservices: usdp-bing seatunnel.hadoop.dfs.ha.namenodes.usdp-bing: nn1,nn2 seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn1: usdp-bing-nn1:8020 seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn2: usdp-bing-nn2:8020 seatunnel.hadoop.dfs.client.failover.proxy.provider.usdp-bing: org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider ``` 如果HDFS在`hdfs-site.xml`或`core-site.xml`中有其他配置,只需使用`seatunnel.hadoop.`前缀设置HDFS配置即可。 #### 本地文件 ```yaml seatunnel: engine: checkpoint: interval: 6000 timeout: 7000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # 检查点存储父路径,默认值为/seatunnel/checkpoint/ storage.type: hdfs fs.defaultFS: file:/// # 请确保该目录具有写权限 ``` ### 开启高速缓存 当storage:type为hdfs时,默认关闭cache。如果您想启用它,请设置为`disable.cache: false`。 ```yaml seatunnel: engine: checkpoint: interval: 6000 timeout: 7000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # 检查点存储父路径,默认值为/seatunnel/checkpoint/ storage.type: hdfs disable.cache: false fs.defaultFS: hdfs:/// # 请确保该目录具有写权限 ``` 或者 ```yaml seatunnel: engine: checkpoint: interval: 6000 timeout: 7000 storage: type: hdfs max-retained: 3 plugin-config: namespace: # 检查点存储父路径,默认值为/seatunnel/checkpoint/ storage.type: hdfs disable.cache: false fs.defaultFS: file:/// ``` ================================================ FILE: docs/zh/engines/zeta/deployment.md 
================================================ --- sidebar_position: 3 --- # SeaTunnel Engine(Zeta) 安装部署 SeaTunnel Engine(Zeta) 支持三种不同的部署模式:本地模式、混合集群模式和分离集群模式。 每种部署模式都有不同的使用场景和优缺点。在选择部署模式时,您应该根据您的需求和环境来选择。 Local模式:只用于测试,每个任务都会启动一个独立的进程,任务运行完成后进程会退出。 混合集群模式:SeaTunnel Engine 的Master服务和Worker服务混合在同一个进程中,所有节点都可以运行作业并参与选举成为master,即master节点也在同时运行同步任务。在该模式下,Imap(保存任务的状态信息用于为任务的容错提供支持)数据会分布在所有节点中。 分离集群模式:SeaTunnel Engine 的Master服务和Worker服务分离,每个服务单独一个进程。Master节点只负责作业调度,rest api,任务提交等,Imap数据只存储在Master节点中。Worker节点只负责任务的执行,不参与选举成为master,也不存储Imap数据。 使用建议:建议使用[分离集群模式](separated-cluster-deployment.md)。在混合集群模式下,Master节点要同步运行任务,当任务规模较大时,会影响Master节点的稳定性,一但Master节点宕机或心跳超时,会导致Master节点切换,Master节点切换会导致所有正在运行的任务进行容错,会进一步增长集群的负载。因此,我们更建议使用分离模式。 [Local模式部署](local-mode-deployment.md) [混合集群模式部署](hybrid-cluster-deployment.md) [分离集群模式部署](separated-cluster-deployment.md) ================================================ FILE: docs/zh/engines/zeta/download-seatunnel.md ================================================ --- sidebar_position: 2 --- import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; # 下载和制作安装包 ## 步骤 1: 准备工作 在开始下载SeaTunnel之前,您需要确保您已经安装了SeaTunnel所需要的以下软件: * 安装[Java](https://www.java.com/en/download/) (Java 8 或 11, 其他高于Java 8的版本理论上也可以工作) 以及设置 `JAVA_HOME`。 ## 步骤 2: 下载 SeaTunnel 进入[SeaTunnel下载页面](https://seatunnel.apache.org/download)下载最新版本的发布版安装包`seatunnel--bin.tar.gz` 或者您也可以通过终端下载 ```shell export version="3.0.0" wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-${version}-bin.tar.gz" tar -xzvf "apache-seatunnel-${version}-bin.tar.gz" ``` ## 步骤 3: 下载连接器插件 从2.2.0-beta版本开始,二进制包不再默认提供连接器依赖,因此在第一次使用它时,您需要执行以下命令来安装连接器:(当然,您也可以从 [Apache Maven Repository](https://repo.maven.apache.org/maven2/org/apache/seatunnel/) 手动下载连接器,然后将其移动至`connectors/seatunnel`目录下)。 ```bash sh bin/install-plugin.sh 3.0.0 ``` 如果您需要指定的连接器版本,以3.0.0为例,您需要执行如下命令 ```bash sh bin/install-plugin.sh 3.0.0 ``` 
通常您并不需要所有的连接器插件,所以您可以通过配置`config/plugin_config`来指定您所需要的插件,例如,您只需要`connector-console`插件,那么您可以修改plugin.properties配置文件如下 ```plugin_config --seatunnel-connectors-- connector-console --end-- ``` 如果您希望示例应用程序能正常工作,那么您需要添加以下插件 ```plugin_config --seatunnel-connectors-- connector-fake connector-console --end-- ``` 您可以在`${SEATUNNEL_HOME}/connectors/plugins-mapping.properties`下找到所有支持的连接器和相应的plugin_config配置名称。 :::tip 提示 如果您想通过手动下载连接器的方式来安装连接器插件,您只需要下载您所需要的连接器插件即可,并将它们放在`${SEATUNNEL_HOME}/connectors/`目录下 ::: 现在你已经完成了SeaTunnel安装包的下载和连接器插件的下载。接下来,您可以根据您的需求选择不同的运行模式来运行或部署SeaTunnel。 如果你使用SeaTunnel自带的SeaTunnel Engine(Zeta)来运行任务,需要先部署SeaTunnel Engine服务。参考[SeaTunnel Engine(Zeta)服务部署](deployment.md) ================================================ FILE: docs/zh/engines/zeta/engine-jar-storage-mode.md ================================================ --- sidebar_position: 9 --- # 配置引擎 Jar 存储模式 :::caution 警告 请注意,此功能目前处于实验阶段,还有许多方面需要改进。因此,我们建议在使用此功能时谨慎行事,以避免潜在的问题和不必要的风险。 我们致力于持续努力增强和稳定此功能,确保为您提供更好的体验。 ::: 我们可以启用优化的作业提交过程,这在 `seatunnel.yaml` 中进行配置。启用了 Seatunnel 作业提交过程配置项的优化后, 用户可以使用 Seatunnel Zeta 引擎作为执行引擎,而无需在每个引擎 `connector` 目录中放置任务执行所需的连接器 Jar 包或连接器所依赖的第三方 Jar 包。 用户只需在提交作业的客户端上放置所有任务执行所需的 Jar 包,客户端将自动上传任务执行所需的 Jars 到 Zeta 引擎。在 Docker 或 k8s 模式下提交作业时,启用此配置项是必要的, 这可以从根本上解决由 Seatunnel Zeta 引擎的重量造成的大型容器镜像问题。在镜像中,只需要提供 Zeta 引擎的核心框架包, 然后可以将连接器的 jar 包和连接器所依赖的第三方 jar 包分别上传到 pod 进行分发。 启用了优化作业提交过程配置项后,您不需要在 Zeta 引擎中放置以下两种类型的 Jar 包: - COMMON_PLUGIN_JARS - CONNECTOR_PLUGIN_JARS COMMON_ PLUGIN_ JARS 指的是连接器所依赖的第三方 Jar 包, CONNECTOR_ PLUGIN_ JARS 指的是连接器 Jar 包。 当 Zeta 的 `lib` 中不存在公共 jars 时,它可以将客户端的本地公共 jars 上传到所有引擎节点的 `lib` 目录。 这样,即使用户没有在 Zeta 的 `lib` 中放置 jar,任务仍然可以正常执行。 然而,我们不推荐依赖打开优化作业提交过程的配置项来上传连接器所依赖的第三方 Jar 包。 如果您使用 Zeta 引擎,请将连接器所依赖的第三方 jar 包文件添加到每个节点的 `$SEATUNNEL_HOME/lib/` 目录中,例如 jdbc 驱动程序。 # 连接器 Jar 存储策略 您可以通过配置文件配置当前连接器 Jar 包和连接器所依赖的第三方 Jar 包的存储策略。 可以配置两种存储策略,即共享 Jar 包存储策略和隔离 Jar 包存储策略。 两种不同的存储策略为 Jar 文件提供了更灵活的存储模式。 您可以配置存储策略,使引擎中的多个执行作业共享相同的 Jar 包文件。 ## 相关配置 | 参数 | 默认值 | 描述 | 
|-------------------------------------|--------|-------------------------------------------------------------------------| | connector-jar-storage-enable | false | 是否启用上传连接器 Jar 包到引擎。默认启用状态为 false。 | | connector-jar-storage-mode | SHARED | 引擎端 Jar 包存储模式选择。有两个可选模式,SHARED(共享)和 ISOLATED(隔离)。默认的 Jar 包存储模式是 SHARED。 | | connector-jar-storage-path | " " | 用户自定义的 Jar 包存储路径。 | | connector-jar-cleanup-task-interval | 3600s | 引擎端 Jar 包清理定时任务执行间隔。 | | connector-jar-expiry-time | 600s | 引擎端 Jar 包存储过期时间。 | ## 隔离连接器Jar存储策略 在作业提交之前,连接器 Jar 包将被上传到 Master 节点上的一个独立文件存储路径中。 不同作业的连接器 Jar 包位于不同的存储路径中,因此不同作业的连接器 Jar 包彼此隔离。 作业执行所需的 Jar 包文件不会影响其他作业。当当前作业执行结束时,基于 `JobId` 生成的存储路径中的 Jar 包文件将被删除。 示例: ```yaml jar-storage: connector-jar-storage-enable: true connector-jar-storage-mode: ISOLATED connector-jar-storage-path: "" connector-jar-cleanup-task-interval: 3600 connector-jar-expiry-time: 600 ``` 配置参数的详细解释: - connector-jar-storage-enable: 在执行作业前启用上传连接器 Jar 包的功能。 - connector-jar-storage-mode: 连接器 Jar 包的存储模式,有两种存储模式可供选择:共享模式(SHARED)和隔离模式(ISOLATED)。 - connector-jar-storage-path: 在 Zeta 引擎上用户自定义连接器 Jar 包的本地存储路径。 - connector-jar-cleanup-task-interval: Zeta 引擎连接器 Jar 包定时清理任务的间隔时间,默认为 3600 秒。 - connector-jar-expiry-time: 连接器 Jar 包的过期时间,默认为 600 秒。 ## 共享连接器Jar存储策略 在作业提交之前,连接器 Jar 包将被上传到 Master 节点。如果不同的作业使用相同的 Jar 包文件,它们可以在 Master 节点上共享连接器 Jars。 所有 Jar 包文件都被持久化到一个共享的文件存储路径中,引用 Master 节点的 Jar 包可以在不同作业之间共享。任务执行完成后, 共享连接器Jar存储策略 不会立即删除与当前任务执行相关的所有 Jar 包,而是有一个独立的线程负责清理工作。 以下配置文件中的配置设置了清理工作的运行时间和 Jar 包的存活时间。 示例: ```yaml jar-storage: connector-jar-storage-enable: true connector-jar-storage-mode: SHARED connector-jar-storage-path: "" connector-jar-cleanup-task-interval: 3600 connector-jar-expiry-time: 600 ``` 配置参数的详细解释: - connector-jar-storage-enable: 在执行作业前启用上传连接器 Jar 包的功能。 - connector-jar-storage-mode: 连接器 Jar 包的存储模式,有两种存储模式可供选择:共享模式(SHARED)和隔离模式(ISOLATED)。 - connector-jar-storage-path: 在 Zeta 引擎上用户自定义连接器 Jar 包的本地存储路径。 - connector-jar-cleanup-task-interval: Zeta 引擎连接器 Jar 包定时清理任务的间隔时间,默认为 3600 秒。 - 
connector-jar-expiry-time: 连接器 Jar 包的过期时间,默认为 600 秒。 ================================================ FILE: docs/zh/engines/zeta/hybrid-cluster-deployment.md ================================================ --- sidebar_position: 5 --- # 部署 SeaTunnel Engine 混合模式集群 SeaTunnel Engine 的Master服务和Worker服务混合在同一个进程中,所有节点都可以运行作业并参与选举成为master,即master节点也在同时运行同步任务。在该模式下,Imap(保存任务的状态信息用于为任务的容错提供支持)数据会分布在所有节点中。 使用建议:建议使用分离集群模式。在混合集群模式下,Master节点要同步运行任务,当任务规模较大时,会影响Master节点的稳定性,一但Master节点宕机或心跳超时,会导致Master节点切换,Master节点切换会导致所有正在运行的任务进行容错,会进一步增长集群的负载。因此,我们更建议使用[分离集群模式](separated-cluster-deployment.md)。 ## 1. 下载 [下载和制作SeaTunnel安装包](download-seatunnel.md) ## 2 配置 SEATUNNEL_HOME 您可以通过添加 `/etc/profile.d/seatunnel.sh` 文件来配置 `SEATUNNEL_HOME` 。`/etc/profile.d/seatunnel.sh` 的内容如下: ``` export SEATUNNEL_HOME=${seatunnel install path} export PATH=$PATH:$SEATUNNEL_HOME/bin ``` ## 3. 配置 SeaTunnel Engine JVM 选项 SeaTunnel Engine 支持两种设置 JVM 选项的方法。 1. 将 JVM 选项添加到 `$SEATUNNEL_HOME/config/jvm_options`. 修改 `$SEATUNNEL_HOME/config/jvm_options` 文件中的jvm参数。 2. 在启动 SeaTunnel Engine 时添加 JVM 选项。例如 `seatunnel-cluster.sh -DJvmOption="-Xms2G -Xmx2G"` ## 4. 配置 SeaTunnel Engine SeaTunnel Engine 提供许多功能,需要在 `seatunnel.yaml` 中进行配置。. 
### 4.1 Imap中数据的备份数设置 SeaTunnel Engine 基于 [Hazelcast IMDG](https://docs.hazelcast.com/imdg/4.1/) 实现集群管理。集群的状态数据(作业运行状态、资源状态)存储在 [Hazelcast IMap](https://docs.hazelcast.com/imdg/4.1/data-structures/map)。 存储在 Hazelcast IMap 中的数据将在集群的所有节点上分布和存储。Hazelcast 会分区存储在 Imap 中的数据。每个分区可以指定备份数量。 因此,SeaTunnel Engine 可以实现集群 HA,无需使用其他服务(例如 zookeeper)。 `backup count` 是定义同步备份数量的参数。例如,如果设置为 1,则分区的备份将放置在一个其他成员上。如果设置为 2,则将放置在两个其他成员上。 我们建议 `backup-count` 的值为 `max(1, min(5, N/2))`。 `N` 是集群节点的数量。 ```yaml seatunnel: engine: backup-count: 1 # 其他配置 ``` ### 4.2 Slot配置 Slot数量决定了集群节点可以并行运行的任务组数量。一个任务需要的Slot的个数公式为 N = 2 + P(任务配置的并行度)。 默认情况下SeaTunnel Engine的slot个数为动态,即不限制个数。 我们建议slot的个数设置为节点CPU核心数的2倍, 这也是当 `dynamic-slot` 设置为 false 且未设置 `slot-num` 时的默认值。 动态slot个数(默认)配置如下: ```yaml seatunnel: engine: slot-service: dynamic-slot: true # 其他配置 ``` 静态slot个数配置如下: ```yaml seatunnel: engine: slot-service: dynamic-slot: false slot-num: 20 ``` ### 4.3 检查点管理器 与 Flink 一样,SeaTunnel Engine 支持 Chandy–Lamport 算法。因此,可以实现无数据丢失和重复的数据同步。 **interval** 两个检查点之间的间隔,单位是毫秒。如果在作业配置文件的 `env` 中配置了 `checkpoint.interval` 参数,将以作业配置文件中设置的为准。 **timeout** 检查点的超时时间。如果在超时时间内无法完成检查点,则会触发检查点失败,作业失败。如果在作业的配置文件的`env`中配置了`checkpoint.timeout`参数,将以作业配置文件中设置的为准。 **min-pause** 连续检查点之间的最小暂停时间(以毫秒为单位),确保检查点不会频繁触发。 示例 ```yaml seatunnel: engine: backup-count: 1 print-execution-info-interval: 10 slot-service: dynamic-slot: true checkpoint: interval: 300000 timeout: 10000 min-pause: 5000 ``` **checkpoint storage** 检查点是一种容错恢复机制。这种机制确保程序在运行时,即使突然遇到异常,也能自行恢复。检查点定时触发,每次检查点进行时每个Task都会被要求将自身的状态信息(比如读取kafka时读取到了哪个offset)上报给检查点线程,由该线程写入一个分布式存储(或共享存储)。当任务失败然后自动容错恢复时,或者通过seatunnel.sh -r 指令恢复之前被暂停的任务时,会从检查点存储中加载对应作业的状态信息,并基于这些状态信息进行作业的恢复。 如果集群的节点大于1,检查点存储必须是一个分布式存储,或者共享存储,这样才能保证任意节点挂掉后依然可以在另一个节点加载到存储中的任务状态信息。 有关检查点存储的信息,您可以查看 [Checkpoint Storage](checkpoint-storage.md) ### 4.4 历史作业过期配置 每个完成的作业的信息,如状态、计数器和错误日志,都存储在 IMap 对象中。随着运行作业数量的增加,内存会增加,最终内存将溢出。因此,您可以调整 `history-job-expire-minutes` 参数来解决这个问题。此参数的时间单位是分钟。默认值是 1440 分钟,即一天。 示例 ```yaml seatunnel: engine: 
history-job-expire-minutes: 1440 ``` ### 4.5 类加载器缓存模式 此配置主要解决不断创建和尝试销毁类加载器所导致的资源泄漏问题。 如果您遇到与metaspace空间溢出相关的异常,您可以尝试启用此配置。 为了减少创建类加载器的频率,在启用此配置后,SeaTunnel 在作业完成时不会尝试释放相应的类加载器,以便它可以被后续作业使用,也就是说,当运行作业中使用的 Source/Sink 连接器类型不是太多时,它更有效。 默认值是 true。 示例 ```yaml seatunnel: engine: classloader-cache-mode: true ``` ### 4.6 作业调度策略 当资源不足时,作业调度策略可以配置为以下两种模式: 1. `WAIT`:等待资源可用。 2. `REJECT`:拒绝作业,默认值。 示例 ```yaml seatunnel: engine: job-schedule-strategy: WAIT ``` 当`dynamic-slot: true`时,`job-schedule-strategy: WAIT` 配置会失效,将被强制修改为`job-schedule-strategy: REJECT`,因为动态Slot时该参数没有意义,可以直接提交。 ### 4.7 Coordinator Service CoordinatorService 提供了每个作业从 LogicalDag 到 ExecutionDag,再到 PhysicalDag 的生成流程, 并最终创建作业的 JobMaster 进行作业的调度执行和状态监控。 **core-thread-num** 配置 CoordinatorService 线程池核心线程数量。 **max-thread-num** 同时可执行的最大作业数量。 示例 ```yaml coordinator-service: core-thread-num: 30 max-thread-num: 1000 ``` ### 4.8 作业指标分区数量(此参数在 Worker 节点上无效) 新的配置选项 JOB_METRICS_PARTITION_COUNT 用于控制在 Hazelcast IMap 中存储运行作业指标时所使用的分区数量。 - 默认值: 1(单个 key,向后兼容) - 用法: 增加该值可以将指标分布到多个分区中,从而在大量任务同时更新指标时减少竞争。 示例: ```yaml seatunnel: engine: job-metrics-partition-count: 4 ``` 上述配置会将指标分布到 4 个分区中,而不是使用单个 key。 当任务数量超过约 20,000 时,增加分区数量可以显著提高性能。 作为实用指导,分区数量约 1,000–2,000 往往在减少锁竞争和最小化开销之间提供最佳平衡。 建议以此值开始,并根据集群规模和工作负载特性进行调整。 注意: 在高并发竞争的情况下,增加分区数量可能会提高并行度;但如果设置过大,会引入额外的分布与合并开销,从而降低整体性能。 分区数量应在作业启动前进行配置。如果在作业已启动后更改,可能导致指标键不匹配,因此建议在修改此选项后重启 SeaTunnel。 ## 5. 配置 SeaTunnel Engine 网络服务 所有 SeaTunnel Engine 网络相关的配置都在 `hazelcast.yaml` 文件中。 
### 5.1 集群名称 SeaTunnel Engine 节点使用 `cluster-name` 来确定另一个节点是否与自己在同一集群中。如果两个节点之间的集群名称不同,SeaTunnel 引擎将拒绝服务请求。 ### 5.2 网络 基于 [Hazelcast](https://docs.hazelcast.com/imdg/4.1/clusters/discovery-mechanisms), 一个 SeaTunnel Engine 集群是由运行 SeaTunnel Engine 服务器的集群成员组成的网络。 集群成员自动加入一起形成集群。这种自动加入是通过集群成员使用的各种发现机制来相互发现的。 请注意,集群形成后,集群成员之间的通信始终通过 TCP/IP 进行,无论使用的发现机制如何。 SeaTunnel Engine 使用以下发现机制。 #### TCP 您可以将 SeaTunnel Engine 配置为完整的 TCP/IP 集群。有关配置详细信息,请参阅 [Discovering Members By TCP Section](tcp.md)。 一个示例如下 `hazelcast.yaml` ```yaml hazelcast: cluster-name: seatunnel network: join: tcp-ip: enabled: true member-list: - hostname1 port: auto-increment: false port: 5801 properties: hazelcast.logging.type: log4j2 ``` TCP 是我们建议在独立 SeaTunnel Engine 集群中使用的方式。 另一方面,Hazelcast 提供了一些其他的服务发现方法。有关详细信息,请参阅 [Hazelcast Network](https://docs.hazelcast.com/imdg/4.1/clusters/setting-up-clusters) ### 5.3 IMap持久化配置 在SeaTunnel中,我们使用IMap(一种分布式的Map,可以实现数据跨节点跨进程的写入的读取 有关详细信息,请参阅 [hazelcast map](https://docs.hazelcast.com/imdg/4.2/data-structures/map)) 来存储每个任务及其task的状态,以便在任务所在节点宕机后,可以在其他节点上获取到任务之前的状态信息,从而恢复任务实现任务的容错。 默认情况下Imap的信息只是存储在内存中,我们可以设置Imap数据的复本数,具体可参考(4.1 Imap中数据的备份数设置),如果复本数是2,代表每个数据会同时存储在2个不同的节点中。一旦节点宕机,Imap中的数据会重新在其它节点上自动补充到设置的复本数。但是当所有节点都被停止后,Imap中的数据会丢失。当集群节点再次启动后,所有之前正在运行的任务都会被标记为失败,需要用户手工通过seatunnel.sh -r 指令恢复运行。 为了解决这个问题,我们可以将Imap中的数据持久化到外部存储中,如HDFS、OSS等。这样即使所有节点都被停止,Imap中的数据也不会丢失,当集群节点再次启动后,所有之前正在运行的任务都会被自动恢复。 下面介绍如何使用 MapStore 持久化配置。有关详细信息,请参阅 [Hazelcast Map](https://docs.hazelcast.com/imdg/4.2/data-structures/map) **type** imap 持久化的类型,目前仅支持 `hdfs`。 **namespace** 它用于区分不同业务的数据存储位置,如 OSS 存储桶名称。 **clusterName** 此参数主要用于集群隔离, 我们可以使用它来区分不同的集群,如 cluster1、cluster2,这也用于区分不同的业务。 **fs.defaultFS** 我们使用 hdfs api 读写文件,因此使用此存储需要提供 hdfs 配置。 如果您使用 HDFS,可以像这样配置: ```yaml map: engine*: map-store: enabled: true initial-mode: EAGER factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory properties: type: hdfs namespace: /tmp/seatunnel/imap clusterName: seatunnel-cluster storage.type: 
hdfs fs.defaultFS: hdfs://localhost:9000 ``` 如果没有 HDFS,并且您的集群只有一个节点,您可以像这样配置使用本地文件: ```yaml map: engine*: map-store: enabled: true initial-mode: EAGER factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory properties: type: hdfs namespace: /tmp/seatunnel/imap clusterName: seatunnel-cluster storage.type: hdfs fs.defaultFS: file:/// ``` 如果您使用 OSS,可以像这样配置: ```yaml map: engine*: map-store: enabled: true initial-mode: EAGER factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory properties: type: hdfs namespace: /tmp/seatunnel/imap clusterName: seatunnel-cluster storage.type: oss block.size: block size(bytes) oss.bucket: oss://bucket name/ fs.oss.accessKeyId: OSS access key id fs.oss.accessKeySecret: OSS access key secret fs.oss.endpoint: OSS endpoint ``` 注意:使用OSS 时,确保 lib目录下有这几个jar. ``` aliyun-sdk-oss-3.13.2.jar hadoop-aliyun-3.3.6.jar jdom2-2.0.6.jar netty-buffer-4.1.89.Final.jar netty-common-4.1.89.Final.jar seatunnel-hadoop3-3.1.4-uber.jar ``` ## 6. 配置 SeaTunnel Engine 客户端 所有 SeaTunnel Engine 客户端的配置都在 `hazelcast-client.yaml` 里。 ### 6.1 cluster-name 客户端必须与 SeaTunnel Engine 具有相同的 `cluster-name`。否则,SeaTunnel Engine 将拒绝客户端的请求。 ### 6.2 网络 **cluster-members** 需要将所有 SeaTunnel Engine 服务器节点的地址添加到这里。 ```yaml hazelcast-client: cluster-name: seatunnel properties: hazelcast.logging.type: log4j2 network: cluster-members: - hostname1:5801 ``` ## 7. 启动 SeaTunnel Engine 服务器节点 可以通过守护进程使用 `-d` 参数启动。 ```shell mkdir -p $SEATUNNEL_HOME/logs ./bin/seatunnel-cluster.sh -d ``` 日志将写入 `$SEATUNNEL_HOME/logs/seatunnel-engine-server.log` ## 8. 
提交作业和管理作业 ### 8.1 使用 SeaTunnel Engine 客户端提交作业 #### 安装 SeaTunnel Engine 客户端 您只需将 SeaTunnel Engine 节点上的 `$SEATUNNEL_HOME` 目录复制到客户端节点,并像 SeaTunnel Engine 服务器节点一样配置 `SEATUNNEL_HOME`。 #### 提交作业和管理作业 现在集群部署完成了,您可以通过以下教程完成作业的提交和管理:[提交和管理作业](user-command.md) ### 8.2 使用 REST API 提交作业 SeaTunnel Engine 提供了 REST API 用于提交作业。有关详细信息,请参阅 [REST API V2](rest-api-v2.md) ================================================ FILE: docs/zh/engines/zeta/local-mode-deployment.md ================================================ --- sidebar_position: 4 --- # 以Local模式运行作业 Local模式下每个任务都会启动一个独立的进程,任务运行完成后进程会退出。在该模式下有以下限制: 1. 不支持任务的暂停、恢复。 2. 不支持查看任务列表。 3. 不支持通过命令取消作业,只能通过Kill进程的方式终止任务。 但是每个任务由单独的进程控制,不会出现任务之间相互影响的情况,适合对任务稳定性有强烈要求的场景。 ## 本地模式部署SeaTunnel Engine 本地模式下,不需要部署SeaTunnel Engine集群,只需要使用如下命令即可提交作业。系统会在提交作业的进程中启动SeaTunnel Engine(Zeta)服务来运行提交的作业,作业完成后进程退出。 该模式下只需要将下载和制作好的安装包拷贝到需要运行的服务器上即可,如果需要调整作业运行的JVM参数,可以修改$SEATUNNEL_HOME/config/jvm_client_options文件。 ## 提交作业 ```shell $SEATUNNEL_HOME/bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template -e local ``` ### 配置本地模式的JVM参数 本地模式支持两种设置JVM参数的方式: 1. 添加JVM参数到`$SEATUNNEL_HOME/config/jvm_client_options`文件中。 修改`$SEATUNNEL_HOME/config/jvm_client_options`文件中的JVM参数。 请注意,该文件中的JVM参数会应用到所有使用`seatunnel.sh`提交的作业,包括Local模式和集群模式。 2. 
在启动Local模式时添加JVM参数。例如,`$SEATUNNEL_HOME/bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template -m local -DJvmOption="-Xms2G -Xmx2G"` ## 作业运维 Local模式下提交的作业会在提交作业的进程中运行,作业完成后进程会退出,如果要中止作业只需要退出提交作业的进程即可。作业的运行日志会输出到提交作业的进程的标准输出中。 不支持其它运维操作。 ================================================ FILE: docs/zh/engines/zeta/logging.md ================================================ --- sidebar_position: 14 --- # 日志 每个 SeaTunnel Engine 进程都会创建一个日志文件,其中包含该进程中发生的各种事件的消息。这些日志提供了对 SeaTunnel Engine 内部工作原理的深入了解,可用于检测问题(以 WARN/ERROR 消息的形式)并有助于调试问题。 SeaTunnel Engine 中的日志记录使用 SLF4J 日志记录接口。这允许您使用任何支持 SLF4J 的日志记录框架,而无需修改 SeaTunnel Engine 源代码。 默认情况下,Log4j2 用作底层日志记录框架。 ## 结构化信息 SeaTunnel Engine 向大多数相关日志消息的 MDC 添加了以下字段(实验性功能): - Job ID - key: ST-JID - format: string 这在具有结构化日志记录的环境中最为有用,允许您快速过滤相关日志。 MDC 由 slf4j 传播到日志后端,后者通常会自动将其添加到日志记录中(例如,在 log4j json 布局中)。或者,也可以明确配置 - log4j 模式布局可能如下所示: ```properties [%X{ST-JID}] %c{0} %m%n. ``` ## 配置 Log4j2 Log4j2 使用属性文件进行控制。 SeaTunnel Engine 发行版在 `config` 目录中附带以下 log4j 属性文件,如果启用了 Log4j2,则会自动使用这些文件: - `log4j2_client.properties`: 由命令行客户端使用 (例如, `seatunnel.sh`) - `log4j2.properties`: 由 SeaTunnel 引擎服务使用 (例如, `seatunnel-cluster.sh`) 默认情况下,日志文件输出到 `logs` 目录。 Log4j 会定期扫描上述文件以查找更改,并根据需要调整日志记录行为。默认情况下,此检查每 60 秒进行一次,由 Log4j 属性文件中的 monitorInterval 设置控制。 ### 配置作业生成单独的日志文件 要为每个作业输出单独的日志文件,您可以更新 `log4j2.properties` 文件中的以下配置: ```properties ... rootLogger.appenderRef.file.ref = routingAppender ... appender.file.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n ... ``` 此配置为每个作业生成单独的日志文件,例如: ``` job-xxx1.log job-xxx2.log job-xxx3.log ... ``` ### 配置混合日志文件 *默认已采用此配置模式。* 要将所有作业日志输出到 SeaTunnel Engine 系统日志文件中,您可以在 `log4j2.properties` 文件中更新以下配置: ```properties ... rootLogger.appenderRef.file.ref = fileAppender ... appender.file.layout.pattern = [%X{ST-JID}] %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p [%-30.30c{1.}] [%t] - %m%n ... 
``` ### 兼容 Log4j1/Logback SeaTunnel Engine 自动集成了大多数 Log 桥接器,允许针对 Log4j1/Logback 类工作的现有应用程序继续工作。 ### REST-API方式查询日志 SeaTunnel 提供了一个 API,用于查询日志。 **使用样例:** - 获取所有节点jobId为`733584788375666689`的日志信息:`http://localhost:8080/logs/733584788375666689` - 获取所有节点日志列表:`http://localhost:8080/logs` - 获取所有节点日志列表以JSON格式返回:`http://localhost:8080/logs?format=json` - 获取日志文件内容:`http://localhost:8080/logs/job-898380162133917698.log` 有关详细信息,请参阅 [REST-API](rest-api-v2.md)。 ## SeaTunnel 日志配置 ### 定时删除旧日志 SeaTunnel 支持定时删除旧日志文件,以避免磁盘空间不足。您可以在 `seatunnel.yaml` 文件中添加以下配置: ```yaml seatunnel: engine: history-job-expire-minutes: 1440 telemetry: logs: scheduled-deletion-enable: true ``` - `history-job-expire-minutes`: 设置历史作业和日志的保留时间(单位:分钟)。系统将在指定的时间后自动清除过期的作业信息和日志文件。 - `scheduled-deletion-enable`: 启用定时清理功能,默认为 `true`。系统将在作业达到 `history-job-expire-minutes` 设置的过期时间后自动删除相关日志文件。关闭该功能后,日志将永久保留在磁盘上,需要用户自行管理,否则可能影响磁盘占用。建议根据需求合理配置。 ## 开发人员最佳实践 您可以通过调用 `org.slf4j.LoggerFactory#getLogger` 并以您的类的 `Class` 对象作为参数来创建 SLF4J 记录器。 当然您也可以使用 lombok 注解 `@Slf4j` 来实现同样的效果。 ```java import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class TestConnector { private static final Logger LOG = LoggerFactory.getLogger(TestConnector.class); public static void main(String[] args) { LOG.info("Hello world!"); } } ``` 为了最大限度地利用 SLF4J,建议使用其占位符机制。使用占位符可以避免不必要的字符串构造,以防日志级别设置得太高而导致消息无法记录。 占位符的语法如下: ```java LOG.info("This message contains {} placeholders. {}", 1, "key1"); ``` 占位符还可以与需要记录的异常结合使用 ```java try { // some code } catch (Exception e) { LOG.error("An {} occurred", "error", e); } ``` ================================================ FILE: docs/zh/engines/zeta/resource-isolation.md ================================================ --- sidebar_position: 9 --- # 资源隔离 SeaTunnel支持对每个实例添加`tag`, 然后在提交任务时可以在配置文件中使用`tag_filter`来选择任务将要运行的节点. ## 配置 1. 
更新`hazelcast.yaml`文件 ```yaml hazelcast: cluster-name: seatunnel network: rest-api: enabled: true endpoint-groups: CLUSTER_WRITE: enabled: true DATA: enabled: true join: tcp-ip: enabled: true member-list: - localhost port: auto-increment: false port: 5801 properties: hazelcast.invocation.max.retry.count: 20 hazelcast.tcp.join.port.try.count: 30 hazelcast.logging.type: log4j2 hazelcast.operation.generic.thread.count: 50 member-attributes: group: type: string value: platform team: type: string value: team1 ``` 在这个配置中, 我们通过`member-attributes`设置了`group=platform, team=team1`这样两个`tag` 2. 在任务的配置中添加`tag_filter`来选择你需要运行该任务的节点 ```hocon env { parallelism = 1 job.mode = "BATCH" tag_filter { group = "platform" team = "team1" } } source { FakeSource { plugin_output = "fake" parallelism = 1 schema = { fields { name = "string" } } } } transform { } sink { console { plugin_input="fake" } } ``` **注意:** - 当在任务的配置中, 没有添加`tag_filter`时, 会从所有节点中随机选择节点来运行任务. - 当`tag_filter`中存在多个过滤条件时, 会选择所有条件(key存在且value相等)全部匹配的节点, 当没有找到匹配的节点时, 会抛出 `NoEnoughResourceException`异常. ![img.png](../../../images/resource-isolation.png) 3. 更新运行中node的tags (可选) 获取具体的使用信息,请参考 [更新运行节点的tags](rest-api-v2.md) ================================================ FILE: docs/zh/engines/zeta/rest-api-v1.md ================================================ # RESTful API V1 :::caution warn 推荐使用v2版本的Rest API。 v1 版本已弃用,并将在将来删除。 我们已经默认关闭了v1版本的API,如果您需要使用v1版本,请在`hazelcast.yaml`文件中启用它。 ::: SeaTunnel有一个用于监控的API,可用于查询运行作业的状态和统计信息,以及最近完成的作业。监控API是RESTful风格的,它接受HTTP请求并使用JSON数据格式进行响应。 ## 概述 监控API是由运行的web服务提供的,它是节点运行的一部分,每个节点成员都可以提供rest API功能。 默认情况下,服务器禁用了RESTful API V1,可以通过在`hazelcast.yaml`文件中设置`rest-api.enabled`配置来启用它。 该服务监听端口为5801,该端口可以在hazelcast.yaml中配置,如下所示: ```yaml network: rest-api: enabled: true endpoint-groups: CLUSTER_WRITE: enabled: true DATA: enabled: true join: tcp-ip: enabled: true member-list: - localhost port: auto-increment: true port-count: 100 port: 5801 ``` ## API参考 ### 返回Zeta集群的概览
    GET /hazelcast/rest/maps/overview?tag1=value1&tag2=value2 (返回Zeta集群的概览。) #### 参数 > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |--------|------|------|--------------------------| > | tag键值对 | 否 | 字符串 | 一组标签值, 通过该标签值过滤满足条件的节点信息 | #### 响应 ```json { "projectVersion":"2.3.10-SNAPSHOT", "gitCommitAbbrev":"DeadD0d0", "totalSlot":"0", "unassignedSlot":"0", "works":"1", "runningJobs":"0", "finishedJobs":"0", "failedJobs":"0", "cancelledJobs":"0" } ``` **注意:** - 当你使用`dynamic-slot`时, 返回结果中的`totalSlot`和`unassignedSlot`将始终为0. 设置为固定的slot值后, 将正确返回集群中总共的slot数量以及未分配的slot数量. - 当添加标签过滤后, `works`, `totalSlot`, `unassignedSlot`将返回满足条件的节点的相关指标. 注意`runningJobs`等job相关指标为集群级别结果, 无法根据标签进行过滤.
    ------------------------------------------------------------------------------------------ ### 返回当前节点的线程堆栈信息。
    GET /hazelcast/rest/maps/thread-dump (返回当前节点的线程堆栈信息。) #### 参数 #### 响应 ```json [ { "threadName": "", "threadId": 0, "threadState": "", "stackTrace": "" } ] ```
    ------------------------------------------------------------------------------------------ ### 返回所有作业及其当前状态的概览
    GET /hazelcast/rest/maps/running-jobs (返回所有作业及其当前状态的概览。) #### 参数 #### 响应 ```json [ { "jobId": "", "jobName": "", "jobStatus": "", "envOptions": { }, "createTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "pluginJarsUrls": [ ], "isStartWithSavePoint": false, "metrics": { "sourceReceivedCount": "", "sinkWriteCount": "" } } ] ```
    ------------------------------------------------------------------------------------------ ### 返回作业的详细信息
    GET /hazelcast/rest/maps/job-info/:jobId (返回作业的详细信息。) #### 参数 > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |-------|------|------|--------| > | jobId | 是 | long | job id | #### 响应 ```json { "jobId": "", "jobName": "", "jobStatus": "", "createTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "metrics": { "SourceReceivedCount": "", "SourceReceivedQPS": "", "SourceReceivedBytes": "", "SourceReceivedBytesPerSeconds": "", "SinkWriteCount": "", "SinkWriteQPS": "", "SinkWriteBytes": "", "SinkWriteBytesPerSeconds": "", "SinkCommittedCount": "", "SinkCommittedQPS": "", "SinkCommittedBytes": "", "SinkCommittedBytesPerSeconds": "", "TableSourceReceivedCount": {}, "TableSourceReceivedBytes": {}, "TableSourceReceivedBytesPerSeconds": {}, "TableSourceReceivedQPS": {}, "TableSinkWriteCount": {}, "TableSinkWriteQPS": {}, "TableSinkWriteBytes": {}, "TableSinkWriteBytesPerSeconds": {}, "TableSinkCommittedCount": {}, "TableSinkCommittedQPS": {}, "TableSinkCommittedBytes": {}, "TableSinkCommittedBytesPerSeconds": {} }, "finishedTime": "", "errorMsg": null, "envOptions": { }, "pluginJarsUrls": [ ], "isStartWithSavePoint": false } ``` `jobId`, `jobName`, `jobStatus`, `createTime`, `jobDag`, `metrics` 字段总会返回. 
`envOptions`, `pluginJarsUrls`, `isStartWithSavePoint` 字段在Job处于RUNNING状态时会返回。 `finishedTime`, `errorMsg` 字段在Job结束时会返回,结束状态不为RUNNING,可能为FINISHED,也可能为CANCELED。 #### 指标字段说明 | 字段 | 说明 | | --- | --- | | SourceReceivedCount | 源端接收的行数 | | SourceReceivedQPS | 源端接收速率(行/秒) | | SourceReceivedBytes | 源端接收的字节数 | | SourceReceivedBytesPerSeconds | 源端接收速率(字节/秒) | | SinkWriteCount | Sink 写入尝试行数 | | SinkWriteQPS | Sink 写入尝试速率(行/秒) | | SinkWriteBytes | Sink 写入尝试字节数 | | SinkWriteBytesPerSeconds | Sink 写入尝试速率(字节/秒) | | SinkCommittedCount | checkpoint 成功后的 Sink 已提交行数 | | SinkCommittedQPS | Sink 已提交速率(行/秒) | | SinkCommittedBytes | checkpoint 成功后的 Sink 已提交字节数 | | SinkCommittedBytesPerSeconds | Sink 已提交速率(字节/秒) | | TableSourceReceived* | 按表汇总的源指标,键格式 `TableSourceReceivedXXX#<表>` | | TableSinkWrite* | 按表汇总的 Sink 写入尝试,键格式 `TableSinkWriteXXX#<表>` | | TableSinkCommitted* | 按表汇总的 Sink 已提交指标,键格式 `TableSinkCommittedXXX#<表>` | 当我们查询不到这个Job时,返回结果为: ```json { "jobId" : "" } ```
    ------------------------------------------------------------------------------------------ ### 返回作业的详细信息 此API已经弃用,请使用/hazelcast/rest/maps/job-info/:jobId替代。
    GET /hazelcast/rest/maps/running-job/:jobId (返回作业的详细信息。) #### 参数 > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |-------|------|------|--------| > | jobId | 是 | long | job id | #### 响应 ```json { "jobId": "", "jobName": "", "jobStatus": "", "createTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "metrics": { "sourceReceivedCount": "", "sinkWriteCount": "" }, "finishedTime": "", "errorMsg": null, "envOptions": { }, "pluginJarsUrls": [ ], "isStartWithSavePoint": false } ``` `jobId`, `jobName`, `jobStatus`, `createTime`, `jobDag`, `metrics` 字段总会返回. `envOptions`, `pluginJarsUrls`, `isStartWithSavePoint` 字段在Job处于RUNNING状态时会返回。 `finishedTime`, `errorMsg` 字段在Job结束时会返回,结束状态不为RUNNING,可能为FINISHED,也可能为CANCELED。 当我们查询不到这个Job时,返回结果为: ```json { "jobId" : "" } ```
    ------------------------------------------------------------------------------------------ ### 返回所有已完成的作业信息
    GET /hazelcast/rest/maps/finished-jobs/:state (返回所有已完成的作业信息。) #### 参数 > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |-------|----------|--------|-----------------------------------------------------------------------------------| > | state | optional | string | finished job status. `FINISHED`,`CANCELED`,`FAILED`,`SAVEPOINT_DONE`,`UNKNOWABLE` | #### 响应 ```json [ { "jobId": "", "jobName": "", "jobStatus": "", "errorMsg": null, "createTime": "", "finishTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "metrics": "" } ] ```
    ------------------------------------------------------------------------------------------ ### 返回系统监控信息
    GET /hazelcast/rest/maps/system-monitoring-information (返回系统监控信息。) #### 参数 #### 响应 ```json [ { "isMaster": "true", "host": "localhost", "port": "5801", "processors":"8", "physical.memory.total":"16.0G", "physical.memory.free":"16.3M", "swap.space.total":"0", "swap.space.free":"0", "heap.memory.used":"135.7M", "heap.memory.free":"440.8M", "heap.memory.total":"576.5M", "heap.memory.max":"3.6G", "heap.memory.used/total":"23.54%", "heap.memory.used/max":"3.73%", "minor.gc.count":"6", "minor.gc.time":"110ms", "major.gc.count":"2", "major.gc.time":"73ms", "load.process":"24.78%", "load.system":"60.00%", "load.systemAverage":"2.07", "thread.count":"117", "thread.peakCount":"118", "cluster.timeDiff":"0", "event.q.size":"0", "executor.q.async.size":"0", "executor.q.client.size":"0", "executor.q.client.query.size":"0", "executor.q.client.blocking.size":"0", "executor.q.query.size":"0", "executor.q.scheduled.size":"0", "executor.q.io.size":"0", "executor.q.system.size":"0", "executor.q.operations.size":"0", "executor.q.priorityOperation.size":"0", "operations.completed.count":"10", "executor.q.mapLoad.size":"0", "executor.q.mapLoadAllKeys.size":"0", "executor.q.cluster.size":"0", "executor.q.response.size":"0", "operations.running.count":"0", "operations.pending.invocations.percentage":"0.00%", "operations.pending.invocations.count":"0", "proxy.count":"8", "clientEndpoint.count":"0", "connection.active.count":"2", "client.connection.count":"0", "connection.count":"0" } ] ```
    ------------------------------------------------------------------------------------------ ### 提交作业
    POST /hazelcast/rest/maps/submit-job (如果作业提交成功,返回jobId和jobName。) #### 参数 > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |----------------------|----------|--------|-----------------------------------| > | jobId | optional | string | job id | > | jobName | optional | string | job name | > | isStartWithSavePoint | optional | string | if job is started with save point | #### 请求体 ```json { "env": { "job.mode": "batch" }, "source": [ { "plugin_name": "FakeSource", "plugin_output": "fake", "row.num": 100, "schema": { "fields": { "name": "string", "age": "int", "card": "int" } } } ], "transform": [ ], "sink": [ { "plugin_name": "Console", "plugin_input": ["fake"] } ] } ``` #### 响应 ```json { "jobId": 733584788375666689, "jobName": "rest_api_test" } ```
    ------------------------------------------------------------------------------------------ ### 批量提交作业
    POST /hazelcast/rest/maps/submit-jobs (如果作业提交成功,返回jobId和jobName。) #### 参数(在请求体中params字段中添加) > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |----------------------|----------|--------|-----------------------------------| > | jobId | optional | string | job id | > | jobName | optional | string | job name | > | isStartWithSavePoint | optional | string | if job is started with save point | #### 请求体 ```json [ { "params":{ "jobId":"123456", "jobName":"SeaTunnel-01" }, "env": { "job.mode": "batch" }, "source": [ { "plugin_name": "FakeSource", "plugin_output": "fake", "row.num": 1000, "schema": { "fields": { "name": "string", "age": "int", "card": "int" } } } ], "transform": [ ], "sink": [ { "plugin_name": "Console", "plugin_input": ["fake"] } ] }, { "params":{ "jobId":"1234567", "jobName":"SeaTunnel-02" }, "env": { "job.mode": "batch" }, "source": [ { "plugin_name": "FakeSource", "plugin_output": "fake", "row.num": 1000, "schema": { "fields": { "name": "string", "age": "int", "card": "int" } } } ], "transform": [ ], "sink": [ { "plugin_name": "Console", "plugin_input": ["fake"] } ] } ] ``` #### 响应 ```json [ { "jobId": "123456", "jobName": "SeaTunnel-01" },{ "jobId": "1234567", "jobName": "SeaTunnel-02" } ] ```
    ------------------------------------------------------------------------------------------ ### 停止作业
    POST /hazelcast/rest/maps/stop-job (如果作业成功停止,返回jobId。) #### 参数 | 参数名称 | 是否必传 | 参数类型 | 参数描述 | |------------------------|----------|----------|----------| | jobId | yes | long | 作业 ID | | isStopWithSavePoint | no | boolean | 是否通过 savepoint 方式停止作业 | | force | no | boolean | 是否强制停止作业(忽略 isStopWithSavePoint 参数) | #### 请求体 ```json { "jobId": 733584788375666689, "isStopWithSavePoint": false, "force": false } ``` #### 响应 ```json { "jobId": 733584788375666689 } ``` **Notes(注意事项):** - 如果作业状态为 DOING_SAVEPOINT 且保存点未成功完成,在启用 force 选项时执行的强制停止操作会将作业状态设置为 CANCELED。 - 强制停止可能导致检查点数据不完整或处于不一致状态,仅应在异常或非正常情况下使用。
    ------------------------------------------------------------------------------------------ ### 批量停止作业
    POST /hazelcast/rest/maps/stop-jobs (如果作业成功停止,返回jobId。) #### 请求体 ```json [ { "jobId": 881432421482889220, "isStopWithSavePoint": false, "force": false }, { "jobId": 881432456517910529, "isStopWithSavePoint": false, "force": false } ] ``` #### 响应 ```json [ { "jobId": 881432421482889220 }, { "jobId": 881432456517910529 } ] ```
    ------------------------------------------------------------------------------------------ ### 加密配置
    POST /hazelcast/rest/maps/encrypt-config (如果配置加密成功,则返回加密后的配置。) 有关自定义加密的更多信息,请参阅文档[配置-加密-解密](../../introduction/concepts/config-encryption-decryption.md). #### 请求体 ```json { "env": { "parallelism": 1, "shade.identifier":"base64" }, "source": [ { "plugin_name": "MySQL-CDC", "schema" : { "fields": { "name": "string", "age": "int" } }, "plugin_output": "fake", "parallelism": 1, "hostname": "127.0.0.1", "username": "seatunnel", "password": "seatunnel_password", "table-name": "inventory_vwyw0n" } ], "transform": [ ], "sink": [ { "plugin_name": "Clickhouse", "host": "localhost:8123", "database": "default", "table": "fake_all", "username": "seatunnel", "password": "seatunnel_password" } ] } ``` #### 响应 ```json { "env": { "parallelism": 1, "shade.identifier": "base64" }, "source": [ { "plugin_name": "MySQL-CDC", "schema": { "fields": { "name": "string", "age": "int" } }, "plugin_output": "fake", "parallelism": 1, "hostname": "127.0.0.1", "username": "c2VhdHVubmVs", "password": "c2VhdHVubmVsX3Bhc3N3b3Jk", "table-name": "inventory_vwyw0n" } ], "transform": [], "sink": [ { "plugin_name": "Clickhouse", "host": "localhost:8123", "database": "default", "table": "fake_all", "username": "c2VhdHVubmVs", "password": "c2VhdHVubmVsX3Bhc3N3b3Jk" } ] } ```
    ------------------------------------------------------------------------------------------ ### 更新运行节点的tags
    POST /hazelcast/rest/maps/update-tags (因为更新只能针对某个节点,因此需要使用该节点的ip:port进行更新;如果更新成功,则返回"success"信息。) #### 更新节点tags ##### 请求体 如果请求参数是`Map`对象,表示要更新当前节点的tags ```json { "tag1": "dev_1", "tag2": "dev_2" } ``` ##### 响应 ```json { "status": "success", "message": "update node tags done." } ``` #### 移除节点tags ##### 请求体 如果参数为空`Map`对象,表示要清除当前节点的tags ```json {} ``` ##### 响应 响应体将为: ```json { "status": "success", "message": "update node tags done." } ``` #### 请求参数异常 - 如果请求参数为空 ##### 响应 ```json { "status": "fail", "message": "Request body is empty." } ``` - 如果参数不是`Map`对象 ##### 响应 ```json { "status": "fail", "message": "Invalid JSON format in request body." } ```
    ------------------------------------------------------------------------------------------ ### 获取所有节点日志内容
    GET /hazelcast/rest/maps/logs/:jobId (返回日志列表。) #### 请求参数 #### 参数(在请求路径中添加) > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |----------------------|----------|--------|-----------------------------------| > | jobId | optional | string | job id | 当`jobId`为空时,返回所有节点的日志信息,否则返回指定`jobId`在所有节点的日志列表。 #### 响应 返回请求节点的日志列表、内容 #### 返回所有日志文件列表 如果你想先查看日志列表,可以通过`GET`请求获取日志列表,`http://localhost:5801/hazelcast/rest/maps/logs?format=json` ```json [ { "node": "localhost:5801", "logLink": "http://localhost:5801/hazelcast/rest/maps/logs/job-899485770241277953.log", "logName": "job-899485770241277953.log" }, { "node": "localhost:5801", "logLink": "http://localhost:5801/hazelcast/rest/maps/logs/job-899470314109468673.log", "logName": "job-899470314109468673.log" } ] ``` 当前支持的格式有`json`和`html`,默认为`html`。 #### 例子 获取所有节点jobId为`733584788375666689`的日志信息:`http://localhost:5801/hazelcast/rest/maps/logs/733584788375666689` 获取所有节点日志列表:`http://localhost:5801/hazelcast/rest/maps/logs` 获取所有节点日志列表以JSON格式返回:`http://localhost:5801/hazelcast/rest/maps/logs?format=json` 获取日志文件内容:`http://localhost:5801/hazelcast/rest/maps/logs/job-898380162133917698.log`
    ### 获取单节点日志内容
    GET /hazelcast/rest/maps/log (返回日志列表。) #### 响应 返回请求节点的日志列表 #### 例子 获取当前节点的日志列表:`http://localhost:5801/hazelcast/rest/maps/log` 获取日志文件内容:`http://localhost:5801/hazelcast/rest/maps/log/job-898380162133917698.log`
    ================================================ FILE: docs/zh/engines/zeta/rest-api-v2.md ================================================ # RESTful API V2 SeaTunnel有一个用于监控的API,可用于查询运行作业的状态和统计信息,以及最近完成的作业。监控API是RESTful风格的,它接受HTTP请求并使用JSON数据格式进行响应。 ## 概述 v2版本的API基于Jetty实现,与v1版本的接口规范相同。可以通过修改`seatunnel.yaml`中的配置项来指定端口和context-path,同时可以配置 `enable-dynamic-port` 开启动态端口(默认从 `port` 开始累加),默认为开启。如果`enable-dynamic-port`为`true`,我们将使用`port`到`port`+`port-range`范围内未使用的端口,默认范围是100。 ```yaml seatunnel: engine: http: enable-http: true port: 8080 enable-dynamic-port: true port-range: 100 ``` 同时也可以配置context-path,配置如下: ```yaml seatunnel: engine: http: enable-http: true port: 8080 context-path: /seatunnel ``` ## 开启 HTTPS 请参考 [security](security.md) ## API参考 ### 返回Zeta集群的概览
    GET /overview?tag1=value1&tag2=value2 (返回Zeta集群的概览。) #### 参数 > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |--------|------|------|--------------------------| > | tag键值对 | 否 | 字符串 | 一组标签值, 通过该标签值过滤满足条件的节点信息 | #### 响应 ```json { "projectVersion":"2.3.10-SNAPSHOT", "gitCommitAbbrev":"DeadD0d0", "totalSlot":"0", "unassignedSlot":"0", "works":"1", "runningJobs":"0", "pendingJobs":"0", "finishedJobs":"0", "failedJobs":"0", "cancelledJobs":"0" } ``` **注意:** - 当你使用`dynamic-slot`时, 返回结果中的`totalSlot`和`unassignedSlot`将始终为0. 设置为固定的slot值后, 将正确返回集群中总共的slot数量以及未分配的slot数量. - 当添加标签过滤后, `works`, `totalSlot`, `unassignedSlot`将返回满足条件的节点的相关指标. 注意`runningJobs`等job相关指标为集群级别结果, 无法根据标签进行过滤.
    ------------------------------------------------------------------------------------------ ### 查询作业及其当前状态的概览
    GET /running-jobs?page=1&rows=10 (查询作业及其当前状态的概览。) #### 参数 > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |------|------|------|------| > | page | 否 | int | 页号 | > | rows | 否 | int | 每页行数 | #### 响应 ```json [ { "jobId": "", "jobName": "", "jobStatus": "", "createTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "pluginJarsUrls": [ ], "isStartWithSavePoint": false, "metrics": { "sourceReceivedCount": "", "sinkWriteCount": "" } } ] ```
    ------------------------------------------------------------------------------------------ ### 查看 Pending 队列详细信息
    GET /pending-jobs?jobId=123&limit=10 (用于排查作业长时间处于 PENDING 的原因。) #### 参数 > | 参数名称 | 是否必传 | 参数类型 | 描述 | > |----------|----------|----------|--------------------------------| > | jobId | 可选 | long | 只查看指定作业的诊断信息。当同时提供 `jobId` 和 `limit` 时,`jobId` 优先生效,`limit` 将被忽略。 | > | limit | 可选 | integer | 限制返回的PENDING作业数量。当提供 `jobId` 参数时此参数将被忽略。 | > | pretty | 可选 | boolean | 传入 `true` 时返回格式化 JSON,并格式化时间戳。 | #### 响应 ```json { "queueSummary": { "size": 2, "scheduleStrategy": "WAIT", "oldestEnqueueTimestamp": 1717500000000, "newestEnqueueTimestamp": 1717500005000, "lackingTaskGroups": 6 }, "clusterSnapshot": { "totalSlots": 8, "freeSlots": 1, "assignedSlots": 7, "workerCount": 2, "workers": [ { "address": "10.0.0.8:5801", "tags": { "zone": "az1" }, "totalSlots": 4, "freeSlots": 0, "dynamicSlot": false, "cpuUsage": 0.83, "memUsage": 0.64, "runningJobIds": [ 1001, 1002 ] } ] }, "pendingJobs": [ { "jobId": 1003, "jobName": "cdc_mysql_to_es", "pendingSourceState": "SUBMIT", "jobStatus": "PENDING", "enqueueTimestamp": 1717500000000, "checkTime": 1717500005000, "waitDurationMs": 5000, "checkCount": 3, "totalTaskGroups": 16, "allocatedTaskGroups": 10, "lackingTaskGroups": 6, "failureReason": "REQUEST_FAILED", "failureMessage": "NoEnoughResourceException: can't apply resource request", "tagFilter": {}, "blockingJobIds": [ 1001 ], "pipelines": [ { "pipelineId": 1, "pipelineName": "Job job-name, Pipeline: [(1/2)]", "totalTaskGroups": 8, "allocatedTaskGroups": 5, "lackingTaskGroups": 3, "taskGroupDiagnostics": [ { "taskGroupLocation": { "jobId": 1003, "pipelineId": 1, "taskGroupId": 1 }, "taskFullName": "Source[0]", "allocated": false, "failureReason": "REQUEST_FAILED", "failureMessage": "NoEnoughResourceException: slot not enough" } ] } ], "lackingTaskGroupDiagnostics": [ { "taskGroupLocation": { "jobId": 1003, "pipelineId": 1, "taskGroupId": 1 }, "taskFullName": "Source[0]", "allocated": false, "failureReason": "REQUEST_FAILED", "failureMessage": "NoEnoughResourceException: slot not 
enough" } ] } ] } ``` 当 `pretty=true` 时,接口会返回格式化后的 JSON,并把 `oldestEnqueueTimestamp`、`newestEnqueueTimestamp`、`enqueueTimestamp`、`checkTime` 转为 `yyyy-MM-dd HH:mm:ss` 字符串,方便排查。 响应中包含: - **queueSummary**:Pending 队列整体信息总结 - `size`:当前排队的 Job 数量。 - `scheduleStrategy`:调度策略,决定资源不足时的处理方式。 - `oldestEnqueueTimestamp` / `newestEnqueueTimestamp`:最久/最新进入 Pending 队列 Job 的时间戳(毫秒)。 - `lackingTaskGroups`:尚未分配 Slot 的 TaskGroup 数量。**注意**:该值仅统计当前响应中返回的作业子集(即受 `limit` 参数限制或 `jobId` 过滤后的作业),而非整个 Pending 队列的完整统计。如需查看所有 Pending 作业的完整统计信息,请不带 `limit` 参数调用此接口。 - **clusterSnapshot**:当前集群的资源视图。 - `totalSlots` / `assignedSlots` / `freeSlots`:Slot 总数、已分配数、剩余数。 - `workerCount`:Worker 数量。 - `workers[]`: - `address`:Worker 地址(host:port)。 - `tags`:Worker 自带的标签。 - `totalSlots` / `freeSlots`:Worker 的 Slot 总数与剩余数。 - `dynamicSlot`:是否启用动态 Slot。 - `cpuUsage` / `memUsage`:系统负载采样(只有当 `slot-allocate-strategy: SYSTEM_LOAD` 才会有该值) - `runningJobIds[]`:当前占用 Worker Slot 的 JobId 列表。 - **pendingJobs[]**:队列中的每个 Job 的诊断信息。 - `jobId` / `jobName`:作业标识。 - `pendingSourceState`:取值:`SUBMIT`,`RESTORE`。 - `jobStatus`:物理计划记录的状态(固定为 `PENDING`)。 - `enqueueTimestamp`:进入 Pending 队列的时间。 - `checkTime`:最近一次Pending检查时间。 - `waitDurationMs`:等待时长(`checkTime - enqueueTimestamp`)。 - `checkCount`:已被调度线程检查的次数。 - `totalTaskGroups` / `allocatedTaskGroups` / `lackingTaskGroups`:Job 全部 TaskGroup 数量、已分配 Slot 的数量、缺少 Slot 的数量。 - `failureReason` / `failureMessage`:导致本次资源申请失败的归类及具体信息(如 `RESOURCE_NOT_ENOUGH`、`REQUEST_FAILED` 等)。 - `tagFilter`:Job 要求的 Worker 标签(若配置)。 - `blockingJobIds[]`:当前占用 Slot 的其他 JobId,用来分析资源竞争。 - `pipelines[]`:按 Pipeline 细分: - `pipelineId` / `pipelineName`: - `totalTaskGroups` / `allocatedTaskGroups` / `lackingTaskGroups`:Pipeline 里 TaskGroup 的总数、已分配 Slot 数量、缺少 Slot 的数量。 - `taskGroupDiagnostics[]`:每个 TaskGroup 的 Slot 请求状态: - `taskGroupLocation`(`jobId`, `pipelineId`, `taskGroupId`)。 - `taskFullName`:方便直接定位 source/sink。 - `allocated`:是否已经成功申请 Slot。 - `failureReason` / `failureMessage`:TaskGroup 层面的失败原因。 - 
`lackingTaskGroupDiagnostics[]`:聚合所有 `allocated=false` 的 TaskGroup,方便快速查看缺 Slot 的具体任务。
    ------------------------------------------------------------------------------------------ ### 返回作业的详细信息
    GET /job-info/:jobId (返回作业的详细信息。) #### 参数 > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |-------|------|------|--------| > | jobId | 是 | long | job id | #### 响应 ```json { "jobId": "", "jobName": "", "jobStatus": "", "createTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "metrics": { "IntermediateQueueSize": "", "SourceReceivedCount": "", "SourceReceivedQPS": "", "SourceReceivedBytes": "", "SourceReceivedBytesPerSeconds": "", "SinkWriteCount": "", "SinkWriteQPS": "", "SinkWriteBytes": "", "SinkWriteBytesPerSeconds": "", "SinkCommittedCount": "", "SinkCommittedQPS": "", "SinkCommittedBytes": "", "SinkCommittedBytesPerSeconds": "", "TableSourceReceivedCount": {}, "TableSourceReceivedBytes": {}, "TableSourceReceivedBytesPerSeconds": {}, "TableSourceReceivedQPS": {}, "TableSinkWriteCount": {}, "TableSinkWriteQPS": {}, "TableSinkWriteBytes": {}, "TableSinkWriteBytesPerSeconds": {}, "TableSinkCommittedCount": {}, "TableSinkCommittedQPS": {}, "TableSinkCommittedBytes": {}, "TableSinkCommittedBytesPerSeconds": {} }, "finishedTime": "", "errorMsg": null, "envOptions": { }, "pluginJarsUrls": [ ], "isStartWithSavePoint": false } ``` `jobId`, `jobName`, `jobStatus`, `createTime`, `jobDag`, `metrics` 字段总会返回. 
`envOptions`, `pluginJarsUrls`, `isStartWithSavePoint` 字段在Job处于RUNNING状态时会返回。 `finishedTime`, `errorMsg` 字段在Job结束时会返回,结束状态不为RUNNING,可能为FINISHED,也可能为CANCELED。 #### 指标字段说明 | 字段 | 说明 | | --- | --- | | IntermediateQueueSize | 中间队列的大小 | | SourceReceivedCount | 源端接收的行数 | | SourceReceivedQPS | 源端接收速率(行/秒) | | SourceReceivedBytes | 源端接收的字节数 | | SourceReceivedBytesPerSeconds | 源端接收速率(字节/秒) | | SinkWriteCount | Sink 写入尝试行数 | | SinkWriteQPS | Sink 写入尝试速率(行/秒) | | SinkWriteBytes | Sink 写入尝试字节数 | | SinkWriteBytesPerSeconds | Sink 写入尝试速率(字节/秒) | | SinkCommittedCount | checkpoint 成功后的 Sink 已提交行数 | | SinkCommittedQPS | Sink 已提交速率(行/秒) | | SinkCommittedBytes | checkpoint 成功后的 Sink 已提交字节数 | | SinkCommittedBytesPerSeconds | Sink 已提交速率(字节/秒) | | TableSourceReceived* | 按表汇总的源指标,键格式 `TableSourceReceivedXXX#<表>` | | TableSinkWrite* | 按表汇总的 Sink 写入尝试,键格式 `TableSinkWriteXXX#<表>` | | TableSinkCommitted* | 按表汇总的 Sink 已提交指标,键格式 `TableSinkCommittedXXX#<表>` | 当我们查询不到这个Job时,返回结果为: ```json { "jobId" : "" } ```
    ------------------------------------------------------------------------------------------ ### 返回作业的详细信息 此API已经弃用,请使用/job-info/:jobId替代。
    GET /running-job/:jobId (返回作业的详细信息。) #### 参数 > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |-------|------|------|--------| > | jobId | 是 | long | job id | #### 响应 ```json { "jobId": "", "jobName": "", "jobStatus": "", "createTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "metrics": { "sourceReceivedCount": "", "sinkWriteCount": "" }, "finishedTime": "", "errorMsg": null, "envOptions": { }, "pluginJarsUrls": [ ], "isStartWithSavePoint": false } ``` `jobId`, `jobName`, `jobStatus`, `createTime`, `jobDag`, `metrics` 字段总会返回. `envOptions`, `pluginJarsUrls`, `isStartWithSavePoint` 字段在Job处于RUNNING状态时会返回。 `finishedTime`, `errorMsg` 字段在Job结束时会返回,结束状态不为RUNNING,可能为FINISHED,也可能为CANCELED。 当我们查询不到这个Job时,返回结果为: ```json { "jobId" : "" } ```
    ------------------------------------------------------------------------------------------ ### 查询已完成的作业信息
    GET /finished-jobs/:state?page=1&rows=10 (查询已完成的作业信息。) #### 参数 > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |-------|----------|--------|-----------------------------------------------------------------------------------| > | state | optional | string | finished job status. `FINISHED`,`CANCELED`,`FAILED`,`SAVEPOINT_DONE`,`UNKNOWABLE` | > | page | 否 | int | 页号 | > | rows | 否 | int | 每页行数 | #### 响应 ```json [ { "jobId": "", "jobName": "", "jobStatus": "", "errorMsg": null, "createTime": "", "finishTime": "", "jobDag": { "jobId": "", "envOptions": [], "vertexInfoMap": [ { "vertexId": 1, "type": "", "vertexName": "", "tablePaths": [ "" ] } ], "pipelineEdges": {} }, "metrics": "" } ] ```
    ------------------------------------------------------------------------------------------ ### 返回系统监控信息
    GET /system-monitoring-information (返回系统监控信息。) #### 参数 #### 响应 ```json [ { "processors":"8", "physical.memory.total":"16.0G", "physical.memory.free":"16.3M", "swap.space.total":"0", "swap.space.free":"0", "heap.memory.used":"135.7M", "heap.memory.free":"440.8M", "heap.memory.total":"576.5M", "heap.memory.max":"3.6G", "heap.memory.used/total":"23.54%", "heap.memory.used/max":"3.73%", "minor.gc.count":"6", "minor.gc.time":"110ms", "major.gc.count":"2", "major.gc.time":"73ms", "load.process":"24.78%", "load.system":"60.00%", "load.systemAverage":"2.07", "thread.count":"117", "thread.peakCount":"118", "cluster.timeDiff":"0", "event.q.size":"0", "executor.q.async.size":"0", "executor.q.client.size":"0", "executor.q.client.query.size":"0", "executor.q.client.blocking.size":"0", "executor.q.query.size":"0", "executor.q.scheduled.size":"0", "executor.q.io.size":"0", "executor.q.system.size":"0", "executor.q.operations.size":"0", "executor.q.priorityOperation.size":"0", "operations.completed.count":"10", "executor.q.mapLoad.size":"0", "executor.q.mapLoadAllKeys.size":"0", "executor.q.cluster.size":"0", "executor.q.response.size":"0", "operations.running.count":"0", "operations.pending.invocations.percentage":"0.00%", "operations.pending.invocations.count":"0", "proxy.count":"8", "clientEndpoint.count":"0", "connection.active.count":"2", "client.connection.count":"0", "connection.count":"0" } ] ```
    ------------------------------------------------------------------------------------------ ### 提交作业
    POST /submit-job (如果作业提交成功,返回jobId和jobName。) #### 参数 > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |----------------------|----------|-----------------------------------|-----------------------------------| > | jobId | optional | string | job id | > | jobName | optional | string | job name | > | isStartWithSavePoint | optional | string | if job is started with save point | > | format | optional | string | 配置风格,支持json、hocon 和 sql,默认 json | #### 请求体 你可以选择用json、hocon或者sql的方式来传递请求体。 Json请求示例: ```json { "env": { "job.mode": "batch" }, "source": [ { "plugin_name": "FakeSource", "plugin_output": "fake", "row.num": 100, "schema": { "fields": { "name": "string", "age": "int", "card": "int" } } } ], "transform": [ ], "sink": [ { "plugin_name": "Console", "plugin_input": ["fake"] } ] } ``` Hocon请求示例: ```hocon env { job.mode = "batch" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { name = "string" age = "int" card = "int" } } } } transform { } sink { Console { plugin_input = "fake" } } ``` SQL请求示例: ```sql /* config env { parallelism = 2 job.mode = "BATCH" } */ CREATE TABLE fake_source ( id INT, name STRING, age INT ) WITH ( 'connector' = 'FakeSource', 'rows' = '[ { fields = [1, "Alice", 25], kind = INSERT }, { fields = [2, "Bob", 30], kind = INSERT } ]', 'schema' = '{ fields { id = "int", name = "string", age = "int" } }', 'type' = 'source' ); CREATE TABLE console_sink ( id INT, name STRING, age INT ) WITH ( 'connector' = 'Console', 'type' = 'sink' ); INSERT INTO console_sink SELECT * FROM fake_source; ``` #### 响应 ```json { "jobId": 733584788375666689, "jobName": "rest_api_test" } ```
    ------------------------------------------------------------------------------------------ ### 提交作业来源上传配置文件
    POST /submit-job/upload (如果作业提交成功,返回jobId和jobName。) #### 参数 > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |----------------------|----------|-----------------------------------|-----------------------------------| > | jobId | optional | string | job id | > | jobName | optional | string | job name | > | isStartWithSavePoint | optional | string | if job is started with save point | #### 请求体 上传文件key的名称是config_file,支持以下格式: - `.json` 文件:按照 JSON 格式解析 - `.conf` 或 `.config` 文件:按照 HOCON 格式解析 - `.sql` 文件:按照 SQL 格式解析,支持 CREATE TABLE 和 INSERT INTO 语法 curl Example ```bash # 上传 HOCON 配置文件 curl --location 'http://127.0.0.1:8080/submit-job/upload' --form 'config_file=@"/temp/fake_to_console.conf"' # 上传 SQL 配置文件 curl --location 'http://127.0.0.1:8080/submit-job/upload' --form 'config_file=@"/temp/job.sql"' ``` #### 响应 ```json { "jobId": 733584788375666689, "jobName": "SeaTunnel_Job" } ```
    ------------------------------------------------------------------------------------------ ### 批量提交作业
    POST /submit-jobs (如果作业提交成功,返回jobId和jobName。) #### 参数(在请求体中params字段中添加) > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |----------------------|----------|--------|-----------------------------------| > | jobId | optional | string | job id | > | jobName | optional | string | job name | > | isStartWithSavePoint | optional | string | if job is started with save point | #### 请求体 ```json [ { "params":{ "jobId":"123456", "jobName":"SeaTunnel-01" }, "env": { "job.mode": "batch" }, "source": [ { "plugin_name": "FakeSource", "plugin_output": "fake", "row.num": 1000, "schema": { "fields": { "name": "string", "age": "int", "card": "int" } } } ], "transform": [ ], "sink": [ { "plugin_name": "Console", "plugin_input": ["fake"] } ] }, { "params":{ "jobId":"1234567", "jobName":"SeaTunnel-02" }, "env": { "job.mode": "batch" }, "source": [ { "plugin_name": "FakeSource", "plugin_output": "fake", "row.num": 1000, "schema": { "fields": { "name": "string", "age": "int", "card": "int" } } } ], "transform": [ ], "sink": [ { "plugin_name": "Console", "plugin_input": ["fake"] } ] } ] ``` #### 响应 ```json [ { "jobId": "123456", "jobName": "SeaTunnel-01" },{ "jobId": "1234567", "jobName": "SeaTunnel-02" } ] ```
    ------------------------------------------------------------------------------------------ ### 停止作业
    POST /stop-job (如果作业成功停止,返回jobId。) #### 参数 | 参数名称 | 是否必传 | 参数类型 | 参数描述 | |------------------------|----------|----------|----------| | jobId | yes | long | 作业 ID | | isStopWithSavePoint | no | boolean | 是否通过 savepoint 方式停止作业 | | force | no | boolean | 是否强制停止作业(忽略 isStopWithSavePoint 参数) | #### 请求体 ```json { "jobId": 733584788375666689, "isStopWithSavePoint": false, "force": false } ``` #### 响应 ```json { "jobId": 733584788375666689 } ``` **Notes(注意事项):** - 如果作业状态为 DOING_SAVEPOINT 且保存点未成功完成,在启用 force 选项时执行的强制停止操作会将作业状态设置为 CANCELED。 - 强制停止可能导致检查点数据不完整或处于不一致状态,仅应在异常或非正常情况下使用。
    ------------------------------------------------------------------------------------------ ### 批量停止作业
    POST /stop-jobs (如果作业成功停止,返回jobId。) #### 请求体 ```json [ { "jobId": 881432421482889220, "isStopWithSavePoint": false, "force": false }, { "jobId": 881432456517910529, "isStopWithSavePoint": false, "force": false } ] ``` #### 响应 ```json [ { "jobId": 881432421482889220 }, { "jobId": 881432456517910529 } ] ```
    ------------------------------------------------------------------------------------------ ### 加密配置
    POST /encrypt-config (如果配置加密成功,则返回加密后的配置。) 有关自定义加密的更多信息,请参阅文档[配置-加密-解密](../../introduction/concepts/config-encryption-decryption.md). #### 请求体 ```json { "env": { "parallelism": 1, "shade.identifier":"base64" }, "source": [ { "plugin_name": "MySQL-CDC", "schema" : { "fields": { "name": "string", "age": "int" } }, "plugin_output": "fake", "parallelism": 1, "hostname": "127.0.0.1", "username": "seatunnel", "password": "seatunnel_password", "table-name": "inventory_vwyw0n" } ], "transform": [ ], "sink": [ { "plugin_name": "Clickhouse", "host": "localhost:8123", "database": "default", "table": "fake_all", "username": "seatunnel", "password": "seatunnel_password" } ] } ``` #### 响应 ```json { "env": { "parallelism": 1, "shade.identifier": "base64" }, "source": [ { "plugin_name": "MySQL-CDC", "schema": { "fields": { "name": "string", "age": "int" } }, "plugin_output": "fake", "parallelism": 1, "hostname": "127.0.0.1", "username": "c2VhdHVubmVs", "password": "c2VhdHVubmVsX3Bhc3N3b3Jk", "table-name": "inventory_vwyw0n" } ], "transform": [], "sink": [ { "plugin_name": "Clickhouse", "host": "localhost:8123", "database": "default", "table": "fake_all", "username": "c2VhdHVubmVs", "password": "c2VhdHVubmVsX3Bhc3N3b3Jk" } ] } ```
    ------------------------------------------------------------------------------------------ ### 更新运行节点的tags
    POST /update-tags 因为更新只能针对某个节点,因此需要使用当前节点的 ip:port 进行更新(如果更新成功,则返回"success"信息) #### 更新节点tags ##### 请求体 如果请求参数是`Map`对象,表示要更新当前节点的tags ```json { "tag1": "dev_1", "tag2": "dev_2" } ``` ##### 响应 ```json { "status": "success", "message": "update node tags done." } ``` #### 移除节点tags ##### 请求体 如果参数为空`Map`对象,表示要清除当前节点的tags ```json {} ``` ##### 响应 响应体将为: ```json { "status": "success", "message": "update node tags done." } ``` #### 请求参数异常 - 如果请求参数为空 ##### 响应 ```json { "status": "fail", "message": "Request body is empty." } ``` - 如果参数不是`Map`对象 ##### 响应 ```json { "status": "fail", "message": "Invalid JSON format in request body." } ```
    ------------------------------------------------------------------------------------------ ### 获取所有节点日志内容
    GET /logs/:jobId (返回日志列表。) #### 请求参数 #### 参数(在请求体中params字段中添加) > | 参数名称 | 是否必传 | 参数类型 | 参数描述 | > |----------------------|----------|--------|-----------------------------------| > | jobId | optional | string | job id | 当`jobId`为空时,返回所有节点的日志信息,否则返回指定`jobId`在所有节点的日志列表。 #### 响应 返回请求节点的日志列表、内容 #### 返回所有日志文件列表 如果你想先查看日志列表,可以通过`GET`请求获取日志列表,`http://localhost:8080/logs?format=json` ```json [ { "node": "localhost:8080", "logLink": "http://localhost:8080/logs/job-899485770241277953.log", "logName": "job-899485770241277953.log" }, { "node": "localhost:8080", "logLink": "http://localhost:8080/logs/job-899470314109468673.log", "logName": "job-899470314109468673.log" } ] ``` 当前支持的格式有`json`和`html`,默认为`html`。 #### 例子 获取所有节点jobId为`733584788375666689`的日志信息:`http://localhost:8080/logs/733584788375666689` 获取所有节点日志列表:`http://localhost:8080/logs` 获取所有节点日志列表以JSON格式返回:`http://localhost:8080/logs?format=json` 获取日志文件内容:`http://localhost:8080/logs/job-898380162133917698.log`
    ### 获取单节点日志内容
    GET /log (返回日志列表。) #### 响应 返回请求节点的日志列表 #### 例子 获取当前节点的日志列表:`http://localhost:5801/log` 获取日志文件内容:`http://localhost:5801/log/job-898380162133917698.log`
    ### 获取节点指标信息
    GET /metrics GET /openmetrics 你需要先打开`Telemetry`才能获取集群指标信息。否则将返回空信息。 更多关于`Telemetry`的信息可以在[Telemetry](telemetry.md)文档中找到。
    ### 获取作业 Checkpoint 概览
    GET /jobs/checkpoints/:jobId (返回指定作业下所有 Pipeline 的 Checkpoint 概览。) #### 参数 路径参数 `jobId`:必填,作业 ID。 #### 响应示例 ```json { "jobId": "1234567890", "updatedAt": 1720000000123, "pipelines": [ { "pipelineId": 1, "counts": { "triggered": 10, "completed": 8, "failed": 1, "inProgress": 1, "restored": 2 }, "latestCompleted": { "checkpointId": 9, "checkpointType": "CHECKPOINT_TYPE", "status": "COMPLETED", "triggerTimestamp": 1720000000000, "completedTimestamp": 1720000000450, "durationMillis": 450, "stateSize": 128934 }, "latestFailed": { "checkpointId": 8, "checkpointType": "CHECKPOINT_TYPE", "status": "FAILED", "triggerTimestamp": 1719999995000, "failureReason": "CHECKPOINT_EXPIRED" }, "latestSavepoint": null, "inProgress": [ { "checkpointId": 10, "checkpointType": "CHECKPOINT_TYPE", "triggerTimestamp": 1720000005000, "acknowledged": 2, "total": 4 } ], "history": [ { "pipelineId": 1, "checkpoint": { "checkpointId": 9, "checkpointType": "CHECKPOINT_TYPE", "status": "COMPLETED", "triggerTimestamp": 1720000000000, "completedTimestamp": 1720000000450, "durationMillis": 450, "stateSize": 128934 } } ] } ] } ```
    #### 字段说明 | 字段 | 描述 | | --- | --- | | `jobId` | 作业 ID。 | | `updatedAt` | 概览最近刷新时间(毫秒时间戳)。 | | `pipelines` | pipeline 统计列表。 | | `pipelines[].pipelineId` | pipeline ID。 | | `pipelines[].counts.triggered/completed/failed/inProgress/restored` | Checkpoint 统计:
    - `triggered`:自作业启动以来触发次数。
    - `completed`:成功完成次数。
    - `failed`:失败次数。
    - `inProgress`:当前正在执行的 checkpoint 数量。
    - `restored`:触发恢复(包括 savepoint 恢复)的次数。 | | `pipelines[].latestCompleted/latestFailed/latestSavepoint` | 最近一次成功/失败/保存点 checkpoint 元信息(字段同“Checkpoint 信息字段”表)。 | | `pipelines[].inProgress` | 进行中的 checkpoint 列表,如下所示:
    - `checkpointId`:当前执行中的 checkpoint 编号。
    - `checkpointType`:类型(普通 checkpoint、savepoint 等)。
    - `triggerTimestamp`:该 checkpoint 触发时间(毫秒)。
    - `acknowledged`:已完成 ACK 的 subtask 数。
    - `total`:该 pipeline 中需要 ACK 的 subtask 总数。 | | `pipelines[].history` | 环形缓冲中的历史记录(默认保留 32 条),每条包含 `pipelineId` 和对应的 checkpoint 元信息,按触发时间倒序。 | Checkpoint 信息字段: | 字段 | 描述 | | --- |-----------------------------------------| | `checkpointId` | checkpoint 编号。 | | `checkpointType` | checkpoint 类型。 | | `status` | 状态:`COMPLETED` / `FAILED` / `CANCELED`。 | | `triggerTimestamp` | 触发时间(毫秒)。 | | `completedTimestamp` | 完成时间(毫秒,成功时存在)。 | | `durationMillis` | 耗时(毫秒)。 | | `stateSize` | 状态大小(字节)。 | | `failureReason` | 失败/取消原因,可能为空。 | ### 获取作业 Checkpoint 历史
    GET /jobs/checkpoints/history/:jobId (返回作业的 Checkpoint 历史记录。) #### 参数 | 参数 | 说明 | | --- | --- | | `jobId` | 必填,作业 ID。 | | `pipelineId` | 可选,按 pipeline 过滤。 | | `limit` | 可选,限制返回条数,默认 20。 | | `status` | 可选,支持 `COMPLETED`、`FAILED`、`CANCELED`。 | #### 响应示例 ```json [ { "pipelineId": 1, "checkpoint": { "checkpointId": 9, "checkpointType": "CHECKPOINT_TYPE", "status": "COMPLETED", "triggerTimestamp": 1720000000000, "completedTimestamp": 1720000000450, "durationMillis": 450, "stateSize": 128934 } }, { "pipelineId": 1, "checkpoint": { "checkpointId": 8, "checkpointType": "CHECKPOINT_TYPE", "status": "FAILED", "triggerTimestamp": 1719999995000, "failureReason": "CHECKPOINT_EXPIRED" } } ] ```
    ================================================ FILE: docs/zh/engines/zeta/security.md ================================================ --- sidebar_position: 16 --- # Security ## Basic 认证 您可以通过开启 Basic 认证来保护您的 Web UI。这将要求用户在访问 Web 界面时输入用户名和密码。 | 参数名称 | 是否必填 | 参数描述 | |--------|---------|--------| | `enable-basic-auth` | 否 | 是否开启Basic 认证,默认为 `false` | | `basic-auth-username` | 否 | Basic 认证的用户名,默认为 `admin` | | `basic-auth-password` | 否 | Basic 认证的密码,默认为 `admin` | ```yaml seatunnel: engine: http: enable-http: true port: 8080 enable-basic-auth: true basic-auth-username: "your_username" basic-auth-password: "your_password" ``` ## HTTPS 配置 您可以通过开启 HTTPS 来保护您的 API 服务。HTTP 和 HTTPS 可同时开启,也可以只开启其中一个。 | 参数名称 | 是否必填 | 参数描述 | |--------|---------|--------| | `enable-http` | 否 | 是否开启 HTTP 服务,默认为 `true` | | `port` | 否 | HTTP 服务端口,默认为 `8080` | | `enable-https` | 否 | 是否开启 HTTPS 服务,默认为 `false` | | `https-port` | 否 | HTTPS 服务端口,默认为 `8443` | | `key-store-path` | 当 `enable-https` 为 `true` 时必填 | KeyStore 文件路径,用于存储服务器私钥和证书 | | `key-store-password` | 当 `enable-https` 为 `true` 时必填 | KeyStore 密码 | | `key-manager-password` | 当 `enable-https` 为 `true` 时必填 | KeyManager 密码,通常与 KeyStore 密码相同 | | `trust-store-path` | 否 | TrustStore 文件路径,用于验证客户端证书 | | `trust-store-password` | 否 | TrustStore 密码 | **注意**:当 `trust-store-path` 和 `trust-store-password` 配置项不为空时,将启用双向 SSL 认证(客户端认证),要求客户端提供有效证书。 ```yaml seatunnel: engine: http: enable-http: true port: 8080 enable-https: true https-port: 8443 key-store-path: "${YOUR_KEY_STORE_PATH}" key-store-password: "${YOUR_KEY_STORE_PASSWORD}" key-manager-password: "${YOUR_KEY_MANAGER_PASSWORD}" # 可选:双向认证 trust-store-path: "${YOUR_TRUST_STORE_PATH}" trust-store-password: "${YOUR_TRUST_STORE_PASSWORD}" ``` ### 生成密钥样例 ```shell #!/bin/bash # 定义项目根目录 PROJECT_DIR="/Users/mac/IdeaProjects/data" # 定义密码 SERVER_KEYSTORE_PASSWORD="server_keystore_password" SERVER_KEY_PASSWORD="server_keystore_password" CLIENT_KEYSTORE_PASSWORD="client_keystore_password" 
CLIENT_KEY_PASSWORD="client_keystore_password" SERVER_TRUSTSTORE_PASSWORD="server_truststore_password" CLIENT_TRUSTSTORE_PASSWORD="client_truststore_password" # 生成服务端密钥库 keytool -genkeypair \ -alias server \ -keyalg RSA \ -keysize 2048 \ -validity 365 \ -keystore "$PROJECT_DIR/server_keystore.jks" \ -storepass "$SERVER_KEYSTORE_PASSWORD" \ -keypass "$SERVER_KEY_PASSWORD" \ -dname "CN=localhost,OU=IT,O=MyCompany,L=Shanghai,ST=Shanghai,C=CN" # 导出服务端证书 keytool -exportcert \ -alias server \ -keystore "$PROJECT_DIR/server_keystore.jks" \ -storepass "$SERVER_KEYSTORE_PASSWORD" \ -file "$PROJECT_DIR/server.crt" # 生成客户端密钥库 keytool -genkeypair \ -alias client \ -keyalg RSA \ -keysize 2048 \ -validity 365 \ -keystore "$PROJECT_DIR/client_keystore.jks" \ -storepass "$CLIENT_KEYSTORE_PASSWORD" \ -keypass "$CLIENT_KEY_PASSWORD" \ -dname "CN=client,OU=IT,O=MyCompany,L=Shanghai,ST=Shanghai,C=CN" # 导出客户端证书 keytool -exportcert \ -alias client \ -keystore "$PROJECT_DIR/client_keystore.jks" \ -storepass "$CLIENT_KEYSTORE_PASSWORD" \ -file "$PROJECT_DIR/client.crt" # 创建服务端信任库并导入客户端证书 keytool -importcert \ -alias client \ -file "$PROJECT_DIR/client.crt" \ -keystore "$PROJECT_DIR/server_truststore.jks" \ -storepass "$SERVER_TRUSTSTORE_PASSWORD" \ -noprompt # 创建客户端信任库并导入服务端证书 keytool -importcert \ -alias server \ -file "$PROJECT_DIR/server.crt" \ -keystore "$PROJECT_DIR/client_truststore.jks" \ -storepass "$CLIENT_TRUSTSTORE_PASSWORD" \ -noprompt ``` ================================================ FILE: docs/zh/engines/zeta/separated-cluster-deployment.md ================================================ --- sidebar_position: 6 --- # 部署 SeaTunnel Engine 分离模式集群 SeaTunnel Engine 的Master服务和Worker服务分离,每个服务单独一个进程。Master节点只负责作业调度,RESTful API,任务提交等,Imap数据只存储在Master节点中。Worker节点只负责任务的执行,不参与选举成为master,也不存储Imap数据。 在所有Master节点中,同一时间只有一个Master节点工作,其他Master节点处于standby状态。当当前Master节点宕机或心跳超时,会从其它Master节点中选举出一个新的Master Active节点。 这是最推荐的一种使用方式,在该模式下Master的负载会很小,Master有更多的资源用来进行作业的调度,任务的容错指标监控以及提供rest 
api服务等,会有更高的稳定性。同时Worker节点不存储Imap的数据,所有的Imap数据都存储在Master节点中,即使Worker节点负载高或者挂掉,也不会导致Imap数据重新分布。 ## 1. 下载 [下载和制作SeaTunnel安装包](download-seatunnel.md) ## 2 配置 SEATUNNEL_HOME 您可以通过添加 `/etc/profile.d/seatunnel.sh` 文件来配置 `SEATUNNEL_HOME` 。`/etc/profile.d/seatunnel.sh` 的内容如下: ``` export SEATUNNEL_HOME=${seatunnel install path} export PATH=$PATH:$SEATUNNEL_HOME/bin ``` ## 3. 配置 Master 节点 JVM 选项 Master节点的JVM参数在`$SEATUNNEL_HOME/config/jvm_master_options`文件中配置。 ```shell # JVM Heap -Xms2g -Xmx2g # JVM Dump -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-server # Metaspace -XX:MaxMetaspaceSize=2g # G1GC -XX:+UseG1GC ``` Worker节点的JVM参数在`$SEATUNNEL_HOME/config/jvm_worker_options`文件中配置。 ```shell # JVM Heap -Xms2g -Xmx2g # JVM Dump -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/seatunnel/dump/zeta-server # Metaspace -XX:MaxMetaspaceSize=2g # G1GC -XX:+UseG1GC ``` ## 4. 配置 SeaTunnel Engine SeaTunnel Engine 提供许多功能,需要在 `seatunnel.yaml` 中进行配置。. ### 4.1 Imap中数据的备份数设置(该参数在Worker节点无效) SeaTunnel Engine 基于 [Hazelcast IMDG](https://docs.hazelcast.com/imdg/4.1/) 实现集群管理。集群的状态数据(作业运行状态、资源状态)存储在 [Hazelcast IMap](https://docs.hazelcast.com/imdg/4.1/data-structures/map)。 存储在 Hazelcast IMap 中的数据将在集群的所有节点上分布和存储。Hazelcast 会分区存储在 Imap 中的数据。每个分区可以指定备份数量。 因此,SeaTunnel Engine 可以实现集群 HA,无需使用其他服务(例如 zookeeper)。 `backup count` 是定义同步备份数量的参数。例如,如果设置为 1,则分区的备份将放置在一个其他成员上。如果设置为 2,则将放置在两个其他成员上。 我们建议 `backup-count` 的值为 `max(1, min(5, N/2))`。 `N` 是集群节点的数量。 ```yaml seatunnel: engine: backup-count: 1 # 其他配置 ``` :::tip 由于在分离集群模式下,Worker节点不存储Imap数据,因此Worker节点的`backup-count`配置无效。如果Master和Worker进程在同一个机器上启动,Master和Worker会共用`seatunnel.yaml`配置文件,此时Worker节点服务会忽略`backup-count`配置。 ::: ### 4.2 Slot配置(该参数在Master节点无效) Slot数量决定了集群节点可以并行运行的任务组数量。一个任务需要的Slot的个数公式为 N = 2 + P(任务配置的并行度)。 默认情况下SeaTunnel Engine的slot个数为动态,即不限制个数。 我们建议slot的个数设置为节点CPU核心数的2倍, 这也是当 `dynamic-slot` 设置为 false 且未设置 `slot-num` 时的默认值。 动态slot个数(默认)配置如下: ```yaml seatunnel: engine: slot-service: dynamic-slot: true # 其他配置 ``` 
静态slot个数配置如下: ```yaml seatunnel: engine: slot-service: dynamic-slot: false slot-num: 20 ``` :::tip 由于在分离集群模式下,Master节点不运行任务,所以Master服务不会启动Slot服务,因此Master节点的`slot-service`配置无效。如果Master和Worker进程在同一个机器上启动,Master和Worker会共用`seatunnel.yaml`配置文件,此时Master节点服务会忽略`slot-service`配置。 ::: ### 4.3 检查点管理器(该参数在Worker节点无效) 与 Flink 一样,SeaTunnel Engine 支持 Chandy–Lamport 算法。因此,可以实现无数据丢失和重复的数据同步。 **interval** 两个检查点之间的间隔,单位是毫秒。如果在作业配置文件的 `env` 中配置了 `checkpoint.interval` 参数,将以作业配置文件中设置的为准。 **timeout** 检查点的超时时间。如果在超时时间内无法完成检查点,则会触发检查点失败,作业失败。如果在作业的配置文件的`env`中配置了`checkpoint.timeout`参数,将以作业配置文件中设置的为准。 **min-pause** 连续检查点之间的最小暂停时间(以毫秒为单位),确保检查点不会频繁触发。 示例 ```yaml seatunnel: engine: backup-count: 1 print-execution-info-interval: 10 slot-service: dynamic-slot: true checkpoint: interval: 300000 timeout: 10000 min-pause: 5000 ``` **checkpoint storage** 检查点是一种容错恢复机制。这种机制确保程序在运行时,即使突然遇到异常,也能自行恢复。检查点定时触发,每次检查点进行时每个Task都会被要求将自身的状态信息(比如读取kafka时读取到了哪个offset)上报给检查点线程,由该线程写入一个分布式存储(或共享存储)。当任务失败然后自动容错恢复时,或者通过seatunnel.sh -r 指令恢复之前被暂停的任务时,会从检查点存储中加载对应作业的状态信息,并基于这些状态信息进行作业的恢复。 如果集群的节点大于1,检查点存储必须是一个分布式存储,或者共享存储,这样才能保证任意节点挂掉后依然可以在另一个节点加载到存储中的任务状态信息。 :::tip 检查点配置只有Master服务才会读取,Worker服务不会读取检查点配置。如果Master和Worker进程在同一个机器上启动,Master和Worker会共用`seatunnel.yaml`配置文件,此时Worker节点服务会忽略`checkpoint`配置。 ::: 有关检查点存储的信息,您可以查看 [Checkpoint Storage](checkpoint-storage.md) ### 4.4 历史作业过期配置 每个完成的作业的信息,如状态、计数器和错误日志,都存储在 IMap 对象中。随着运行作业数量的增加,内存会增加,最终内存将溢出。因此,您可以调整 `history-job-expire-minutes` 参数来解决这个问题。此参数的时间单位是分钟。默认值是 1440 分钟,即一天。 示例 ```yaml seatunnel: engine: history-job-expire-minutes: 1440 ``` ### 4.5 类加载器缓存模式 此配置主要解决不断创建和尝试销毁类加载器所导致的资源泄漏问题。 如果您遇到与metaspace空间溢出相关的异常,您可以尝试启用此配置。 为了减少创建类加载器的频率,在启用此配置后,SeaTunnel 在作业完成时不会尝试释放相应的类加载器,以便它可以被后续作业使用,也就是说,当运行作业中使用的 Source/Sink 连接器类型不是太多时,它更有效。 默认值是 true。 示例 ```yaml seatunnel: engine: classloader-cache-mode: true ``` ### 4.6 IMap持久化配置(该参数在Worker节点无效) :::tip 由于在分离集群模式下,只有Master节点存储Imap数据,Worker节点不存储Imap数据,所以Worker服务不会读取该参数项。 ::: 在SeaTunnel中,我们使用IMap(一种分布式的Map,可以实现数据跨节点跨进程的写入的读取 有关详细信息,请参阅 
[Hazelcast Map](https://docs.hazelcast.com/imdg/4.2/data-structures/map)) 来存储每个任务及其task的状态,以便在任务所在节点宕机后,可以在其他节点上获取到任务之前的状态信息,从而恢复任务实现任务的容错。 默认情况下Imap的信息只是存储在内存中,我们可以设置Imap数据的复本数,具体可参考(4.1 Imap中数据的备份数设置),如果复本数是2,代表每个数据会同时存储在2个不同的节点中。一旦节点宕机,Imap中的数据会重新在其它节点上自动补充到设置的复本数。但是当所有节点都被停止后,Imap中的数据会丢失。当集群节点再次启动后,所有之前正在运行的任务都会被标记为失败,需要用户手工通过seatunnel.sh -r 指令恢复运行。 为了解决这个问题,我们可以将Imap中的数据持久化到外部存储中,如HDFS、OSS等。这样即使所有节点都被停止,Imap中的数据也不会丢失,当集群节点再次启动后,所有之前正在运行的任务都会被自动恢复。 下面介绍如何使用 MapStore 持久化配置。有关详细信息,请参阅 [Hazelcast Map](https://docs.hazelcast.com/imdg/4.2/data-structures/map) **type** imap 持久化的类型,目前仅支持 `hdfs`。 **namespace** 它用于区分不同业务的数据存储位置,如 OSS 存储桶名称。 **clusterName** 此参数主要用于集群隔离, 我们可以使用它来区分不同的集群,如 cluster1、cluster2,这也用于区分不同的业务。 **fs.defaultFS** 我们使用 hdfs api 读写文件,因此使用此存储需要提供 hdfs 配置。 如果您使用 HDFS,可以像这样配置: ```yaml map: engine*: map-store: enabled: true initial-mode: EAGER factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory properties: type: hdfs namespace: /tmp/seatunnel/imap clusterName: seatunnel-cluster storage.type: hdfs fs.defaultFS: hdfs://localhost:9000 ``` 如果没有 HDFS,并且您的集群只有一个节点,您可以像这样配置使用本地文件: ```yaml map: engine*: map-store: enabled: true initial-mode: EAGER factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory properties: type: hdfs namespace: /tmp/seatunnel/imap clusterName: seatunnel-cluster storage.type: hdfs fs.defaultFS: file:/// ``` 如果您使用 OSS,可以像这样配置: ```yaml map: engine*: map-store: enabled: true initial-mode: EAGER factory-class-name: org.apache.seatunnel.engine.server.persistence.FileMapStoreFactory properties: type: hdfs namespace: /tmp/seatunnel/imap clusterName: seatunnel-cluster storage.type: oss block.size: block size(bytes) oss.bucket: oss://bucket name/ fs.oss.accessKeyId: OSS access key id fs.oss.accessKeySecret: OSS access key secret fs.oss.endpoint: OSS endpoint ``` 注意:使用OSS 时,确保 lib目录下有这几个jar. 
``` aliyun-sdk-oss-3.13.2.jar hadoop-aliyun-3.3.6.jar jdom2-2.0.6.jar netty-buffer-4.1.89.Final.jar netty-common-4.1.89.Final.jar seatunnel-hadoop3-3.1.4-uber.jar ``` ### 4.7 作业调度策略 当资源不足时,作业调度策略可以配置为以下两种模式: 1. `WAIT`:等待资源可用。 2. `REJECT`:拒绝作业,默认值。 示例 ```yaml seatunnel: engine: job-schedule-strategy: WAIT ``` 当`dynamic-slot: true`时,`job-schedule-strategy: WAIT` 配置会失效,将被强制修改为`job-schedule-strategy: REJECT`,因为动态Slot时该参数没有意义,可以直接提交。 ### 4.8 Coordinator Service CoordinatorService 提供了每个作业从 LogicalDag 到 ExecutionDag,再到 PhysicalDag 的生成流程, 并最终创建作业的 JobMaster 进行作业的调度执行和状态监控 **core-thread-num** 配置 CoordinatorService 线程池核心线程数量 **max-thread-num** 同时可执行的最大作业数量 Example ```yaml coordinator-service: core-thread-num: 30 max-thread-num: 1000 ``` ### 4.9 作业指标分区数量(此参数在 Worker 节点上无效) 新的配置选项 JOB_METRICS_PARTITION_COUNT 用于控制在 Hazelcast IMap 中存储运行作业指标时所使用的分区数量。 - 默认值: 1(单个 key,向后兼容) - 用法: 增加该值可以将指标分布到多个分区中,从而在大量任务同时更新指标时减少竞争。 示例: ```yaml seatunnel: engine: job-metrics-partition-count: 4 ``` 上述配置会将指标分布到 4 个分区中,而不是使用单个 key。 当任务数量超过约 20,000 时,增加分区数量可以显著提高性能。 作为实用指导,分区数量约 1,000–2,000 往往在减少锁竞争和最小化开销之间提供最佳平衡。 建议以此值开始,并根据集群规模和工作负载特性进行调整。 注意: 在高并发竞争的情况下,增加分区数量可能会提高并行度;但如果设置过大,会引入额外的分布与合并开销,从而降低整体性能。 分区数量应在作业启动前进行配置。如果在作业已启动后更改,可能导致指标键不匹配,因此建议在修改此选项后重启 SeaTunnel。 ## 5. 配置 SeaTunnel Engine 网络服务 所有 SeaTunnel Engine 网络相关的配置都在 `hazelcast-master.yaml`和`hazelcast-worker.yaml` 文件中. 
### 5.1 集群名称 SeaTunnel Engine 节点使用 `cluster-name` 来确定另一个节点是否与自己在同一集群中。如果两个节点之间的集群名称不同,SeaTunnel 引擎将拒绝服务请求。 ### 5.2 网络 基于 [Hazelcast](https://docs.hazelcast.com/imdg/4.1/clusters/discovery-mechanisms), 一个 SeaTunnel Engine 集群是由运行 SeaTunnel Engine 服务器的集群成员组成的网络。 集群成员自动加入一起形成集群。这种自动加入是通过集群成员使用的各种发现机制来相互发现的。 请注意,集群形成后,集群成员之间的通信始终通过 TCP/IP 进行,无论使用的发现机制如何。 SeaTunnel Engine 使用以下发现机制。 #### TCP 您可以将 SeaTunnel Engine 配置为完整的 TCP/IP 集群。有关配置详细信息,请参阅 [Discovering Members by TCP section](tcp.md)。 在分离集群模式下,Master和Worker服务使用不同的端口。 Master节点网络配置 `hazelcast-master.yaml` ```yaml hazelcast: cluster-name: seatunnel network: rest-api: enabled: true endpoint-groups: CLUSTER_WRITE: enabled: true DATA: enabled: true join: tcp-ip: enabled: true member-list: - master-node-1:5801 - master-node-2:5801 - worker-node-1:5802 - worker-node-2:5802 port: auto-increment: false port: 5801 properties: hazelcast.heartbeat.failuredetector.type: phi-accrual hazelcast.heartbeat.interval.seconds: 2 hazelcast.max.no.heartbeat.seconds: 180 hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10 hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200 hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100 ``` Worker节点网络配置 `hazelcast-worker.yaml` ```yaml hazelcast: cluster-name: seatunnel network: join: tcp-ip: enabled: true member-list: - master-node-1:5801 - master-node-2:5801 - worker-node-1:5802 - worker-node-2:5802 port: auto-increment: false port: 5802 properties: hazelcast.heartbeat.failuredetector.type: phi-accrual hazelcast.heartbeat.interval.seconds: 2 hazelcast.max.no.heartbeat.seconds: 180 hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10 hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200 hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100 ``` TCP 是我们建议在独立 SeaTunnel Engine 集群中使用的方式。 另一方面,Hazelcast 提供了一些其他的服务发现方法。有关详细信息,请参阅 [Hazelcast Network](https://docs.hazelcast.com/imdg/4.1/clusters/setting-up-clusters) ## 6. 
启动 SeaTunnel Engine Master 节点 可以通过守护进程使用 `-d` 参数启动。 ```shell mkdir -p $SEATUNNEL_HOME/logs ./bin/seatunnel-cluster.sh -d -r master ``` 日志将写入 `$SEATUNNEL_HOME/logs/seatunnel-engine-master.log` ## 7. 启动 SeaTunnel Engine Worker 节点 可以通过守护进程使用 `-d` 参数启动。 ```shell mkdir -p $SEATUNNEL_HOME/logs ./bin/seatunnel-cluster.sh -d -r worker ``` 日志将写入 `$SEATUNNEL_HOME/logs/seatunnel-engine-worker.log` ## 8. 安装 SeaTunnel Engine 客户端 ### 8.1 和服务端一样设置`SEATUNNEL_HOME` 您可以通过添加 `/etc/profile.d/seatunnel.sh` 文件来配置 `SEATUNNEL_HOME` 。`/etc/profile.d/seatunnel.sh` 的内容如下: ``` export SEATUNNEL_HOME=${seatunnel install path} export PATH=$PATH:$SEATUNNEL_HOME/bin ``` ## 8. 提交作业和管理作业 ### 8.1 使用 SeaTunnel Engine 客户端提交作业 #### 安装 SeaTunnel Engine 客户端 ##### 设置和服务器一样的`SEATUNNEL_HOME` 您可以通过添加 `/etc/profile.d/seatunnel.sh` 文件来配置 `SEATUNNEL_HOME` 。`/etc/profile.d/seatunnel.sh` 的内容如下: ``` export SEATUNNEL_HOME=${seatunnel install path} export PATH=$PATH:$SEATUNNEL_HOME/bin ``` ##### 配置 SeaTunnel Engine 客户端 所有 SeaTunnel Engine 客户端的配置都在 `hazelcast-client.yaml` 里。 **cluster-name** 客户端必须与 SeaTunnel Engine 具有相同的 `cluster-name`。否则,SeaTunnel Engine 将拒绝客户端的请求。 **network** 需要将所有 SeaTunnel Engine Master节点的地址添加到这里。 ```yaml hazelcast-client: cluster-name: seatunnel properties: hazelcast.logging.type: log4j2 network: cluster-members: - master-node-1:5801 - master-node-2:5801 ``` #### 提交作业和管理作业 现在集群部署完成了,您可以通过以下教程完成作业的提交和管理:[提交和管理作业](user-command.md) ### 8.2 使用 REST API 提交作业 SeaTunnel Engine 提供了 REST API 用于提交作业。有关详细信息,请参阅 [REST API V2](rest-api-v2.md) ================================================ FILE: docs/zh/engines/zeta/slot-allocation-strategy.md ================================================ --- sidebar_position: 15 --- # Slot分配策略 Slot分配策略是SeaTunnel Engine的一个重要组成部分,它决定了SeaTunnel Engine如何将任务分配到不同的Slot上。Slot分配策略是一个可配置的组件,用户可以根据自己的需求来配置Slot分配策略。 **配置方法:** 设置参数`slot-allocation-strategy`, 可选值有`RANDOM`, `SYSTEM_LOAD`, `SLOT_RATIO`。 例: ```yaml seatunnel: engine: slot-service: slot-allocation-strategy: RANDOM ... 
``` ## RANDOM(默认值) 随机分配策略是SeaTunnel Engine默认的Slot分配策略,它将任务随机分配到不同的Slot上。 ## SYSTEM_LOAD 系统负载策略是根据系统的负载情况来分配Slot的策略,它会根据系统的负载情况来动态调整Slot的分配。 ### 1. **时间权重的设计** 时间权重体现了时间对调度优先级的影响: - 最近的数据赋予较高权重,历史数据逐渐衰减。 - 采用分布 $4, 2, 2, 1, 1$ 并归一化后,每次统计的时间权重为: $$ \text{时间权重比例} = \frac{\text{当前权重}}{10} $$ > 当集群刚启动时,数据不足5条,会单独做归一化,这里计算公式会动态调整,不做赘述。 ### 2. **资源利用率计算** 将 CPU 和内存资源的空闲率按照权重进行综合评估: $$ \text{资源空闲率} = \frac{(1 - \text{CPU利用率}) \cdot \text{CPU权重} + (1 - \text{内存利用率}) \cdot \text{内存权重}}{\text{CPU权重} + \text{内存权重}} $$ - 公式中的 $(1 - \text{CPU利用率})$ 和 $(1 - \text{内存利用率})$ 是空闲率。 - CPU 和内存的权重可根据具体需求调整(如 $0.6$ 和 $0.4$),灵活适应不同场景。 ### 3. **时间衰减与调度优先级公式** 引入时间权重衰减后,计算调度优先级的公式为: $$ \text{综合资源空闲率} = \sum_{i=1}^{5} \left( \frac{(1 - \text{CPU利用率}_i) \cdot \text{CPU权重} + (1 - \text{内存利用率}_i) \cdot \text{内存权重}}{\text{CPU权重} + \text{内存权重}} \cdot \text{时间权重}_i \right) $$ ### 4. **Slot分配的资源空闲率动态调整** 分配多个 Slot 时,考虑到资源状态的实时更新和动态模拟(因为同一任务资源快速分配负载是不会变化): - **每个 Slot 使用的资源比** = (1-综合资源空闲率) ÷ 已分配的 Slot 数量 - 分配 Slot 后更新对应节点的空闲率: $$ \text{Slot分配后的空闲率} = \text{综合资源空闲率} - \text{每 Slot 使用的资源比} $$ - 默认单个Slot使用10%资源(首次启动无法得知但Slot占用资源,这里默认设置为10%,不设置太低的原因是防止分配过多资源导致该节点负载太高。等下一次监控信息捕获到就会相对准确。) 这种方法属于与计算使得调度更加贴合实际资源使用情况。 ### 5. **平衡因子引入** 只通过Slot动态调整更新资源空闲率可能也会存在误差,我们引入基于Slot数量的平衡因子,衡量节点当前负载状态,避免调度资源分配过于集中: > 该数量可以实时统计到,用来优化调度优先级指标 $$ \text{BalanceFactor}_i = 1 - \frac{S_{\text{used},i}}{S_{\text{total},i}} $$ - $S_{\text{used},i}$:节点 $i$ 已分配的 Slot 数。 - $S_{\text{total},i}$:节点 $i$ 的 Slot 总数。 通过平衡因子调整调度优先级: $$ W_i = \alpha \cdot \text{Slot分配后的空闲率}_i + \beta \cdot \text{BalanceFactor}_i $$ **参数意义**: - $\alpha$:侧重资源利用率的权重:0.7 - $\beta$:平衡因子的权重,防止单点过载。:0.3 ### 6. **动态调整逻辑** - 定时采集 CPU 和内存利用率,维持最近 5 次的统计数据。 - 同一任务动态更新权重,对旧数据逐步衰减。 - 根据Slot使用,动态平衡。 > 说明: > 比如我们有两个节点,需要申请10个Slot,A有10个空闲Slot,B有20个空闲Slot,当通过第四步、第五步计算后,10个Slot的权重计算,A节点权重都比B节点权重高。 > 那么我们仍认为A节点应该分配资源,此时可能是因为集群B节点Slot数量配置不是最佳导致(Worker节点Slot配置少了)。 ## SLOT_RATIO Slot比例策略是根据Slot的使用率来进行调度,使用率越低的Slot优先级越高。 **计算逻辑**: 1. 获取Worker总Slot数 2. 获取未分配Slot数。 3. 
使用率 = (总插槽数 - 未分配插槽数) / 总插槽数。 ================================================ FILE: docs/zh/engines/zeta/tcp.md ================================================ --- sidebar_position: 10 --- # TCP NetWork 如果您的环境中多播不是首选的发现方式,那么您可以将 SeaTunnel 引擎配置为一个完整的 TCP/IP 集群。当您通过 TCP/IP 配置 SeaTunnel 引擎以发现成员时,您必须将所有或一部分成员的主机名和/或 IP 地址列为集群成员。您不必列出所有这些集群成员,但在新成员加入时,至少有一个列出的成员必须是活跃的。 要配置您的 Hazelcast 作为一个完整的 TCP/IP 集群,请设置以下配置元素。有关 TCP/IP 发现配置元素的完整描述,请参见 tcp-ip 元素部分。 - 将 tcp-ip 元素的 enabled 属性设置为 true。 - 在 tcp-ip 元素内提供您的成员元素。 以下是一个示例声明性配置。 ```yaml hazelcast: network: join: tcp-ip: enabled: true member-list: - machine1 - machine2 - machine3:5799 - 192.168.1.0-7 - 192.168.1.21 ``` 如上所示,您可以为成员元素提供 IP 地址或主机名。您还可以提供一个 IP 地址范围,例如 `192.168.1.0-7`. 除了像上面展示的那样逐行提供成员外,您还可以选择使用 members 元素并写入逗号分隔的 IP 地址,如下所示。 `192.168.1.0-7,192.168.1.21` 如果您没有为成员提供端口,Hazelcast 会自动尝试端口 `5701`, `5702` 等。 ================================================ FILE: docs/zh/engines/zeta/telemetry.md ================================================ --- sidebar_position: 14 --- # Telemetry 通过 `Prometheus-exports` 集成 `Metrices` 可以更好地与相关的监控平台(如 Prometheus 和 Grafana)无缝衔接,提高对 SeaTunnel 集群的监控和告警能力。 您可以在 `seatunnel.yaml` 文件中配置监控的相关设置。 以下是一个声明式配置的示例。 ```yaml seatunnel: engine: telemetry: metric: enabled: true ``` ## 指标 Prometheus 的[指标文本](telemetryetrics.txt),获取方式为 `http://{instanceHost}:5801/hazelcast/rest/instance/metrics`。 OpenMetrics 的[指标文本](telemetrypenmetrics.txt) ,获取方式为 `http://{instanceHost}:5801/hazelcast/rest/instance/openmetrics`。 可用的指标包括以下类别。 注意:所有指标都有相同的标签名 `cluster`,其值为 `hazelcast.cluster-name` 的配置。 ### 节点指标 | MetricName | Type | Labels | 描述 | |-------------------------------------------|-------|------------------------------------------------------------------------------------------------------------|-------------------------------------| | cluster_info | Gauge | **hazelcastVersion**,hazelcast 的版本。**master**,seatunnel 主地址。 | 集群信息 | | cluster_time | Gauge | **hazelcastVersion**,hazelcast 的版本。 | 集群时间 | | node_count | 
Gauge | - | 集群节点总数 | | node_state | Gauge | **address**,服务器实例地址,例如:"127.0.0.1:5801" | seatunnel 节点是否正常 | | hazelcast_executor_executedCount | Gauge | **type**,执行器的类型,包括:"async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | seatunnel 集群节点的 hazelcast 执行器执行次数 | | hazelcast_executor_isShutdown | Gauge | **type**,执行器的类型,包括:"async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | seatunnel 集群节点的 hazelcast 执行器是否关闭 | | hazelcast_executor_isTerminated | Gauge | **type**,执行器的类型,包括:"async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | seatunnel 集群节点的 hazelcast 执行器是否终止 | | hazelcast_executor_maxPoolSize | Gauge | **type**,执行器的类型,包括:"async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | seatunnel 集群节点的 hazelcast 执行器最大池大小 | | hazelcast_executor_poolSize | Gauge | **type**,执行器的类型,包括:"async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | seatunnel 集群节点的 hazelcast 执行器当前池大小 | | hazelcast_executor_queueRemainingCapacity | Gauge | **type**,执行器的类型,包括:"async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | seatunnel 集群节点的 hazelcast 执行器剩余队列容量 | | hazelcast_executor_queueSize | Gauge | **type**,执行器的类型,包括:"async" "client" "clientBlocking" "clientQuery" "io" "offloadable" "scheduled" "system" | seatunnel 集群节点的 hazelcast 执行器当前队列大小 | | hazelcast_partition_partitionCount | Gauge | - | seatunnel 集群节点的分区数量 | | hazelcast_partition_activePartition | Gauge | - | seatunnel 集群节点的活跃分区数量 | | hazelcast_partition_isClusterSafe | Gauge | - | 分区是否安全 | | hazelcast_partition_isLocalMemberSafe | Gauge | - | 本地成员是否安全 | ### 线程池状态 | MetricName | Type | Labels | 描述 | |-------------------------------------|---------|-----------------------------------------|--------------------------------| | job_thread_pool_activeCount | Gauge | **address**,服务器实例地址,例如:"127.0.0.1:5801" | seatunnel 协调器作业执行器缓存线程池的活动线程数 | | 
job_thread_pool_corePoolSize | Gauge | **address**,服务器实例地址,例如:"127.0.0.1:5801" | seatunnel 协调器作业执行器缓存线程池的核心池大小 | | job_thread_pool_maximumPoolSize | Gauge | **address**,服务器实例地址,例如:"127.0.0.1:5801" | seatunnel 协调器作业执行器缓存线程池的最大池大小 | | job_thread_pool_poolSize | Gauge | **address**,服务器实例地址,例如:"127.0.0.1:5801" | seatunnel 协调器作业执行器缓存线程池的当前池大小 | | job_thread_pool_queueTaskCount | Gauge | **address**,服务器实例地址,例如:"127.0.0.1:5801" | seatunnel 协调器作业执行器缓存线程池的队列任务数 | | job_thread_pool_completedTask_total | Counter | **address**,服务器实例地址,例如:"127.0.0.1:5801" | seatunnel 协调器作业执行器缓存线程池的完成任务数 | | job_thread_pool_task_total | Counter | **address**,服务器实例地址,例如:"127.0.0.1:5801" | seatunnel 协调器作业执行器缓存线程池的总任务数 | | job_thread_pool_rejection_total | Counter | **address**,服务器实例地址,例如:"127.0.0.1:5801" | seatunnel 协调器作业执行器缓存线程池的拒绝任务总数 | ### 作业信息详细 | MetricName | Type | Labels | 描述 | |------------|-------|---------------------------------------------------------------------------------------------------------|---------------------| | job_count | Gauge | **type**,作业的类型,包括:"canceled" "cancelling" "created" "failed" "failing" "finished" "running" "scheduled" | seatunnel 集群的所有作业计数 | ### JVM 指标 | MetricName | Type | Labels | 描述 | |--------------------------------------------|---------|---------------------------------------------------------------------------------------------------------------|----------------------------------------| | jvm_threads_current | Gauge | - | JVM 的当前线程数 | | jvm_threads_daemon | Gauge | - | JVM 的守护线程数 | | jvm_threads_peak | Gauge | - | JVM 的峰值线程数 | | jvm_threads_started_total | Counter | - | JVM 启动的线程总数 | | jvm_threads_deadlocked | Gauge | - | JVM 线程在等待获取对象监视器或拥有的可拥有同步器时处于死锁状态的周期数 | | jvm_threads_deadlocked_monitor | Gauge | - | JVM 线程在等待获取对象监视器时处于死锁状态的周期数 | | jvm_threads_state | Gauge | **state**,JVM 线程的状态,包括:"NEW" "TERMINATED" "RUNNABLE" "BLOCKED" "WAITING" "TIMED_WAITING" "UNKNOWN" | 按状态分类的线程当前计数 | | jvm_classes_currently_loaded | Gauge | - | JVM 中当前加载的类的数量 | | 
jvm_classes_loaded_total | Counter | - | 自 JVM 开始执行以来加载的类的总数 | | jvm_classes_unloaded_total | Counter | - | 自 JVM 开始执行以来卸载的类的总数 | | jvm_memory_pool_allocated_bytes_total | Counter | **pool**,包括:"Code Cache" "PS Eden Space" "PS Old Gen" "PS Survivor Space" "Compressed Class Space" "Metaspace" | 在给定 JVM 内存池中分配的总字节数。仅在垃圾收集后更新,而不是持续更新。 | | jvm_gc_collection_seconds_count | Summary | **gc**,包括:"PS Scavenge" "PS MarkSweep" | 在给定 JVM 垃圾收集器中花费的时间(以秒为单位) | | jvm_gc_collection_seconds_sum | Summary | **gc**,包括:"PS Scavenge" "PS MarkSweep" | 在给定 JVM 垃圾收集器中花费的时间(以秒为单位) | | jvm_info | Gauge | **runtime**,例如:“Java(TM) SE Runtime Environment”。**vendor**,例如:“Oracle Corporation”。**version**,例如:“1.8.0_212-b10” | VM 版本信息 | | process_cpu_seconds_total | Counter | - | 用户和系统 CPU 时间总计,以秒为单位 | | process_start_time_seconds | Gauge | - | 进程自 Unix 纪元以来的启动时间,以秒为单位 | | process_open_fds | Gauge | - | 打开的文件描述符数量 | | process_max_fds | Gauge | - | 最大打开的文件描述符数量 | | jvm_memory_objects_pending_finalization | Gauge | - | 等待最终化队列中的对象数量 | | jvm_memory_bytes_used | Gauge | **area**,包括: "heap" "nonheap" | 给定 JVM 内存区域使用的字节数 | | jvm_memory_bytes_committed | Gauge | **area**,包括: "heap" "nonheap" | 给定 JVM 内存区域的提交字节数 | | jvm_memory_bytes_max | Gauge | **area**,包括: "heap" "nonheap" | 给定 JVM 内存区域的最大字节数 | | jvm_memory_bytes_init | Gauge | **area**,包括: "heap" "nonheap" | 给定 JVM 内存区域的初始字节数 | | jvm_memory_pool_bytes_used | Gauge | **pool**,包括:"Code Cache" "PS Eden Space" "PS Old Gen" "PS Survivor Space" "Compressed Class Space" "Metaspace" | 给定 JVM 内存池使用的字节数 | | jvm_memory_pool_bytes_committed | Gauge | **pool**,包括:"Code Cache" "PS Eden Space" "PS Old Gen" "PS Survivor Space" "Compressed Class Space" "Metaspace" | 给定 JVM 内存池的提交字节数 | | jvm_memory_pool_bytes_max | Gauge | **pool**,包括:"Code Cache" "PS Eden Space" "PS Old Gen" "PS Survivor Space" "Compressed Class Space" "Metaspace" | 给定 JVM 内存池的最大字节数 | | jvm_memory_pool_bytes_init | Gauge | **pool**,包括:"Code Cache" "PS Eden Space" "PS Old Gen" "PS Survivor Space" "Compressed Class
Space" "Metaspace" | 给定 JVM 内存池的初始字节数 | | jvm_memory_pool_allocated_bytes_created | Gauge | **pool**,包括:"Code Cache" "PS Eden Space" "PS Old Gen" "PS Survivor Space" "Compressed Class Space" "Metaspace" | 给定 JVM 内存池中创建的总字节数。仅在 GC 后更新,而不是持续更新 | | jvm_memory_pool_collection_used_bytes | Gauge | **pool**,包括:"PS Eden Space" "PS Old Gen" "PS Survivor Space" | 给定 JVM 内存池在最后一次回收后的使用字节数 | | jvm_memory_pool_collection_committed_bytes | Gauge | **pool**,包括:"PS Eden Space" "PS Old Gen" "PS Survivor Space" | 给定 JVM 内存池在最后一次回收后的提交字节数 | | jvm_memory_pool_collection_max_bytes | Gauge | **pool**,包括:"PS Eden Space" "PS Old Gen" "PS Survivor Space" | 给定 JVM 内存池在最后一次回收后的最大字节数 | | jvm_memory_pool_collection_init_bytes | Gauge | **pool**,包括:"PS Eden Space" "PS Old Gen" "PS Survivor Space" | 给定 JVM 内存池在最后一次回收后的初始字节数 | | jvm_buffer_pool_used_bytes | Gauge | **pool**,包括:"direct" "mapped" | 给定 JVM 缓冲池使用的字节数 | | jvm_buffer_pool_capacity_bytes | Gauge | **pool**,包括:"direct" "mapped" | 给定 JVM 缓冲池的字节容量 | | jvm_buffer_pool_used_buffers | Gauge | **pool**,包括:"direct" "mapped" | 给定 JVM 缓冲池使用的缓冲区 | ## 通过 Prometheus 和 Grafana 进行集群监控 ### 安装 Prometheus 有关如何设置 Prometheus 服务器的指南,请访问 [安装](https://prometheus.io/docs/prometheus/latest/installation) ### 配置 Prometheus 将 SeaTunnel 实例指标导出添加到 `/etc/prometheus/prometheus.yaml` 中。例如: ```yaml global: # 从此作业中抓取目标的频率。 scrape_interval: 15s scrape_configs: # 默认分配给抓取指标的作业名称。 - job_name: 'seatunnel' scrape_interval: 5s # 指标导出路径 metrics_path: /hazelcast/rest/instance/metrics # 此作业静态配置的目标列表。 static_configs: # 静态配置中指定的目标。 - targets: [ 'localhost:5801' ] # 为从目标抓取的所有指标分配的标签。 # labels: [<labelname>: <labelvalue>] ``` ### 安装 Grafana 有关如何设置 Grafana 服务器的指南,请访问 [安装](https://grafana.com/docs/grafana/latest/setup-grafana/installation) #### 监控仪表板 - 在 Grafana 中添加 Prometheus 数据源。 - 将 `Seatunnel Cluster` 监控仪表板导入到 Grafana 中,使用 [仪表板 JSON](telemetry/grafana-dashboard.json)。 监控[效果图](../../../images/grafana.png) ================================================ FILE: docs/zh/engines/zeta/tuning-guide.md
================================================ --- sidebar_position: 15 --- # 调优指南 本文为大家介绍 SeaTunnel Engine 的调优方法,帮助用户根据实际需求优化 SeaTunnel Engine 的性能和稳定性。 阅读本篇前请知晓,当前指南结合的是大部分用户的真实使用情况总结而成,可能并不适用于所有场景,用户可以根据实际情况进行调整。 SeaTunnel Engine 是基于 [JVM](https://zh.wikipedia.org/wiki/Java%E8%99%9A%E6%8B%9F%E6%9C%BA) 运行的数据集成引擎,所以 JVM 部分的调优对 SeaTunnel Engine 同样适用,这里就不再赘述。 ## 集群响应缓慢或假死 ### JVM 如果 SeaTunnel Engine 集群响应缓慢或假死,可能是由于 JVM 堆内存不足导致的。可以通过以下步骤进行排查: #### 堆内存不足 ##### 排查流程 1. 检查 JVM 堆内存实时占用 使用 `jmap` 命令查看 JVM 堆内存使用情况, 其中 `<pid>` 是 SeaTunnel Engine 进程的 PID。 ```bash jmap -heap <pid> ``` 输出结果示例: ```shell Attaching to process ID 2111950, please wait... Debugger attached successfully. Server compiler detected. JVM version is 25.192-b12 using thread-local object allocation. Garbage-First (G1) GC with 13 thread(s) Heap Configuration: MinHeapFreeRatio = 40 MaxHeapFreeRatio = 70 MaxHeapSize = 17179869184 (16384.0MB) NewSize = 1363144 (1.2999954223632812MB) MaxNewSize = 10301210624 (9824.0MB) OldSize = 5452592 (5.1999969482421875MB) NewRatio = 2 SurvivorRatio = 8 MetaspaceSize = 21807104 (20.796875MB) CompressedClassSpaceSize = 1073741824 (1024.0MB) MaxMetaspaceSize = 2147483648 (2048.0MB) G1HeapRegionSize = 8388608 (8.0MB) Heap Usage: G1 Heap: regions = 2048 capacity = 17179869184 (16384.0MB) used = 2997548048 (2858.684585571289MB) free = 14182321136 (13525.315414428711MB) 17.448026034981012% used G1 Young Generation: Eden Space: regions = 348 capacity = 10737418240 (10240.0MB) used = 2919235584 (2784.0MB) free = 7818182656 (7456.0MB) 27.1875% used Survivor Space: regions = 10 capacity = 83886080 (80.0MB) used = 83886080 (80.0MB) free = 0 (0.0MB) 100.0% used G1 Old Generation: regions = 0 capacity = 6358564864 (6064.0MB) used = 0 (0.0MB) free = 6358564864 (6064.0MB) 0.0% used ``` 重点关注G1 Old Generation的使用情况,如果 Old Generation 的使用率接近 100%,则可能是堆内存不足导致的。 2.
检查日志 系统会不定期输出健康监控日志,检查 SeaTunnel Engine 的日志,查看是否有频繁的 Full GC 或者长时间的 GC 暂停,这可能是由于堆内存不足导致的。 下边是一个日志示例: ```log [] 2025-07-04 16:42:54,818 INFO [c.h.i.d.HealthMonitor ] [hz.main.HealthMonitor] - [127.0.0.1]:5801 [seatunnel] [5.1] processors=16, physical.memory.total=31.1G, physical.memory.free=9.7G, swap.space.total=0, swap.space.free=0, heap.memory.used=198.7M, heap.memory.free=15.8G, heap.memory.total=16.0G, heap.memory.max=16.0G, heap.memory.used/total=1.21%, heap.memory.used/max=1.21%, minor.gc.count=2, minor.gc.time=44ms, major.gc.count=0, major.gc.time=0ms, load.process=0.00%, load.system=66.67%, load.systemAverage=5.66, thread.count=118, thread.peakCount=118, cluster.timeDiff=0, event.q.size=0, executor.q.async.size=0, executor.q.client.size=0, executor.q.client.query.size=0, executor.q.client.blocking.size=0, executor.q.query.size=0, executor.q.scheduled.size=0, executor.q.io.size=0, executor.q.system.size=0, executor.q.operations.size=0, executor.q.priorityOperation.size=0, operations.completed.count=13, executor.q.mapLoad.size=0, executor.q.mapLoadAllKeys.size=0, executor.q.cluster.size=0, executor.q.response.size=0, operations.running.count=0, operations.pending.invocations.percentage=0.00%, operations.pending.invocations.count=0, proxy.count=9, clientEndpoint.count=0, connection.active.count=0, client.connection.count=0, connection.count=0 ``` 重点关注: - `heap.memory.used/max`:堆内存使用率,如果接近 100%,则可能是堆内存不足。 - `major.gc.count` 和 `major.gc.time` :如果 Full GC 频繁,可能是堆内存不足导致的。 可以通过持续查看日志来判断是否存在频繁的 Full GC 或者长时间的 GC 暂停。 ##### 解决方案 通过降低任务并发和任务数量来降低同一时间的内存占用。如果确实需要更多的内存,请参考 [安装部署](deployment.md) 中的配置 SeaTunnel Engine JVM 选项来增加内存。 ##### 内存无限制占用 1. 生成内存快照 有些时候,我们的任务量固定,但是内存使用量却不断增加,这可能是由于任务中存在内存泄漏导致的。请dump下对应的内存快照信息。 ```shell jmap -dump:live,format=b,file=heap.hprof ``` 然后使用 [Eclipse Memory Analyzer](https://www.eclipse.org/mat/) 等工具分析内存快照,查找内存泄漏的原因。 针对非二开的用户或者连接器,也可以创建一个 issue 并附上内存快照,我们会帮助您分析。 2. 
打印对象占用排行 有些时候,生成内存快照会随着JVM的假死而失败,这时可以尝试打印对象占用排行来查看内存使用情况。 ```shell jmap -histo:live | head -n 100 ``` 同样的,可以通过分析输出结果来查找内存泄漏的原因。 针对非二开的用户或者连接器,也可以创建一个 issue 并附上对象占用信息,我们会帮助您分析。 #### CPU占用率过高 CPU占用率过高也是一个集群节点假死的常见原因,但是出现概率基本没有内存占用过高的情况高。可以通过以下步骤进行排查: ##### 排查流程 1. 检查 CPU 占用率 - 使用 `top` 或 `htop` 命令查看 SeaTunnel Engine 进程的 CPU 占用率。 - 如果 CPU 占用率接近 100%,则可能是 CPU 资源不足导致的。如果有多个核,需要考虑多个核的占用率。 ##### 解决方案 如果 CPU 占用率过高,可以尝试以下解决方案: - 降低任务并发和任务数量,减少 CPU 资源的占用。 - 增加集群节点数量,分担 CPU 资源的压力。 ### Hazelcast Hazelcast相关的配置也是影响 SeaTunnel Engine 性能的重要因素。可以通过修改`hazelcast.yaml`系列文件的配置参数修改,请参考 [安装部署](deployment.md) 。 以下是一些常见的调优参数: - `hazelcast.operation.generic.thread.count`: 该参数控制 Hazelcast 的通用操作线程数。SeaTunnel Engine 使用此线程用于执行RPC请求。可以根据实际情况调整该参数,以提高 Hazelcast RPC 的性能。 如果监控到日志中频繁出现如下类型日志,同时CPU占用率不算很高。请尝试调高该参数: ```log 2024-09-03 06:15:45,807 WARN [.s.i.o.s.SlowOperationDetector] [hz.main.SlowOperationDetectorThread] - [seatunnel-worker-1]:5802 [seatunnel] [5.1] Slow operation detected: ``` ================================================ FILE: docs/zh/engines/zeta/user-command.md ================================================ --- sidebar_position: 13 --- # 客户端命令行工具 SeaTunnel Engine 提供了一个命令行工具,用于管理 SeaTunnel Engine 的作业。您可以使用命令行工具提交、停止、暂停、恢复、删除作业,查看作业状态和监控指标等。 可以通过如下命令获取命令行工具的帮助信息: ```shell bin/seatunnel.sh -h ``` 输出如下: ```shell Usage: seatunnel.sh [options] Options: --async Run the job asynchronously, when the job is submitted, the client will exit (default: false) -can, --cancel, --cancel-job Cancel job(s) by JobId -f, --force-cancel, --force-cancel-job Force Cancel job(s) by jobId --check Whether check config (default: false) -cj, --close, --close-job Close client the task will also be closed (default: true) -cn, --cluster The name of cluster -c, --config Config file --decrypt Decrypt config file, When both --decrypt and --encrypt are specified, only --encrypt will take effect (default: false) -m, --master, -e, --deploy-mode SeaTunnel job submit master, support [local, cluster] (default: 
cluster) --encrypt Encrypt config file, when both --decrypt and --encrypt are specified, only --encrypt will take effect (default: false) --get_running_job_metrics Gets metrics for running jobs (default: false) -h, --help Show the usage message -j, --job-id Get job status by JobId -l, --list list job status (default: false) --metrics Get job metrics by JobId -n, --name SeaTunnel job name (default: SeaTunnel) -r, --restore, --restore-job restore with savepoint by jobId -s, --savepoint, --savepoint-job savepoint job by jobId -i, --variable Variable substitution, such as -i city=beijing, or -i date=20190318.We use ',' as separator, when inside "", ',' are treated as normal characters instead of delimiters. (default: []) ``` ## 提交作业 ```shell bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template ``` **--async** 参数可以让作业在后台运行,当作业提交后,客户端会退出。 ```shell ./bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template --async ``` **-n** 或 **--name** 参数可以指定作业的名称 ```shell ./bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template --async -n myjob ``` ## 查看作业列表 ```shell ./bin/seatunnel.sh -l ``` 该命令会输出所有当前集群中的作业列表(包含运行完成的历史作业和正在运行的作业) ## 查看作业状态 ```shell ./bin/seatunnel.sh -j ``` 该命令会输出指定作业的状态信息 ## 获取正在运行的作业监控信息 ```shell ./bin/seatunnel.sh --get_running_job_metrics ``` 该命令会输出正在运行的作业的监控信息 ## 获取指定作业监控信息 --metrics 参数可以获取指定作业的监控信息 ```shell ./bin/seatunnel.sh --metrics ``` ## 暂停作业 ```shell ./bin/seatunnel.sh -s ``` 该命令会暂停指定作业,注意,只有开启了checkpoint的作业才支持暂停作业(实时同步作业默认开启checkpoint,批处理作业默认不开启checkpoint需要通过在 `env` 中配置checkpoint.interval来开启checkpoint)。 暂停作业是以split为最小单位的,即暂停作业后,会等待当前正在运行的split运行完成后再暂停。任务恢复后,会从暂停的split继续运行。 ## 恢复作业 ```shell ./bin/seatunnel.sh -r -c $SEATUNNEL_HOME/config/v2.batch.config.template ``` 该命令会恢复指定作业,注意,只有开启了checkpoint的作业才支持恢复作业(实时同步作业默认开启checkpoint,批处理作业默认不开启checkpoint需要通过在 `env` 中配置checkpoint.interval来开启checkpoint)。 恢复作业需要指定jobId和作业的配置文件。 运行失败的作业和通过seatunnel.sh -s <jobId>暂停的作业都可以通过该命令恢复。 ## 取消作业 ```shell 
./bin/seatunnel.sh -can <jobId1> [<jobId2> <jobId3> ...] ``` 该命令会取消指定作业,取消作业后,作业会被停止,作业的状态会变为`CANCELED`。 支持批量取消作业,可以一次取消多个作业。 被cancel的作业的所有断点信息都将被删除,无法通过seatunnel.sh -r <jobId>恢复。 ## 强制取消作业 ```shell ./bin/seatunnel.sh -f <jobId1> [<jobId2> <jobId3> ...] ``` 该命令用于强制取消指定的作业。 作业被取消后,将立即停止执行,其状态将变更为 `CANCELED`。 该命令支持批量操作,可一次性强制取消多个作业。 被cancel的作业的所有断点信息都将被删除,无法通过seatunnel.sh -r <jobId>恢复。 **注意事项** - 当作业状态为 `DOING_SAVEPOINT` 且 Savepoint 未能成功完成时,启用强制取消(force 选项生效)将直接把作业状态设置为 CANCELED。 - 强制取消可能会导致 Checkpoint 或 Savepoint 数据不完整或处于不一致状态, 仅建议在异常或紧急情况下使用该操作。 ## 配置JVM参数 我们可以通过以下方式为 SeaTunnel Engine 客户端配置 JVM 参数: 1. 添加JVM参数到`$SEATUNNEL_HOME/config/jvm_client_options`文件中。 在 `$SEATUNNEL_HOME/config/jvm_client_options` 文件中修改 JVM 参数。请注意,该文件中的 JVM 参数将应用于使用 `seatunnel.sh` 提交的所有作业,包括 Local 模式和 Cluster 模式。 2. 在提交作业时添加 JVM 参数。例如,`sh bin/seatunnel.sh --config $SEATUNNEL_HOME/config/v2.batch.config.template -DJvmOption="-Xms2G -Xmx2G"` # 服务端命令行工具 SeaTunnel Engine 提供了服务端管理命令,用于启动、停止和管理 SeaTunnel Engine 集群节点。 ```shell sh bin/seatunnel-cluster.sh -h ``` 服务器命令支持以下参数: ```shell Usage: seatunnel-cluster.sh [options] Options: -cn, --cluster 集群名称 -d, --daemon 以守护进程模式运行 -r, --role 集群节点角色,支持 master、worker、master_and_worker (默认: master_and_worker) -m, --member 显示集群成员信息 -h, --help 显示帮助信息 ``` ## 启动集群 可以通过如下命令启动集群节点: ```shell # 前台启动 sh bin/seatunnel-cluster.sh # 后台启动(守护进程模式) sh bin/seatunnel-cluster.sh -d ``` ## 查看集群成员信息 您可以使用以下命令查看集群成员信息: ```shell sh bin/seatunnel-cluster.sh -m -cn my_cluster ``` 该命令会输出集群中所有成员的详细信息,包括: - **Member ID(成员ID)**: 每个集群成员的唯一标识符 - **Address(地址)**: 成员的IP地址和端口 - **Role(角色)**: 成员角色(ACTIVE MASTER、MASTER 或 WORKER) - **Version(版本)**: 成员运行的 Hazelcast 版本 **输出示例:** ``` Member ID Address Role Version a1b2c3d4-e5f6-7890-abcd-ef1234567890 192.168.1.100:5701 ACTIVE MASTER 5.3.0 b2c3d4e5-f6g7-8901-bcde-f23456789012 192.168.1.101:5701 MASTER 5.3.0 c3d4e5f6-g7h8-9012-cdef-345678901234 192.168.1.102:5701 WORKER 5.3.0 ``` **注意**: 必须使用 `-cn` 参数指定集群名称。集群必须处于运行状态才能执行此命令。 ## 停止集群 SeaTunnel 提供了专门的停止脚本来关闭集群节点: ```shell sh
bin/stop-seatunnel-cluster.sh -h ``` 停止命令支持以下参数: ```shell Usage: stop-seatunnel-cluster.sh [options] Options: -cn, --cluster 要关闭的集群名称 (默认: seatunnel_default_cluster) -h, --help 显示帮助信息 ``` ### 停止默认集群 ```shell # 停止默认集群 (seatunnel_default_cluster) sh bin/stop-seatunnel-cluster.sh ``` ### 停止指定集群 ```shell # 停止指定名称的集群 sh bin/stop-seatunnel-cluster.sh -cn my_cluster ``` ================================================ FILE: docs/zh/engines/zeta/web-ui.md ================================================ # Web UI ## 访问 在访问 web ui 之前我们需要开启 http rest api。首先需要在`seatunnel.yaml`配置文件中配置 ``` seatunnel: engine: http: enable-http: true port: 8080 ``` 然后访问 `http://ip:8080/#/overview` ## 概述 Apache SeaTunnel 的 Web UI 提供了一个友好的用户界面,用于监控和管理 SeaTunnel 作业。通过 Web UI,用户可以实时查看当前运行的作业、已完成的作业,以及集群中工作节点和管理节点的状态。主要功能模块包括 Jobs、Workers 和 Master,每个模块都提供了详细的状态信息和操作选项,帮助用户高效地管理和优化其数据处理流程。 ![overview.png](../../../images/ui/overview.png) ## 作业 ### 运行中的作业 “运行中的作业”模块列出了当前正在执行的所有 SeaTunnel 作业。用户可以查看每个作业的基本信息,包括作业 ID、提交时间、状态、执行时间等。点击具体作业可以查看更多详细信息,如任务分布、资源使用情况和日志输出,便于用户实时监控作业进度并及时处理潜在问题。 ![running.png](../../../images/ui/running.png) ![detail.png](../../../images/ui/detail.png) ### 已完成的作业 “已完成的作业”模块展示了所有已成功完成或失败的 SeaTunnel 作业。此部分提供了每个作业的执行结果、完成时间、耗时以及失败原因(如果有)。用户可以通过此模块回顾过去的作业记录,分析作业性能,进行故障排查或重复执行某些特定作业。 ![finished.png](../../../images/ui/finished.png) ## 工作节点 ### 工作节点信息 “工作节点”模块展示了集群中所有工作节点的详细信息,包括每个工作节点的地址、运行状态、CPU 和内存使用情况、正在执行的任务数量等。通过该模块,用户可以监控各个工作节点的健康状况,及时发现和处理资源瓶颈或节点故障,确保 SeaTunnel 集群的稳定运行。 ![workers.png](../../../images/ui/workers.png) ## 管理节点 ### 管理节点信息 “管理节点”模块提供了 SeaTunnel 集群中主节点的状态和配置信息。用户可以查看 Master 节点的地址、运行状态、负责的作业调度情况以及整体集群的资源分配情况。该模块帮助用户全面了解集群的核心管理部分,便于进行集群配置优化和故障排查。 ![master.png](../../../images/ui/master.png) ================================================ FILE: docs/zh/faq.md ================================================ # 常见问题解答 ## SeaTunnel 支持哪些数据来源和数据目的地? 
SeaTunnel 支持多种数据源来源和数据目的地,您可以在官网找到详细的列表: SeaTunnel 支持的数据来源(Source)列表:https://seatunnel.apache.org/docs/connectors/source SeaTunnel 支持的数据目的地(Sink)列表:https://seatunnel.apache.org/docs/connectors/sink ## SeaTunnel 是否支持批处理和流处理? SeaTunnel 支持批流一体,SeaTunnel 可以设置批处理和流处理两种模式。您可以根据具体的业务场景和需求选择合适的处理模式。批处理适合定时数据同步场景,而流处理适合实时同步和数据变更捕获 (CDC) 场景。 ## 使用 SeaTunnel 需要安装 Spark 或者 Flink 这样的引擎么? Spark 和 Flink 不是必需的,SeaTunnel 可以支持 Zeta、Spark 和 Flink 3 种作为同步引擎的选择,您可以选择之一就行,社区尤其推荐使用 Zeta 这种专为同步场景打造的新一代超高性能同步引擎。Zeta 被社区用户亲切的称为 “泽塔奥特曼”! 社区对 Zeta 的支持力度是最大的,功能也更丰富。 ## SeaTunnel 支持的数据转换功能有哪些? SeaTunnel 支持多种数据转换功能,包括字段映射、数据过滤、数据格式转换等。可以通过在配置文件中定义 `transform` 模块来实现数据转换。详情请参考 SeaTunnel [Transform 文档](https://seatunnel.apache.org/docs/transforms)。 ## SeaTunnel 是否可以自定义数据清洗规则? SeaTunnel 支持自定义数据清洗规则。可以在 `transform` 模块中配置自定义规则,例如清理脏数据、删除无效记录或字段转换。 ## SeaTunnel 是否支持实时增量同步? SeaTunnel 支持增量数据同步。例如通过 CDC 连接器实现对数据库的增量同步,适用于需要实时捕获数据变更的场景。 ## SeaTunnel 目前支持哪些数据源的 CDC ? 目前支持 MongoDB CDC、MySQL CDC、Opengauss CDC、Oracle CDC、PostgreSQL CDC、Sql Server CDC、TiDB CDC等,更多请查阅[Source](https://seatunnel.apache.org/docs/connectors/source)。 ## SeaTunnel CDC 同步需要的权限如何开启? 这样就可以了。 这里多说一句,连接器对应的 cdc 权限开启步骤在官网都有写,请参照 SeaTunnel 对应的官网操作即可 ## SeaTunnel 支持从 MySQL 备库进行 CDC 么?日志如何拉取? 支持,是通过订阅 MySQL binlog 日志方式到同步服务器上解析 binlog 日志方式进行 ## SeaTunnel 是否支持无主键表的 CDC 同步? 不支持无主键表的 cdc 同步。原因如下: 比如上游有 2 条一模一样的数据,然后上游删除或修改了一条,下游由于无法区分到底是哪条需要删除或修改,会出现这 2 条都被删除或修改的情况。 没主键要类似去重的效果本身有点儿自相矛盾,就像辨别西游记里的真假悟空,到底哪个是真的 ## SeaTunnel 是否支持自动建表? 在同步任务启动之前,可以为目标端已有的表结构选择不同的处理方案。是通过 `schema_save_mode` 参数来控制的。 `schema_save_mode` 有以下几种方式可选: - **`RECREATE_SCHEMA`**:当表不存在时会创建,若表已存在则删除并重新创建。 - **`CREATE_SCHEMA_WHEN_NOT_EXIST`**:当表不存在时会创建,若表已存在则跳过创建。 - **`ERROR_WHEN_SCHEMA_NOT_EXIST`**:当表不存在时会报错。 - **`IGNORE`**:忽略对表的处理。 目前很多 connector 已经支持了自动建表,请参考对应的 connector 文档,这里拿 Jdbc 举例,请参考 [Jdbc sink](https://seatunnel.apache.org/docs/connectors/sink/Jdbc#schema_save_mode-enum) ## SeaTunnel 是否支持数据同步任务开始前对已有数据进行处理? 
在同步任务启动之前,可以为目标端已有的数据选择不同的处理方案。是通过 `data_save_mode` 参数来控制的。 `data_save_mode` 有以下几种可选项: - **`DROP_DATA`**:保留数据库结构,删除数据。 - **`APPEND_DATA`**:保留数据库结构,保留数据。 - **`CUSTOM_PROCESSING`**:用户自定义处理。 - **`ERROR_WHEN_DATA_EXISTS`**:当存在数据时,报错。 目前很多 connector 已经支持了对已有数据进行处理,请参考对应的 connector 文档,这里拿 Jdbc 举例,请参考 [Jdbc sink](https://seatunnel.apache.org/docs/connectors/sink/Jdbc#data_save_mode-enum) ## SeaTunnel 是否支持精确一致性管理? SeaTunnel 支持一部分数据源的精确一致性,例如支持 MySQL、PostgreSQL 等数据库的事务写入,确保数据在同步过程中的一致性,另外精确一致性也要看数据库本身是否可以支持 ## SeaTunnel 可以定期执行任务吗? 您可以通过使用 linux 自带 cron 能力来实现定时数据同步任务,也可以结合 DolphinScheduler 等调度工具实现复杂的定时任务管理。 ## 我有一个问题,我自己无法解决 我在使用 SeaTunnel 时遇到了问题,无法自行解决。 我应该怎么办?有以下几种方式 1、在[问题列表](https://github.com/apache/seatunnel/issues)或[邮件列表](https://lists.apache.org/list.html?dev@seatunnel.apache.org)中搜索看看是否有人已经问过同样的问题并得到答案。 2、如果您找不到问题的答案,您可以通过[这些方式](https://github.com/apache/seatunnel#contact-us)联系社区成员寻求帮助。 3、中国用户可以添加微信群助手:seatunnel1,加入社区交流群,也欢迎大家关注微信公众号:seatunnel。 ## 如何声明变量? 您想知道如何在 SeaTunnel 的配置中声明一个变量,然后在运行时动态替换该变量的值吗? 该功能常用于定时或非定时离线处理,以替代时间、日期等变量。 用法如下: 在配置中配置变量名称。 下面是一个sql转换的例子(实际上,配置文件中任何地方“key = value”中的值都可以使用变量替换): ``` ... transform { Sql { query = "select * from dual where city ='${city}' and dt = '${date}'" } } ... ``` 以使用 SeaTunnel Zeta Local模式为例,启动命令如下: ```bash $SEATUNNEL_HOME/bin/seatunnel.sh \ -c $SEATUNNEL_HOME/config/your_app.conf \ -m local[2] \ -i city=Singapore \ -i date=20231110 ``` 您可以使用参数“-i”或“--variable”后跟“key=value”来指定变量的值,其中key需要与配置中的变量名称相同。详情可以参考:https://seatunnel.apache.org/docs/introduction/concepts/config ## 如何在配置文件中写入多行文本的配置项? 当配置的文本很长并且想要将其换行时,您可以使用三个双引号来指示其开始和结束: ``` var = """ Apache SeaTunnel is a next-generation high-performance, distributed, massive data integration tool. """ ``` ## 如何实现多行文本的变量替换? 在多行文本中进行变量替换有点麻烦,因为变量不能包含在三个双引号中: ``` var = """ your string 1 """${you_var}""" your string 2""" ``` 请参阅:[lightbend/config#456](https://github.com/lightbend/config/issues/456)。 ## 如果想学习 SeaTunnel 的源代码,应该从哪里开始? 
SeaTunnel 拥有完全抽象、结构化的非常优秀的架构设计和代码实现,很多用户都选择 SeaTunnel 作为学习大数据架构的方式。 您可以从`seatunnel-examples`模块开始了解和调试源代码:SeaTunnelEngineLocalExample.java 具体参考:https://seatunnel.apache.org/docs/developer/setup 针对中国用户,如果有伙伴想贡献自己的一份力量让 SeaTunnel 更好,特别欢迎加入社区贡献者种子群,欢迎添加微信:davidzollo,添加时请注明 "参与开源共建", 群仅仅用于技术交流, 重要的事情讨论还请发到 dev@seatunnel.apache.org 邮件里进行讨论。 ## 如果想开发自己的 source、sink、transform 时,是否需要了解 SeaTunnel 所有源代码? 不需要,您只需要关注 source、sink、transform 对应的接口即可。 如果你想针对 SeaTunnel API 开发自己的连接器(Connector V2),请查看**[Connector Development Guide](https://github.com/apache/seatunnel/blob/dev/seatunnel-connectors-v2/README.zh.md)** 。 ================================================ FILE: docs/zh/getting-started/docker/docker.md ================================================ --- sidebar_position: 3 --- # 使用Docker进行部署 ## 使用Docker启用本地模式 ### Zeta 引擎 #### 下载镜像 ```shell docker pull apache/seatunnel: ``` 当下载完成后,可以使用如下命令来提交任务 ```shell # Run fake source to console sink docker run --rm -it apache/seatunnel: ./bin/seatunnel.sh -m local -c config/v2.batch.config.template # Run job with custom config file docker run --rm -it -v //:/config apache/seatunnel: ./bin/seatunnel.sh -m local -c /config/fake_to_console.conf # Example # If you config file is in /tmp/job/fake_to_console.conf docker run --rm -it -v /tmp/job/:/config apache/seatunnel: ./bin/seatunnel.sh -m local -c /config/fake_to_console.conf # Set JVM options when running docker run --rm -it -v /tmp/job/:/config apache/seatunnel: ./bin/seatunnel.sh -DJvmOption="-Xms4G -Xmx4G" -m local -c /config/fake_to_console.conf ``` #### 自己构建镜像 从源代码构建。下载源码的方式和下载二进制包的方式是一样的。 你可以从[下载地址](https://seatunnel.apache.org/download/)下载源码, 或者从[GitHub 仓库](https://github.com/apache/seatunnel/releases)克隆源代码 ##### 一个命令来构建容器 ```shell cd seatunnel # Use already sett maven profile mvn -B clean install -Dmaven.test.skip=true -Dmaven.javadoc.skip=true -Dlicense.skipAddThirdParty=true -D"docker.build.skip"=false -D"docker.verify.skip"=false -D"docker.push.skip"=true -D"docker.tag"=3.0.0 
-Dmaven.deploy.skip -D"skip.spotless"=true --no-snapshot-updates -Pdocker,seatunnel # Check the docker image docker images | grep apache/seatunnel ``` ##### 分步骤构建 ```shell # Build binary package from source code mvn clean package -DskipTests -Dskip.spotless=true # Build docker image cd seatunnel-dist docker build -f src/main/docker/Dockerfile --build-arg VERSION=3.0.0 -t apache/seatunnel:3.0.0 . # If you build from dev branch, you should add SNAPSHOT suffix to the version docker build -f src/main/docker/Dockerfile --build-arg VERSION=3.0.0-SNAPSHOT -t apache/seatunnel:3.0.0-SNAPSHOT . # Check the docker image docker images | grep apache/seatunnel ``` Dockerfile文件内容为: ```dockerfile FROM openjdk:8 ARG VERSION # Build from Source Code And Copy it into image COPY ./target/apache-seatunnel-${VERSION}-bin.tar.gz /opt/ # Download From Internet # Please Note this file only include fake/console connector, You'll need to download the other connectors manually # wget -P /opt https://dlcdn.apache.org/seatunnel/${VERSION}/apache-seatunnel-${VERSION}-bin.tar.gz RUN cd /opt && \ tar -zxvf apache-seatunnel-${VERSION}-bin.tar.gz && \ mv apache-seatunnel-${VERSION} seatunnel && \ rm apache-seatunnel-${VERSION}-bin.tar.gz && \ sed -i 's/#rootLogger.appenderRef.consoleStdout.ref/rootLogger.appenderRef.consoleStdout.ref/' seatunnel/config/log4j2.properties && \ sed -i 's/#rootLogger.appenderRef.consoleStderr.ref/rootLogger.appenderRef.consoleStderr.ref/' seatunnel/config/log4j2.properties && \ sed -i 's/rootLogger.appenderRef.file.ref/#rootLogger.appenderRef.file.ref/' seatunnel/config/log4j2.properties && \ cp seatunnel/config/hazelcast-master.yaml seatunnel/config/hazelcast-worker.yaml WORKDIR /opt/seatunnel ``` ### Spark/Flink引擎 #### 挂载 Spark/Flink 默认设值下,Spark的目录为`/opt/spark`, Flink的目录为 `/opt/flink`. 如果你需要运行Spark或Flink引擎,你需要将相关依赖挂载到`/opt/spark`或`/opt/flink`目录下. ```shell docker run \ -v :/opt/spark \ -v :/opt/flink \ ... 
``` 或者你可以在Dockerfile中修改 `SPARK_HOME`, `FLINK_HOME`环境变量,并且重新构建基础镜像,然后再进行挂载. ```dockerfile FROM apache/seatunnel ENV SPARK_HOME= ... ``` ```shell docker run \ -v : \ ... ``` ### 提交任务 不同引擎和同一引擎的不同版本命令不同,请选择正确的命令。 - Spark ```shell # spark2 docker run --rm -it apache/seatunnel bash ./bin/start-seatunnel-spark-2-connector-v2.sh -c config/v2.batch.config.template # spark3 docker run --rm -it apache/seatunnel bash ./bin/start-seatunnel-spark-3-connector-v2.sh -c config/v2.batch.config.template ``` - Flink 在提交作业之前,您需要先启动 Flink 集群。 ```shell # flink version between `1.12.x` and `1.14.x` docker run --rm -it apache/seatunnel bash -c '/bin/start-cluster.sh && ./bin/start-seatunnel-flink-13-connector-v2.sh -c config/v2.streaming.conf.template' # flink version between `1.15.x` and `1.16.x` docker run --rm -it apache/seatunnel bash -c '/bin/start-cluster.sh && ./bin/start-seatunnel-flink-15-connector-v2.sh -c config/v2.streaming.conf.template' ``` ## 使用Docker配置集群模式 docker下的集群模式仅支持Zeta引擎 有两种方式来启动集群 ### 直接使用Docker #### 创建一个network ```shell docker network create seatunnel-network ``` #### 启动节点 - 启动master节点 ```shell ## start master and export 5801 port docker run -d --name seatunnel_master \ --network seatunnel-network \ --rm \ -p 5801:5801 \ apache/seatunnel \ ./bin/seatunnel-cluster.sh -r master ``` - 获取容器的ip ```shell docker inspect seatunnel_master ``` 运行此命令获取master容器的ip - 启动worker节点 ```shell # 将ST_DOCKER_MEMBER_LIST设置为master容器的ip docker run -d --name seatunnel_worker_1 \ --network seatunnel-network \ --rm \ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ apache/seatunnel \ ./bin/seatunnel-cluster.sh -r worker ## 启动第二个worker节点 # 将ST_DOCKER_MEMBER_LIST设置为master容器的ip docker run -d --name seatunnel_worker_2 \ --network seatunnel-network \ --rm \ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ apache/seatunnel \ ./bin/seatunnel-cluster.sh -r worker ``` #### 集群扩容 ```shell # 将ST_DOCKER_MEMBER_LIST设置为已经启动的master容器的ip docker run -d --name seatunnel_master \ --network seatunnel-network \ --rm \ -e 
ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ apache/seatunnel \ ./bin/seatunnel-cluster.sh -r master ``` 运行这个命令创建一个worker节点 ```shell # 将ST_DOCKER_MEMBER_LIST设置为master容器的ip docker run -d --name seatunnel_worker_1 \ --network seatunnel-network \ --rm \ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ apache/seatunnel \ ./bin/seatunnel-cluster.sh -r worker ``` ### 使用docker-compose `docker-compose.yaml` 配置文件为: ```yaml version: '3.8' services: master: image: apache/seatunnel container_name: seatunnel_master environment: - ST_DOCKER_MEMBER_LIST=172.16.0.2,172.16.0.3,172.16.0.4 entrypoint: > /bin/sh -c " /opt/seatunnel/bin/seatunnel-cluster.sh -r master " ports: - "5801:5801" networks: seatunnel_network: ipv4_address: 172.16.0.2 worker1: image: apache/seatunnel container_name: seatunnel_worker_1 environment: - ST_DOCKER_MEMBER_LIST=172.16.0.2,172.16.0.3,172.16.0.4 entrypoint: > /bin/sh -c " /opt/seatunnel/bin/seatunnel-cluster.sh -r worker " depends_on: - master networks: seatunnel_network: ipv4_address: 172.16.0.3 worker2: image: apache/seatunnel container_name: seatunnel_worker_2 environment: - ST_DOCKER_MEMBER_LIST=172.16.0.2,172.16.0.3,172.16.0.4 entrypoint: > /bin/sh -c " /opt/seatunnel/bin/seatunnel-cluster.sh -r worker " depends_on: - master networks: seatunnel_network: ipv4_address: 172.16.0.4 networks: seatunnel_network: driver: bridge ipam: config: - subnet: 172.16.0.0/24 ``` 运行 `docker-compose up`命令来启动集群,该配置会启动一个master节点,2个worker节点 启动完成后,可以运行`docker logs -f seatunnel_master`, `docker logs -f seatunnel_worker_1`来查看节点的日志 当你访问`http://localhost:5801/hazelcast/rest/maps/system-monitoring-information` 时,可以看到集群的状态为1个master节点,2个worker节点. 
#### 集群扩容 当你需要对集群扩容, 例如需要添加一个worker节点时 ```yaml version: '3.8' services: master: image: apache/seatunnel container_name: seatunnel_master environment: - ST_DOCKER_MEMBER_LIST=172.16.0.2,172.16.0.3,172.16.0.4 entrypoint: > /bin/sh -c " /opt/seatunnel/bin/seatunnel-cluster.sh -r master " ports: - "5801:5801" networks: seatunnel_network: ipv4_address: 172.16.0.2 worker1: image: apache/seatunnel container_name: seatunnel_worker_1 environment: - ST_DOCKER_MEMBER_LIST=172.16.0.2,172.16.0.3,172.16.0.4 entrypoint: > /bin/sh -c " /opt/seatunnel/bin/seatunnel-cluster.sh -r worker " depends_on: - master networks: seatunnel_network: ipv4_address: 172.16.0.3 worker2: image: apache/seatunnel container_name: seatunnel_worker_2 environment: - ST_DOCKER_MEMBER_LIST=172.16.0.2,172.16.0.3,172.16.0.4 entrypoint: > /bin/sh -c " /opt/seatunnel/bin/seatunnel-cluster.sh -r worker " depends_on: - master networks: seatunnel_network: ipv4_address: 172.16.0.4 #### ## 添加新节点配置 #### worker3: image: apache/seatunnel container_name: seatunnel_worker_3 environment: - ST_DOCKER_MEMBER_LIST=172.16.0.2,172.16.0.3,172.16.0.4,172.16.0.5 # 添加ip到这里 entrypoint: > /bin/sh -c " /opt/seatunnel/bin/seatunnel-cluster.sh -r worker " depends_on: - master networks: seatunnel_network: ipv4_address: 172.16.0.5 # 设置新节点ip networks: seatunnel_network: driver: bridge ipam: config: - subnet: 172.16.0.0/24 ``` 然后运行`docker-compose up -d`命令, 将会新建一个worker节点, 已有的节点不会重启. 
### 提交作业到集群 #### 使用docker container作为客户端 - 提交任务 ```shell # 将ST_DOCKER_MEMBER_LIST设置为master容器的ip docker run --name seatunnel_client \ --network seatunnel-network \ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ --rm \ apache/seatunnel \ ./bin/seatunnel.sh -c config/v2.batch.config.template ``` - 查看作业列表 ```shell # 将ST_DOCKER_MEMBER_LIST设置为master容器的ip docker run --name seatunnel_client \ --network seatunnel-network \ -e ST_DOCKER_MEMBER_LIST=172.18.0.2:5801 \ --rm \ apache/seatunnel \ ./bin/seatunnel.sh -l ``` 更多其他命令请参考[命令行工具](../../engines/zeta/user-command.md) #### 使用RestAPI 请参考 [提交作业](../../engines/zeta/rest-api-v2.md#提交作业) ================================================ FILE: docs/zh/getting-started/kubernetes/helm.md ================================================ --- sidebar_position: 4 --- import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; # 使用Helm部署 使用Helm快速部署Seatunnel集群。 ## 准备 我们假设您的本地已经安装如下软件: - [docker](https://docs.docker.com/) - [kubernetes](https://kubernetes.io/) - [helm](https://helm.sh/docs/intro/quickstart/) 在您的本地环境中能够正常执行`kubectl`和`helm`命令。 以 [minikube](https://minikube.sigs.k8s.io/docs/start/) 为例, 您可以使用如下命令启动一个集群: ```bash minikube start --kubernetes-version=v1.23.3 ``` ## 安装 使用默认配置安装 ```bash # Choose the corresponding version yourself export VERSION=2.3.10 helm pull oci://registry-1.docker.io/apache/seatunnel-helm --version ${VERSION} tar -xvf seatunnel-helm-${VERSION}.tgz cd seatunnel-helm helm install seatunnel . ``` 如果您需要使用其他命名空间进行安装。 ``` helm install seatunnel . -n <your namespace> ``` ## 提交任务 当前默认的配置没有启用ingress,所以需要使用转发命令将master的restapi端口转发出来。 ```bash kubectl port-forward -n default svc/seatunnel-master 5801:5801 ``` 然后可以通过地址 "http://127.0.0.1:5801/" 访问master的restapi。 如果想要使用ingress, 需要更新 `values.yaml` 例如: ```commandline ingress: enabled: true host: "<your domain>" ``` 然后更新seatunnel。 就可以使用域名`http://<your domain>`进行访问了。 或者您可以直接进入master的POD执行curl命令。
```commandline # 获取其中一个master pod MASTER_POD=$(kubectl get po -l 'app.kubernetes.io/name=seatunnel-master' | sed '1d' | awk '{print $1}' | head -n1) # 进入master pod kubectl -n default exec -it $MASTER_POD -- /bin/bash # 执行 restapi curl http://127.0.0.1:5801/running-jobs curl http://127.0.0.1:5801/system-monitoring-information ``` 后面就可以使用[rest-api-v2](../../engines/zeta/rest-api-v2.md)提交任务了。 ## 下一步 到现在为止,您已经安装好Seatunnel集群了,你可以查看Seatunnel有哪些[连接器](../../connectors). 或者选择其他方式 [部署](../../engines/zeta/deployment.md). ================================================ FILE: docs/zh/getting-started/kubernetes/kubernetes.mdx ================================================ --- sidebar_position: 4 --- import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; # 使用 Kubernetes 部署 本部分提供了使用 SeaTunnel 与 Kubernetes 的快速指南。 ## 前置条件 我们假设您已经在本地安装了以下内容: - [docker](https://docs.docker.com/) - [kubernetes](https://kubernetes.io/) - [helm](https://helm.sh/docs/intro/quickstart/) 以便 `kubectl` 和 `helm` 命令在您的本地系统上可用。 以 kubernetes [minikube](https://minikube.sigs.k8s.io/docs/start/) 为例,您可以使用以下命令启动集群: ```bash minikube start --kubernetes-version=v1.23.3 ``` ## 安装 ### SeaTunnel Docker 镜像 要使用 SeaTunnel 运行镜像,首先创建一个 `Dockerfile`: ```Dockerfile FROM flink:1.13 ENV SEATUNNEL_VERSION="3.0.0" ENV SEATUNNEL_HOME="/opt/seatunnel" RUN wget https://dlcdn.apache.org/seatunnel/${SEATUNNEL_VERSION}/apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz RUN tar -xzvf apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz RUN mv apache-seatunnel-${SEATUNNEL_VERSION} ${SEATUNNEL_HOME} RUN cd ${SEATUNNEL_HOME} && sh bin/install-plugin.sh ${SEATUNNEL_VERSION} ``` 然后运行以下命令来构建镜像: ```bash docker build -t seatunnel:3.0.0-flink-1.13 -f Dockerfile . 
``` 镜像 `seatunnel:3.0.0-flink-1.13` 需要存在于主机(minikube)中,以便部署可以进行。 通过以下方式将镜像加载到 minikube: ```bash minikube image load seatunnel:3.0.0-flink-1.13 ``` ```Dockerfile FROM openjdk:8 ENV SEATUNNEL_VERSION="3.0.0" ENV SEATUNNEL_HOME="/opt/seatunnel" RUN wget https://dlcdn.apache.org/seatunnel/${SEATUNNEL_VERSION}/apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz RUN tar -xzvf apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz RUN mv apache-seatunnel-${SEATUNNEL_VERSION} ${SEATUNNEL_HOME} RUN cd ${SEATUNNEL_HOME} && sh bin/install-plugin.sh ${SEATUNNEL_VERSION} ``` 然后运行以下命令来构建镜像: ```bash docker build -t seatunnel:3.0.0 -f Dockerfile . ``` 镜像 `seatunnel:3.0.0` 需要存在于主机(minikube)中,以便部署可以进行。 通过以下方式将镜像加载到 minikube: ```bash minikube image load seatunnel:3.0.0 ``` ```Dockerfile FROM openjdk:8 ENV SEATUNNEL_VERSION="3.0.0" ENV SEATUNNEL_HOME="/opt/seatunnel" RUN wget https://dlcdn.apache.org/seatunnel/${SEATUNNEL_VERSION}/apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz RUN tar -xzvf apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz RUN mv apache-seatunnel-${SEATUNNEL_VERSION} ${SEATUNNEL_HOME} RUN mkdir -p $SEATUNNEL_HOME/logs RUN cd ${SEATUNNEL_HOME} && sh bin/install-plugin.sh ${SEATUNNEL_VERSION} ``` 然后运行以下命令来构建镜像: ```bash docker build -t seatunnel:3.0.0 -f Dockerfile . 
``` 镜像 `seatunnel:3.0.0` 需要存在于主机(minikube)中,以便部署可以进行。 通过以下方式将镜像加载到 minikube: ```bash minikube image load seatunnel:3.0.0 ``` ### 部署 Operator 以下步骤提供了设置 Flink Kubernetes Operator 的快速演练。 您可以参考 [Flink Kubernetes Operator - Quick Start](https://nightlies.apache.org/flink/flink-kubernetes-operator-docs-main/docs/try-flink-kubernetes-operator/quick-start/) 了解更多详情。 > 注意:以下所有 Kubernetes 资源都在默认命名空间中创建。 在您的 Kubernetes 集群上安装 cert-manager(证书管理器),以便添加 webhook 组件(每个 Kubernetes 集群只需一次): ```bash kubectl create -f https://github.com/jetstack/cert-manager/releases/download/v1.8.2/cert-manager.yaml ``` 现在您可以使用包含的 Helm chart 部署最新稳定版的 Flink Kubernetes Operator: ```bash helm repo add flink-operator-repo https://downloads.apache.org/flink/flink-kubernetes-operator-1.3.1/ helm install flink-kubernetes-operator flink-operator-repo/flink-kubernetes-operator \ --set image.repository=apache/flink-kubernetes-operator ``` 您可以通过 `kubectl` 验证您的安装: ```bash kubectl get pods NAME READY STATUS RESTARTS AGE flink-kubernetes-operator-5f466b8549-mgchb 1/1 Running 3 (23h ago) 16d ``` ## 运行 SeaTunnel 应用 **运行应用**:SeaTunnel 已经提供了开箱即用的 [配置](https://github.com/apache/seatunnel/tree/dev/config)。 在本指南中,我们将使用 [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/3.0.0-release/config/v2.streaming.conf.template): ```conf env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 2000 } source { FakeSource { plugin_output = "fake" row.num = 160000 schema = { fields { name = "string" age = "int" } } } } transform { FieldMapper { plugin_input = "fake" plugin_output = "fake1" field_mapper = { age = age name = new_name } } } sink { Console { plugin_input = "fake1" } } ``` 在 Kubernetes 中为 seatunnel.streaming.conf 生成一个名为 seatunnel-config 的 configmap,以便我们可以在 pod 中挂载配置内容。 ```bash kubectl create cm seatunnel-config \ --from-file=seatunnel.streaming.conf=seatunnel.streaming.conf ``` 一旦 Flink Kubernetes Operator 按照前面的步骤运行,您就可以提交一个 Flink(SeaTunnel)作业: - 创建 `seatunnel-flink.yaml` FlinkDeployment 清单: ```yaml apiVersion: 
flink.apache.org/v1beta1 kind: FlinkDeployment metadata: name: seatunnel-flink-streaming-example spec: image: seatunnel:3.0.0-flink-1.13 flinkVersion: v1_13 flinkConfiguration: taskmanager.numberOfTaskSlots: "2" serviceAccount: flink jobManager: replicas: 1 resource: memory: "1024m" cpu: 1 taskManager: resource: memory: "1024m" cpu: 1 podTemplate: spec: containers: - name: flink-main-container volumeMounts: - name: seatunnel-config mountPath: /data/seatunnel.streaming.conf subPath: seatunnel.streaming.conf volumes: - name: seatunnel-config configMap: name: seatunnel-config items: - key: seatunnel.streaming.conf path: seatunnel.streaming.conf job: jarURI: local:///opt/seatunnel/starter/seatunnel-flink-13-starter.jar entryClass: org.apache.seatunnel.core.starter.flink.SeaTunnelFlink args: ["--config", "/data/seatunnel.streaming.conf"] parallelism: 2 upgradeMode: stateless ``` - 运行示例应用: ```bash kubectl apply -f seatunnel-flink.yaml ``` 在本指南中,我们将使用 [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/3.0.0-release/config/v2.streaming.conf.template): ```conf env { parallelism = 2 job.mode = "STREAMING" checkpoint.interval = 2000 } source { FakeSource { parallelism = 2 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } sink { Console { } } ``` 在 Kubernetes 中为 seatunnel.streaming.conf 生成一个名为 seatunnel-config 的 configmap,以便我们可以在 pod 中挂载配置内容。 ```bash kubectl create cm seatunnel-config \ --from-file=seatunnel.streaming.conf=seatunnel.streaming.conf ``` - 创建 `seatunnel.yaml`: ```yaml apiVersion: v1 kind: Pod metadata: name: seatunnel spec: containers: - name: seatunnel image: seatunnel:3.0.0 command: ["/bin/sh","-c","/opt/seatunnel/bin/seatunnel.sh --config /data/seatunnel.streaming.conf -e local"] resources: limits: cpu: "1" memory: 4G requests: cpu: "1" memory: 2G volumeMounts: - name: seatunnel-config mountPath: /data/seatunnel.streaming.conf subPath: seatunnel.streaming.conf volumes: - name: seatunnel-config 
configMap: name: seatunnel-config items: - key: seatunnel.streaming.conf path: seatunnel.streaming.conf ``` - 运行示例应用: ```bash kubectl apply -f seatunnel.yaml ``` 在本指南中,我们将使用 [seatunnel.streaming.conf](https://github.com/apache/seatunnel/blob/3.0.0-release/config/v2.streaming.conf.template): ```conf env { parallelism = 2 job.mode = "STREAMING" checkpoint.interval = 2000 } source { FakeSource { parallelism = 2 plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } sink { Console { } } ``` 在 Kubernetes 中为 seatunnel.streaming.conf 生成一个名为 seatunnel-config 的 configmap,以便我们可以在 pod 中挂载配置内容。 ```bash kubectl create cm seatunnel-config \ --from-file=seatunnel.streaming.conf=seatunnel.streaming.conf ``` 然后,我们使用以下命令将 seatunnel 集群使用的一些配置文件加载到 configmap 中 在本地创建 yaml 文件如下 - 创建 `hazelcast-client.yaml`: ```yaml hazelcast-client: cluster-name: seatunnel properties: hazelcast.logging.type: log4j2 network: cluster-members: - localhost:5801 ``` - 创建 `hazelcast.yaml`: ```yaml hazelcast: cluster-name: seatunnel network: rest-api: enabled: true endpoint-groups: CLUSTER_WRITE: enabled: true DATA: enabled: true join: tcp-ip: enabled: true member-list: - localhost port: auto-increment: false port: 5801 properties: hazelcast.invocation.max.retry.count: 20 hazelcast.tcp.join.port.try.count: 30 hazelcast.logging.type: log4j2 hazelcast.operation.generic.thread.count: 50 ``` - 创建 `seatunnel.yaml`: ```yaml seatunnel: engine: history-job-expire-minutes: 1440 backup-count: 1 queue-type: blockingqueue print-execution-info-interval: 60 print-job-metrics-info-interval: 60 slot-service: dynamic-slot: true checkpoint: interval: 10000 timeout: 60000 storage: type: hdfs max-retained: 3 plugin-config: namespace: /tmp/seatunnel/checkpoint_snapshot storage.type: hdfs fs.defaultFS: file:///tmp/ # 确保目录具有写入权限 ``` 使用以下命令为配置文件创建 configmaps ```bash kubectl create configmap hazelcast-client --from-file=hazelcast-client.yaml kubectl create configmap hazelcast 
--from-file=hazelcast.yaml kubectl create configmap seatunnelmap --from-file=seatunnel.yaml ``` 部署 Reloader 以实现热部署 我们在这里使用 Reloader 在修改配置文件或进行其他修改时自动重启 pod。您也可以直接给出配置文件的值,不使用 Reloader - [Reloader](https://github.com/stakater/Reloader/) ```bash wget https://raw.githubusercontent.com/stakater/Reloader/master/deployments/kubernetes/reloader.yaml kubectl apply -f reloader.yaml ``` - 创建 `seatunnel-cluster.yml`: ```yaml apiVersion: v1 kind: Service metadata: name: seatunnel spec: selector: app: seatunnel ports: - port: 5801 name: seatunnel clusterIP: None --- apiVersion: apps/v1 kind: StatefulSet metadata: name: seatunnel annotations: configmap.reloader.stakater.com/reload: "hazelcast,hazelcast-client,seatunnelmap" spec: serviceName: "seatunnel" replicas: 3 # 根据您的情况修改副本数 selector: matchLabels: app: seatunnel template: metadata: labels: app: seatunnel spec: containers: - name: seatunnel image: seatunnel:3.0.0 imagePullPolicy: IfNotPresent ports: - containerPort: 5801 name: client command: ["/bin/sh","-c","/opt/seatunnel/bin/seatunnel-cluster.sh -DJvmOption=-Xms2G -Xmx2G"] resources: limits: cpu: "1" memory: 4G requests: cpu: "1" memory: 2G volumeMounts: - mountPath: "/opt/seatunnel/config/hazelcast.yaml" name: hazelcast subPath: hazelcast.yaml - mountPath: "/opt/seatunnel/config/hazelcast-client.yaml" name: hazelcast-client subPath: hazelcast-client.yaml - mountPath: "/opt/seatunnel/config/seatunnel.yaml" name: seatunnelmap subPath: seatunnel.yaml - mountPath: /data/seatunnel.streaming.conf name: seatunnel-config subPath: seatunnel.streaming.conf volumes: - name: hazelcast configMap: name: hazelcast items: - key: hazelcast.yaml path: hazelcast.yaml - name: hazelcast-client configMap: name: hazelcast-client items: - key: hazelcast-client.yaml path: hazelcast-client.yaml - name: seatunnelmap configMap: name: seatunnelmap items: - key: seatunnel.yaml path: seatunnel.yaml - name: seatunnel-config configMap: name: seatunnel-config items: - key: seatunnel.streaming.conf path: 
seatunnel.streaming.conf ``` - 运行示例应用: ```bash kubectl apply -f seatunnel-cluster.yml ``` **查看输出** 您可以在成功启动后跟踪您的作业日志(在新环境中可能需要大约一分钟,之后需要几秒钟),您可以: ```bash kubectl logs -f deploy/seatunnel-flink-streaming-example ``` 看起来如下: ```shell ... 2023-01-31 12:13:54,349 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Source: SeaTunnel FakeSource -> Sink Writer: Console (1/1) (1665d2d011b2f6cf6525c0e5e75ec251) switched from SCHEDULED to DEPLOYING. 2023-01-31 12:13:56,684 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Deploying Source: SeaTunnel FakeSource -> Sink Writer: Console (1/1) (attempt #0) with attempt id 1665d2d011b2f6cf6525c0e5e75ec251 to seatunnel-flink-streaming-example-taskmanager-1-1 @ 100.103.244.106 (dataPort=39137) with allocation id fbe162650c4126649afcdaff00e46875 2023-01-31 12:13:57,794 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Source: SeaTunnel FakeSource -> Sink Writer: Console (1/1) (1665d2d011b2f6cf6525c0e5e75ec251) switched from DEPLOYING to INITIALIZING. 2023-01-31 12:13:58,203 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph [] - Source: SeaTunnel FakeSource -> Sink Writer: Console (1/1) (1665d2d011b2f6cf6525c0e5e75ec251) switched from INITIALIZING to RUNNING. ``` 如果日志中出现 OOM 错误,您可以在 seatunnel.streaming.conf 中减少 `row.num` 值 要公开 Flink Dashboard,您可以添加端口转发规则: ```bash kubectl port-forward svc/seatunnel-flink-streaming-example-rest 8081 ``` 现在可以在 [localhost:8081](http://localhost:8081) 访问 Flink Dashboard。 或启动 `minikube dashboard` 以获得基于 Web 的 Kubernetes 用户界面。 TaskManager Stdout 日志中打印的内容: ```bash kubectl logs \ -l 'app in (seatunnel-flink-streaming-example), component in (taskmanager)' \ --tail=-1 \ -f ``` 看起来如下(您的内容可能不同,因为我们使用 `FakeSource` 自动生成随机流数据): ```shell ... 
subtaskIndex=0: row=159991 : VVgpp, 978840000 subtaskIndex=0: row=159992 : JxrOC, 1493825495 subtaskIndex=0: row=159993 : YmCZR, 654146216 subtaskIndex=0: row=159994 : LdmUn, 643140261 subtaskIndex=0: row=159995 : tURkE, 837012821 subtaskIndex=0: row=159996 : uPDfd, 2021489045 subtaskIndex=0: row=159997 : mjrdG, 2074957853 subtaskIndex=0: row=159998 : xbeUi, 864518418 subtaskIndex=0: row=159999 : sSWLb, 1924451911 subtaskIndex=0: row=160000 : AuPlM, 1255017876 ``` 要停止您的作业并删除您的 FlinkDeployment,您可以简单地: ```bash kubectl delete -f seatunnel-flink.yaml ``` 您可以在成功启动后跟踪您的作业日志(在新环境中可能需要大约一分钟,之后需要几秒钟),您可以: ```bash kubectl logs -f seatunnel ``` 看起来如下(您的内容可能不同,因为我们使用 `FakeSource` 自动生成随机流数据): ```shell ... 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25673: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : hRJdE, 1295862507 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25674: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : kXlew, 935460726 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25675: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : FrNOT, 1714358118 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25676: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : kSajX, 126709414 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25677: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : YhpQv, 2020198351 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25678: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : nApin, 691339553 2023-10-07 08:20:12,797 INFO 
org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25679: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : KZNNa, 1720773736 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25680: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : uCUBI, 490868386 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25681: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : oTLmO, 98770781 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25682: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : UECud, 835494636 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25683: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : XNegY, 1602828896 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25684: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : LcFBx, 1400869177 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25685: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : EqSfF, 1933614060 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25686: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : BODIs, 1839533801 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25687: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : doxcI, 970104616 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25688: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : IEVYn, 371893767 2023-10-07 
08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25689: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : YXYfq, 1719257882 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25690: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : LFWEm, 725033360 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25691: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : ypUrY, 1591744616 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25692: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : rlnzJ, 412162913 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25693: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : zWKnt, 976816261 2023-10-07 08:20:12,797 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=25694: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : PXrsk, 43554541 ``` 要停止您的作业并删除您的 FlinkDeployment,您可以简单地: ```bash kubectl delete -f seatunnel.yaml ``` 您可以在成功启动后跟踪您的作业日志(在新环境中可能需要大约一分钟,之后需要几秒钟),您可以: ```bash kubectl exec -it seatunnel-1 -- tail -f /opt/seatunnel/logs/seatunnel-engine-server.log | grep ConsoleSinkWriter ``` 看起来如下(您的内容可能不同,因为我们使用 `FakeSource` 自动生成随机流数据): ```shell ... 
2023-10-10 08:05:07,283 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=7: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : IibHk, 820962465 2023-10-10 08:05:07,283 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=8: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : lmKdb, 1072498088 2023-10-10 08:05:07,283 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=9: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : iqGva, 918730371 2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=10: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : JMHmq, 1130771733 2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=11: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : rxoHF, 189596686 2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=12: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : OSblw, 559472064 2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=13: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : yTZjG, 1842482272 2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=14: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : RRiMg, 1713777214 2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=15: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : lRcsd, 1626041649 2023-10-10 08:05:07,284 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=1 rowIndex=16: SeaTunnelRow#tableId= SeaTunnelRow#kind=INSERT : QrNNW, 41355294 ``` 要停止您的作业并删除您的 
SeaTunnel 集群,您可以简单地: ```bash kubectl delete -f seatunnel-cluster.yml ``` 祝您 SeaTunnel 使用愉快! ## 更多内容 现在,您已经快速了解了 SeaTunnel,您可以查看 [连接器](../../connectors/source) 以找到 SeaTunnel 支持的所有源和汇。 或者如果您想在另一种引擎集群中提交您的应用程序,请查看 [部署](../deployment.mdx)。 ================================================ FILE: docs/zh/getting-started/locally/deployment.md ================================================ --- sidebar_position: 1 --- import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; # 部署 ## 准备工作 在开始本地运行前,您需要确保您已经安装了SeaTunnel所需要的以下软件: * 安装[Java](https://www.java.com/en/download/) (Java 8 或 11, 其他高于Java 8的版本理论上也可以工作) 以及设置 `JAVA_HOME`。 ## 下载 SeaTunnel 发行包 ### 下载二进制包 进入[SeaTunnel下载页面](https://seatunnel.apache.org/download)下载最新版本的二进制安装包`seatunnel-<version>-bin.tar.gz` 或者您也可以通过终端下载: ```shell export version="3.0.0" wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-${version}-bin.tar.gz" tar -xzvf "apache-seatunnel-${version}-bin.tar.gz" ``` ### 下载连接器插件 从2.2.0-beta版本开始,二进制包不再默认提供连接器依赖,因此在第一次使用时,您需要执行以下命令来安装连接器:(当然,您也可以从 [Apache Maven Repository](https://repo.maven.apache.org/maven2/org/apache/seatunnel/) 手动下载连接器,然后将其移动至`connectors/`目录下,如果是2.3.5之前则需要放入`connectors/seatunnel`目录下)。 ```bash sh bin/install-plugin.sh ``` 如果您需要指定的连接器版本,以3.0.0为例,您需要执行如下命令: ```bash sh bin/install-plugin.sh 3.0.0 ``` 通常情况下,你不需要所有的连接器插件。你可以通过配置`config/plugin_config`来指定所需的插件。例如,如果你想让示例应用程序正常工作,你将需要`connector-console`和`connector-fake`插件。你可以修改`plugin_config`配置文件,如下所示: ```plugin_config --seatunnel-connectors-- connector-fake connector-console --end-- ``` 您可以在`${SEATUNNEL_HOME}/connectors/plugins-mapping.properties`下找到所有支持的连接器和相应的plugin_config配置名称。 :::tip 提示 如果您想通过手动下载连接器的方式来安装连接器插件,则只需下载您所需要的连接器插件,并将它们放在`${SEATUNNEL_HOME}/connectors/`目录下。 ::: ## 从源码构建SeaTunnel ### 下载源码 从源码构建SeaTunnel。下载源码的方式与下载二进制包的方式相同。 您可以从[下载页面](https://seatunnel.apache.org/download/)下载源码,或者从[GitHub仓库](https://github.com/apache/seatunnel/releases)克隆源码。 ### 构建源码 ```shell cd seatunnel sh ./mvnw clean install 
-DskipTests -Dskip.spotless=true # 获取构建好的二进制包 cp seatunnel-dist/target/apache-seatunnel-3.0.0-bin.tar.gz /The-Path-You-Want-To-Copy cd /The-Path-You-Want-To-Copy tar -xzvf "apache-seatunnel-${version}-bin.tar.gz" ``` 当从源码构建时,所有的连接器插件和一些必要的依赖(例如:mysql驱动)都包含在二进制包中。您可以直接使用连接器插件,而无需单独安装它们。 # 启动SeaTunnel 现在您已经下载了SeaTunnel二进制包和连接器插件。接下来,您可以选择不同的引擎选项来运行同步任务。 如果您使用Flink来运行同步任务,则无需部署SeaTunnel引擎服务集群。您可以参考[Flink 引擎快速开始](quick-start-flink.md)来运行您的同步任务。 如果您使用Spark来运行同步任务,则无需部署SeaTunnel引擎服务集群。您可以参考[Spark 引擎快速开始](quick-start-spark.md)来运行您的同步任务。 如果您使用内置的SeaTunnel引擎(Zeta)来运行任务,则需要先部署SeaTunnel引擎服务。请参考[SeaTunnel 引擎快速开始](quick-start-seatunnel-engine.md)。 ================================================ FILE: docs/zh/getting-started/locally/quick-start-flink.md ================================================ --- sidebar_position: 3 --- # Flink 引擎快速开始 ## 步骤 1: 部署SeaTunnel及连接器 在开始前,请确保您已经按照[部署](deployment.md)中的描述下载并部署了SeaTunnel。 ## 步骤 2: 部署并配置Flink 请先[下载Flink](https://flink.apache.org/downloads.html)(**需要版本 >= 1.12.0**)。更多信息您可以查看[入门: Standalone模式](https://nightlies.apache.org/flink/flink-docs-release-1.14/docs/deployment/resource-providers/standalone/overview/) **配置SeaTunnel**: 修改`config/seatunnel-env.sh`中的设置,将`FLINK_HOME`配置设置为Flink的部署目录。 ## 步骤 3: 添加作业配置文件来定义作业 编辑`config/v2.streaming.conf.template`,它决定了SeaTunnel启动后数据输入、处理和输出的方式及逻辑。 下面是配置文件的示例,它与上面提到的示例应用程序相同。 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } transform { FieldMapper { plugin_input = "fake" plugin_output = "fake1" field_mapper = { age = age name = new_name } } } sink { Console { plugin_input = "fake1" } } ``` 关于配置的更多信息请查看[配置的基本概念](../../introduction/concepts/config.md) ## 步骤 4: 运行SeaTunnel应用程序 您可以通过以下命令启动应用程序: Flink版本`1.12.x`到`1.14.x` ```shell cd "apache-seatunnel-${version}" ./bin/start-seatunnel-flink-13-connector-v2.sh --config ./config/v2.streaming.conf.template ``` Flink版本`1.15.x`到`1.18.x` ```shell cd 
"apache-seatunnel-${version}" ./bin/start-seatunnel-flink-15-connector-v2.sh --config ./config/v2.streaming.conf.template ``` **查看输出**: 当您运行该命令时,您可以在控制台中看到它的输出。您可以认为这是命令运行成功或失败的标志。 SeaTunnel控制台将会打印一些如下日志信息: ```shell fields : name, age types : STRING, INT row=1 : elWaB, 1984352560 row=2 : uAtnp, 762961563 row=3 : TQEIB, 2042675010 row=4 : DcFjo, 593971283 row=5 : SenEb, 2099913608 row=6 : DHjkg, 1928005856 row=7 : eScCM, 526029657 row=8 : sgOeE, 600878991 row=9 : gwdvw, 1951126920 row=10 : nSiKE, 488708928 row=11 : xubpl, 1420202810 row=12 : rHZqb, 331185742 row=13 : rciGD, 1112878259 row=14 : qLhdI, 1457046294 row=15 : ZTkRx, 1240668386 row=16 : SGZCr, 94186144 ``` ## 此外 - 开始编写您自己的配置文件,选择您想要使用的[连接器](../../connectors/source),并根据连接器的文档配置参数。 - 如果您想要了解更多关于SeaTunnel运行在Flink上的信息,请参阅[基于Flink的SeaTunnel](../../engines/flink.md)。 - SeaTunnel有内置的`Zeta`引擎,它是作为SeaTunnel的默认引擎。您可以参考[快速开始](quick-start-seatunnel-engine.md)配置和运行数据同步作业。 ================================================ FILE: docs/zh/getting-started/locally/quick-start-seatunnel-engine.md ================================================ --- sidebar_position: 2 --- # SeaTunnel 引擎快速开始 ## 步骤 1: 部署SeaTunnel及连接器 在开始前,请确保您已经按照[部署](deployment.md)中的描述下载并部署了SeaTunnel。 ## 步骤 2: 添加作业配置文件来定义作业 编辑`config/v2.batch.config.template`,它决定了当seatunnel启动后数据输入、处理和输出的方式及逻辑。 下面是配置文件的示例,它与上面提到的示例应用程序相同。 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } transform { FieldMapper { plugin_input = "fake" plugin_output = "fake1" field_mapper = { age = age name = new_name } } } sink { Console { plugin_input = "fake1" } } ``` 关于配置的更多信息请查看[配置的基本概念](../../introduction/concepts/config.md) ## 步骤 3: 运行SeaTunnel应用程序 您可以通过以下命令启动应用程序: :::tip 从2.3.1版本开始,seatunnel.sh中的-e参数被废弃,请改用-m参数。 ::: ```shell cd "apache-seatunnel-${version}" ./bin/seatunnel.sh --config ./config/v2.batch.config.template -m local ``` **查看输出**: 
当您运行该命令时,您可以在控制台中看到它的输出。您可以认为这是命令运行成功或失败的标志。 SeaTunnel控制台将会打印一些如下日志信息: ```shell 2022-12-19 11:01:45,417 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - output rowType: name, age 2022-12-19 11:01:46,489 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=1: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: CpiOd, 8520946 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=2: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: eQqTs, 1256802974 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=3: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: UsRgO, 2053193072 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=4: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jDQJj, 1993016602 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=5: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: rqdKp, 1392682764 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=6: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: wCoWN, 986999925 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=7: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: qomTU, 72775247 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=8: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: jcqXR, 1074529204 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=9: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: AkWIO, 1961723427 
2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=10: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: hBoib, 929089763 2022-12-19 11:01:46,490 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=11: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: GSvzm, 827085798 2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=12: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: NNAYI, 94307133 2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=13: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: EexFl, 1823689599 2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=14: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: CBXUb, 869582787 2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=15: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: Wbxtm, 1469371353 2022-12-19 11:01:46,491 INFO org.apache.seatunnel.connectors.seatunnel.console.sink.ConsoleSinkWriter - subtaskIndex=0 rowIndex=16: SeaTunnelRow#tableId=-1 SeaTunnelRow#kind=INSERT: mIJDt, 995616438 ``` ## 扩展示例:从 MySQL 到 Doris 批处理模式 ### 步骤1:下载连接器 首先,您需要在`${SEATUNNEL_HOME}/config/plugin_config`文件中加入连接器名称,然后,执行命令来安装连接器(当然,您也可以从 [Apache Maven Repository](https://repo.maven.apache.org/maven2/org/apache/seatunnel/) 手动下载连接器,然后将其移动至`connectors/`目录下),最后,确认连接器`connector-jdbc`、`connector-doris`在`${SEATUNNEL_HOME}/connectors/`目录下即可。 ```bash # 配置连接器名称 --seatunnel-connectors-- connector-jdbc connector-doris --end-- ``` ```bash # 安装连接器 sh bin/install-plugin.sh ``` ### 步骤2:放入 MySQL 驱动 您需要下载 [jdbc driver jar package](https://mvnrepository.com/artifact/mysql/mysql-connector-java) 驱动,并放置在 
`${SEATUNNEL_HOME}/lib/`目录下 ### 步骤3:添加作业配置文件来定义作业 ```bash cd seatunnel/job/ vim st.conf env { parallelism = 2 job.mode = "BATCH" } source { Jdbc { url = "jdbc:mysql://localhost:3306/test" driver = "com.mysql.cj.jdbc.Driver" connection_check_timeout_sec = 100 user = "user" password = "pwd" table_path = "test.table_name" query = "select * from test.table_name" } } sink { Doris { fenodes = "doris_ip:8030" username = "user" password = "pwd" database = "test_db" table = "table_name" sink.enable-2pc = "true" sink.label-prefix = "test-cdc" doris.config = { format = "json" read_json_by_line="true" } } } ``` 关于配置的更多信息请查看[配置的基本概念](../../introduction/concepts/config.md) ### 步骤 4: 运行SeaTunnel应用程序 您可以通过以下命令启动应用程序: ```shell cd seatunnel/ ./bin/seatunnel.sh --config ./job/st.conf -m local ``` **查看输出**: 当您运行该命令时,您可以在控制台中看到它的输出。您可以认为这是命令运行成功或失败的标志。 SeaTunnel控制台将会打印一些如下日志信息: ```shell *********************************************** Job Statistic Information *********************************************** Start Time : 2024-08-13 10:21:49 End Time : 2024-08-13 10:21:53 Total Time(s) : 4 Total Read Count : 1000 Total Write Count : 1000 Total Failed Count : 0 *********************************************** ``` :::tip 如果您想优化自己的作业,请参照连接器使用文档 ::: ## 此外 - 开始编写您自己的配置文件,选择您想要使用的[连接器](../../connectors/source),并根据连接器的文档配置参数。 - 如果您想要了解更多关于SeaTunnel引擎的信息,请参阅[SeaTunnel引擎](../../engines/zeta/about.md)。 
在这里你将了解如何部署SeaTunnel Engine的集群模式以及如何在集群模式下使用。 ================================================ FILE: docs/zh/getting-started/locally/quick-start-spark.md ================================================ --- sidebar_position: 4 --- # Spark 引擎快速开始 ## 步骤 1: 部署SeaTunnel及连接器 在开始前,请确保您已经按照[部署](deployment.md)中的描述下载并部署了SeaTunnel。 ## 步骤 2: 部署并配置Spark 请先[下载Spark](https://spark.apache.org/downloads.html)(**需要版本 >= 2.4.0**)。 更多信息您可以查看[入门: Standalone模式](https://spark.apache.org/docs/latest/spark-standalone.html#installing-spark-standalone-to-a-cluster) **配置SeaTunnel**: 修改`config/seatunnel-env.sh`中的设置,它是基于你的引擎在[部署](deployment.md)时的安装路径。 将`SPARK_HOME`修改为Spark的部署目录。 ## 步骤 3: 添加作业配置文件来定义作业 编辑`config/v2.streaming.conf.template`,它决定了当SeaTunnel启动后数据输入、处理和输出的方式及逻辑。 下面是配置文件的示例,它与上面提到的示例应用程序相同。 ```hocon env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 16 schema = { fields { name = "string" age = "int" } } } } transform { FieldMapper { plugin_input = "fake" plugin_output = "fake1" field_mapper = { age = age name = new_name } } } sink { Console { plugin_input = "fake1" } } ``` 关于配置的更多信息请查看[配置的基本概念](../../introduction/concepts/config.md) ## 步骤 4: 运行SeaTunnel应用程序 您可以通过以下命令启动应用程序: Spark 2.4.x ```bash cd "apache-seatunnel-${version}" ./bin/start-seatunnel-spark-2-connector-v2.sh \ --master local[4] \ --deploy-mode client \ --config ./config/v2.streaming.conf.template ``` Spark 3.x.x ```shell cd "apache-seatunnel-${version}" ./bin/start-seatunnel-spark-3-connector-v2.sh \ --master local[4] \ --deploy-mode client \ --config ./config/v2.streaming.conf.template ``` **查看输出**: 当您运行该命令时,您可以在控制台中看到它的输出。您可以认为这是命令运行成功或失败的标志。 SeaTunnel控制台将会打印一些如下日志信息: ```shell fields : name, age types : STRING, INT row=1 : elWaB, 1984352560 row=2 : uAtnp, 762961563 row=3 : TQEIB, 2042675010 row=4 : DcFjo, 593971283 row=5 : SenEb, 2099913608 row=6 : DHjkg, 1928005856 row=7 : eScCM, 526029657 row=8 : sgOeE, 600878991 row=9 : gwdvw, 1951126920 row=10 : nSiKE, 488708928 row=11 
: xubpl, 1420202810 row=12 : rHZqb, 331185742 row=13 : rciGD, 1112878259 row=14 : qLhdI, 1457046294 row=15 : ZTkRx, 1240668386 row=16 : SGZCr, 94186144 ``` ## 此外 - 开始编写您自己的配置文件,选择您想要使用的[连接器](../../connectors/source),并根据连接器的文档配置参数。 - 如果您想要了解更多关于SeaTunnel运行在Spark上的信息,请参阅[基于Spark的SeaTunnel](../../engines/spark.md)。 - SeaTunnel有内置的`Zeta`引擎,它是作为SeaTunnel的默认引擎。您可以参考[快速开始](quick-start-seatunnel-engine.md)配置和运行数据同步作业。 ================================================ FILE: docs/zh/introduction/about.md ================================================ # 关于 SeaTunnel seatunnel logo [![Slack](../../images/seatunnel-slack.svg)](https://s.apache.org/seatunnel-slack) [![Twitter Follow](../../images/ASFSeaTunnel.svg)](https://x.com/ASFSeaTunnel) SeaTunnel是一个多模态、超高性能、分布式的海量数据集成工具,每天可稳定高效同步数百亿数据,已被数千家企业应用于生产,以其高效和稳定性深受众多企业信赖。 ## 为什么需要 SeaTunnel SeaTunnel专注于数据集成和数据同步,主要旨在解决数据集成领域的常见问题: * **数据源多样**:常用数据源有数百种,版本不兼容。 随着新技术的出现,更多的数据源不断出现。 用户很难找到一个能够全面、快速支持这些数据源的工具。 * **多模态数据集成**:除了结构化数据外,用户还需要集成视频、图像、二进制文件、结构化和非结构化文本数据。 但是,现有的数据集成工具主要集中在结构化数据上。 * **同步场景复杂**:数据同步需要支持离线全量同步、离线增量同步、CDC、实时同步、全库同步等多种同步场景。 * **资源需求高**:现有的数据集成和数据同步工具往往需要大量的计算资源或JDBC连接资源来完成海量小表的实时同步。 这增加了企业的负担。 * **缺乏质量和监控**:数据集成和同步过程经常会出现数据丢失或重复的情况。 同步过程缺乏监控,无法直观了解任务过程中数据的真实情况。 * **技术栈复杂**:企业使用的技术组件不同,用户需要针对不同组件开发相应的同步程序来完成数据集成。 * **管理和维护困难**:受限于底层技术组件(Flink/Spark)不同,离线同步和实时同步往往需要分开开发和管理,增加了管理和维护的难度。 ## SeaTunnel 相关特性 * **丰富且可扩展的Connector**:SeaTunnel提供了不依赖于特定执行引擎的Connector API。 基于该API开发的Connector(Source、Transform、Sink)可以运行在很多不同的引擎上,例如目前支持的SeaTunnel引擎(Zeta)、Flink、Spark等。 * **Connector插件**:插件式设计让用户可以轻松开发自己的Connector并将其集成到SeaTunnel项目中。 目前,SeaTunnel 支持超过 100 个连接器,并且数量正在激增。 * **批流集成**:基于SeaTunnel Connector API开发的Connector完美兼容离线同步、实时同步、全量同步、增量同步等场景。 它们大大降低了管理数据集成任务的难度。 * **分布式快照**:支持分布式快照算法,保证数据一致性。 * **多引擎支持**:SeaTunnel默认使用SeaTunnel引擎(Zeta)进行数据同步。 SeaTunnel还支持使用Flink或Spark作为Connector的执行引擎,以适应企业现有的技术组件。 SeaTunnel 支持 Spark 和 Flink 的多个版本。 * **JDBC复用、数据库日志多表解析**:SeaTunnel支持多表或全库同步,解决了过度JDBC连接的问题; 
支持多表或全库日志读取解析,解决了CDC多表同步场景下需要处理日志重复读取解析的问题。 * **高吞吐量、低延迟**:SeaTunnel支持并行读写,提供稳定可靠、高吞吐量、低延迟的数据同步能力。 * **完善的实时监控**:SeaTunnel支持数据同步过程中每一步的详细监控信息,让用户轻松了解同步任务读写的数据数量、数据大小、QPS等信息。 * **支持两种作业开发方法**:编码和画布设计。 SeaTunnel Web 项目 https://github.com/apache/seatunnel-web 提供作业、调度、运行和监控功能的可视化管理。 ## SeaTunnel 工作流图 ![SeaTunnel Work Flowchart](../../images/architecture_diagram.png) SeaTunnel的运行流程如上图所示。 用户配置作业信息并选择提交作业的执行引擎。 Source Connector负责并行读取数据并将数据发送到下游Transform或直接发送到Sink,Sink将数据写入目的地。 值得注意的是,Source、Transform 和 Sink 可以很容易地自行开发和扩展。 SeaTunnel 是一个 EtL(T) 数据集成工具。 因此,在SeaTunnel中,transform(t)只能用于对数据进行一些简单的转换,例如将一列的数据转换为大写或小写,更改列名,或者将一列拆分为多列。 SeaTunnel 使用的默认引擎是 [SeaTunnel Zeta Engine](../engines/zeta/about.md)。 如果您选择使用Flink或Spark引擎,SeaTunnel会将Connector打包成Flink或Spark程序并提交给Flink或Spark运行。 ## 连接器 - **源连接器** SeaTunnel 支持从各种关系、图形、NoSQL、文档和内存数据库读取数据; 分布式文件系统,例如HDFS; 以及各种云存储解决方案,例如S3和OSS。 我们还支持很多常见SaaS服务的数据读取。 您可以在[此处] 访问详细列表。 如果您愿意,您可以开发自己的源连接器并将其轻松集成到 SeaTunnel 中。 - **转换连接器** 如果源和接收器之间的架构不同,您可以使用转换连接器更改从源读取的架构,使其与接收器架构相同。 - **Sink Connector** SeaTunnel 支持将数据写入各种关系型、图形、NoSQL、文档和内存数据库; 分布式文件系统,例如HDFS; 以及各种云存储解决方案,例如S3和OSS。 我们还支持将数据写入许多常见的 SaaS 服务。 您可以在[此处]访问详细列表。 如果您愿意,您可以开发自己的 Sink 连接器并轻松将其集成到 SeaTunnel 中。 ## 谁在使用 SeaTunnel SeaTunnel 拥有大量用户。 您可以在[用户](https://seatunnel.apache.org/user)中找到有关他们的更多信息. ## 展望



      

    SeaTunnel 丰富了CNCF 云原生景观

    ## 了解更多 您可以参阅[快速入门](../getting-started/locally/deployment.md) 了解后续相关步骤。 ================================================ FILE: docs/zh/introduction/concepts/config.md ================================================ # 配置文件简介 在SeaTunnel中,最重要的事情就是配置文件,用户可以通过它自定义自己的数据同步需求,以发挥SeaTunnel最大的潜力。那么接下来我将会向你介绍如何设置配置文件。 配置文件的主要格式是 `hocon`, 有关该格式类型的更多信息你可以参考[HOCON-GUIDE](https://github.com/lightbend/config/blob/main/HOCON.md), 顺便提一下,我们也支持 `json`格式,但你应该知道配置文件的名称应该是以 `.json`结尾。 我们同时也支持 `SQL` 格式的配置文件,详细可以参考[SQL配置文件](../configuration/sql-config.md)。 ## 例子 在你阅读之前,你可以在发布包中的config目录[这里](https://github.com/apache/seatunnel/tree/dev/config)找到配置文件的例子。 ## 配置文件结构 配置文件类似下面这个例子: :::caution 警告 旧的配置名称 `result_table_name`/`source_table_name` 已经过时,请尽快迁移到新名称 `plugin_output`/`plugin_input`。 ::: ### hocon ```hocon env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { name = "string" age = "int" card = "int" } } } } transform { Filter { plugin_input = "fake" plugin_output = "fake1" fields = [name, card] } } sink { Clickhouse { host = "clickhouse:8123" database = "default" table = "seatunnel_console" fields = ["name", "card"] username = "default" password = "" plugin_input = "fake1" } } ``` 正如你看到的,配置文件包括几个部分:env, source, transform, sink。不同的模块具有不同的功能。 当你了解了这些模块后,你就会懂得SeaTunnel到底是如何工作的。 ### env 用于添加引擎可选的参数,不管是什么引擎(Zeta、Spark 或者 Flink),对应的可选参数应该在这里填写。 注意,我们按照引擎分离了参数,对于公共参数我们可以像以前一样配置。对于Flink和Spark引擎,其参数的具体配置规则可以参考[JobEnvConfig](../configuration/JobEnvConfig.md)。 ### source source用于定义SeaTunnel在哪儿检索数据,并将检索的数据用于下一步。 可以同时定义多个source。目前支持的source请看[Source of SeaTunnel](../connectors/source)。每种source都有自己特定的参数用来 定义如何检索数据,SeaTunnel也抽象了每种source所使用的参数,例如 `plugin_output` 参数,用于指定当前source生成的数据的名称, 方便后续其他模块使用。 ### transform 当我们有了数据源之后,我们可能需要对数据进行进一步的处理,所以我们就有了transform模块。当然,这里使用了“可能”这个词, 这意味着我们也可以直接将transform视为不存在,直接从source到sink,像下面这样: ```hocon env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { name = "string" age
= "int" card = "int" } } } } sink { Clickhouse { host = "clickhouse:8123" database = "default" table = "seatunnel_console" fields = ["name", "age", "card"] username = "default" password = "" plugin_input = "fake" } } ``` 与source类似, transform也有属于每个模块的特定参数。目前支持的transform请看 [Transform V2 of SeaTunnel](../transform-v2) ### sink 我们使用SeaTunnel的作用是将数据从一个地方同步到其它地方,所以定义数据如何写入,写入到哪里是至关重要的。通过SeaTunnel提供的 sink模块,你可以快速高效地完成这个操作。Sink和source非常相似,区别在于读取和写入。所以去看看我们[Sink of SeaTunnel](../connectors/sink)吧。 ### 其它 你会疑惑当定义了多个source和多个sink时,每个sink读取哪些数据,每个transform读取哪些数据?我们使用`plugin_output` 和 `plugin_input` 两个配置。每个source模块都会配置一个`plugin_output`来指示数据源生成的数据源名称,其它transform和sink 模块可以使用`plugin_input` 引用相应的数据源名称,表示要读取数据进行处理。然后transform,作为一个中间的处理模块,可以同时使用 `plugin_output` 和 `plugin_input` 配置。但你会发现在上面的配置例子中,不是每个模块都配置了这些参数,因为在SeaTunnel中, 有一个默认的约定,如果这两个参数没有配置,则使用上一个节点的最后一个模块生成的数据。当只有一个source时这是非常方便的。 ## 多行文本支持 `hocon`支持多行字符串,这样就可以包含较长的文本段落,而不必担心换行符或特殊格式。这可以通过将文本括在三层引号 **`"""`** 中来实现。例如: ``` var = """ Apache SeaTunnel is a next-generation high-performance, distributed, massive data integration tool.
""" sql = """ select * from "table" """ ``` ## Json格式支持 在编写配置文件之前,请确保配置文件的名称应以 `.json` 结尾。 ```json { "env": { "job.mode": "batch" }, "source": [ { "plugin_name": "FakeSource", "plugin_output": "fake", "row.num": 100, "schema": { "fields": { "name": "string", "age": "int", "card": "int" } } } ], "transform": [ { "plugin_name": "Filter", "plugin_input": "fake", "plugin_output": "fake1", "fields": ["name", "card"] } ], "sink": [ { "plugin_name": "Clickhouse", "host": "clickhouse:8123", "database": "default", "table": "seatunnel_console", "fields": ["name", "card"], "username": "default", "password": "", "plugin_input": "fake1" } ] } ``` ## 配置变量替换 在配置文件中,我们可以定义一些变量并在运行时替换它们。但是注意仅支持 hocon 格式的文件。 变量使用方法: - `${varName}`,如果变量未传值,则抛出异常。 - `${varName:default}`,如果变量未传值,则使用默认值。如果设置默认值则变量需要写在双引号中。 - `${varName:}`,如果变量未传值,则使用空字符串。 如果您不通过`-i`设置变量值,也可以通过设置系统的环境变量传值,变量替换支持通过环境变量获取变量值。 例如,您可以在shell脚本中设置环境变量如下: ```shell export varName="value with space" ``` 然后您可以在配置文件中使用变量。 如果您在配置文件中设置了没有默认值的变量,但在执行过程中未传递该变量,则会保留该变量值,系统不会抛出异常。但请您需要确保其他流程能够正确解析该变量值。例如,ElasticSearch的索引需要支持`${xxx}`这样的格式来动态指定索引。若其他流程不支持,程序可能无法正常运行。 具体样例: ```hocon env { job.mode = "BATCH" job.name = ${jobName} parallelism = 2 } source { FakeSource { plugin_output = "${resName:fake_test}_table" row.num = "${rowNum:50}" string.template = ${strTemplate} int.template = [20, 21] schema = { fields { name = "${nameType:string}" age = ${ageType} } } } } transform { sql { plugin_input = "${resName:fake_test}_table" plugin_output = "sql" query = "select * from ${resName:fake_test}_table where name = '${nameVal}' " } } sink { Console { plugin_input = "sql" username = ${username} password = ${password} } } ``` 在上述配置中,我们定义了一些变量,如 ${rowNum}、${resName}。 我们可以使用以下 shell 命令替换这些参数: ```shell ./bin/seatunnel.sh -c -i jobName='this_is_a_job_name' -i strTemplate=['abc','d~f','hi'] -i ageType=int -i nameVal=abc -i username=seatunnel=2.3.1 -i password='$a^b%c.d~e0*9(' -m local ``` 其中 `resName`,`rowNum`,`nameType` 我们未设置,他将获取默认值 然后最终提交的配置是: 
```hocon env { job.mode = "BATCH" job.name = "this_is_a_job_name" parallelism = 2 } source { FakeSource { plugin_output = "fake_test_table" row.num = 50 string.template = ['abc','d~f','hi'] int.template = [20, 21] schema = { fields { name = "string" age = "int" } } } } transform { sql { plugin_input = "fake_test_table" plugin_output = "sql" query = "select * from dual where name = 'abc' " } } sink { Console { plugin_input = "sql" username = "seatunnel=2.3.1" password = "$a^b%c.d~e0*9(" } } ``` 一些注意事项: - 如果值包含特殊字符,如`(`,请使用`'`引号将其括起来。 - 如果替换变量包含`"`或`'`(如`"resName"`和`"nameVal"`),需要添加`"`。 - 值不能包含空格`' '`。例如, `-i jobName='this is a job name'`将被替换为`job.name = "this"`。 你可以使用环境变量传递带有空格的值。 - 如果要使用动态参数,可以使用以下格式: `-i date=$(date +"%Y%m%d")`。 - 不能使用指定系统保留字符,它将不会被`-i`替换,如:`${database_name}`、`${schema_name}`、`${table_name}`、`${schema_full_name}`、`${table_full_name}`、`${primary_key}`、`${unique_key}`、`${field_names}`、`${partition_keys}`。具体可参考[Sink参数占位符](../configuration/sink-options-placeholders.md) ## 此外 如果你想了解更多关于格式配置的详细信息,请查看 [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md)。 ================================================ FILE: docs/zh/introduction/concepts/connector-v2-features.md ================================================ # Connector V2 功能简介 ## Connector V2 和 V1 之间的不同 从 https://github.com/apache/seatunnel/issues/1608 我们添加了 Connector V2 特性。 Connector V2 是基于SeaTunnel Connector API接口定义的连接器。不像Connector V1, V2 支持如下特性: * **多引擎支持** SeaTunnel Connector API 是引擎独立的API。基于这个API开发的连接器可以在多个引擎上运行。目前支持Flink和Spark引擎,后续我们会支持其它的引擎。 * **多引擎版本支持** 通过翻译层将连接器与引擎解耦,解决了大多数连接器需要修改代码才能支持新版本底层引擎的问题。 * **流批一体** Connector V2 可以支持批处理和流处理。我们不需要为批和流分别开发连接器。 * **多路复用JDBC/Log连接。** Connector V2支持JDBC资源复用和共享数据库日志解析。 * **多模态数据集成** Connector V2 支持多模态数据集成,包括结构化和非结构化文本数据、视频、图像、二进制文件等。 ## Source Connector 特性 Source connector有一些公共的核心特性,每个source connector在不同程度上支持它们。 ### 精确一次(exactly-once) 如果数据源中的每条数据仅由源向下游发送一次,我们认为该source connector支持精确一次(exactly-once)。 在SeaTunnel中, 我们可以保存读取的 **Split** 和它的 
**offset**(当时读取的数据被分割时的位置,例如行号, 字节大小, 偏移量等) 作为检查点时的 **StateSnapshot** 。 如果任务重新启动, 我们会得到最后的 **StateSnapshot** 然后定位到上次读取的 **Split** 和 **offset**,继续向下游发送数据。 例如 `File`, `Kafka`。 ### 列投影(column projection) 如果连接器支持仅从数据源读取指定列,我们认为它支持列投影(请注意,如果先读取所有列,然后通过元数据(schema)过滤不需要的列,则此方法不是真正的列投影)。 例如 `JDBCSource` 可以使用sql定义读取列。 `KafkaSource` 从主题中读取所有内容然后使用`schema`过滤不必要的列, 这不是真正的`列投影`。 ### 批(batch) 批处理作业模式,读取的数据是有界的,当所有数据读取完成后作业将停止。 ### 流(stream) 流式作业模式,数据读取无界,作业永不停止。 ### 并行性(parallelism) 并行执行的Source Connector支持配置 `parallelism`,每个并发会创建一个任务来读取数据。 在**Parallelism Source Connector**中,source会被分割成多个split,然后枚举器会将 split 分配给 SourceReader 进行处理。 ### 多模态(multimodal) 支持多模态数据集成,包括结构化和非结构化文本数据、视频、图像、二进制文件等。 ### 支持用户自定义split 用户可以配置分割规则。 ### 支持多表读取 支持在一个 SeaTunnel 作业中读取多个表。 ## Sink Connector 的特性 Sink connector有一些公共的核心特性,每个sink connector在不同程度上支持它们。 ### 精确一次(exactly-once) 当任意一条数据流入分布式系统时,如果系统在整个处理过程中仅准确处理任意一条数据一次,且处理结果正确,则认为系统满足精确一次一致性。 对于sink connector,如果任何数据只写入目标一次,则sink connector支持精确一次。 通常有两种方法可以实现这一目标: * 目标数据库支持key去重。例如 `MySQL`, `Kudu`。 * 目标支持 **XA 事务**(事务可以跨会话使用,即使创建事务的程序已经结束,新启动的程序也只需要知道最后一个事务的ID就可以重新提交或回滚事务)。 然后我们可以使用 **两阶段提交** 来确保 **精确一次**。 例如:`File`, `MySQL`。
### cdc(更改数据捕获,change data capture) 如果sink connector支持基于主键写入行类型(INSERT/UPDATE_BEFORE/UPDATE_AFTER/DELETE),我们认为它支持cdc(更改数据捕获,change data capture)。 ### 支持多表写入 支持在一个 SeaTunnel 作业中写入多个表,用户可以通过[配置占位符](../configuration/sink-options-placeholders.md)动态指定表的标识符。 ### 多模态(multimodal) 支持多模态数据集成,包括结构化和非结构化文本数据、视频、图像、二进制文件等。 ================================================ FILE: docs/zh/introduction/concepts/gravitino-type-mapping.md ================================================ # Gravitino 类型映射 本文档描述了使用 Apache Gravitino 作为元数据源时,Gravitino 与 SeaTunnel 之间的类型映射关系。类型转换由 `GravitinoTableSchemaConvertor` 处理。 ## 概述 当 SeaTunnel 从 Gravitino 读取表结构时,Gravitino 的列类型会自动转换为对应的 SeaTunnel 数据类型。这种映射使得 Gravitino 管理的元数据能够无缝集成到 SeaTunnel 的数据处理管道中。 ## 基础类型映射 | Gravitino 类型 | Gravitino JSON 表示 | SeaTunnel 类型 | SeaTunnel 类型关键字 | Java 类型 | 说明 | |:-----------------|:-------------------|:--------------------------------------|:-----------------|:---------------------------|:--------------------------| | Boolean | `boolean` | `BasicType.BOOLEAN_TYPE` | `boolean` | `java.lang.Boolean` | 布尔类型 | | Byte | `byte` | `BasicType.BYTE_TYPE` | `tinyint` | `java.lang.Byte` | 1字节整数 | | Unsigned Byte | `byte unsigned` | `BasicType.BYTE_TYPE` | `tinyint` | `java.lang.Byte` | 无符号字节(unsigned标志被忽略) | | Short | `short` | `BasicType.SHORT_TYPE` | `smallint` | `java.lang.Short` | 2字节整数 | | Unsigned Short | `short unsigned` | `BasicType.SHORT_TYPE` | `smallint` | `java.lang.Short` | 无符号短整型(unsigned标志被忽略) | | Integer | `integer` | `BasicType.INT_TYPE` | `int` | `java.lang.Integer` | 4字节整数 | | Unsigned Integer | `integer unsigned` | `BasicType.INT_TYPE` | `int` | `java.lang.Integer` | 无符号整型(unsigned标志被忽略) | | Long | `long` | `BasicType.LONG_TYPE` | `bigint` | `java.lang.Long` | 8字节整数 | | Unsigned Long | `long unsigned` | `BasicType.LONG_TYPE` | `bigint` | `java.lang.Long` | 无符号长整型(unsigned标志被忽略) | | Float | `float` | `BasicType.FLOAT_TYPE` | `float` | `java.lang.Float` | 单精度浮点数 | | Double | `double` | `BasicType.DOUBLE_TYPE` |
`double` | `java.lang.Double` | 双精度浮点数 | | Decimal | `decimal(p, s)` | `DecimalType(p, s)` | `"decimal(p,s)"` | `java.math.BigDecimal` | 精度: 1-38, 小数位: 0-精度 | | String | `string` | `BasicType.STRING_TYPE` | `string` | `java.lang.String` | 变长字符串 | | FixedChar | `char(l)` | `BasicType.STRING_TYPE` | `string` | `java.lang.String` | 定长字符串,长度存储在columnLength | | VarChar | `varchar(l)` | `BasicType.STRING_TYPE` | `string` | `java.lang.String` | 变长字符串,最大长度存储在columnLength | | UUID | `uuid` | `BasicType.STRING_TYPE` | `string` | `java.lang.String` | 通用唯一标识符 | | Date | `date` | `LocalTimeType.LOCAL_DATE_TYPE` | `date` | `java.time.LocalDate` | 日期(不含时间) | | Time | `time` | `LocalTimeType.LOCAL_TIME_TYPE` | `time` | `java.time.LocalTime` | 时间(不含日期) | | Timestamp | `timestamp(p)` | `LocalTimeType.LOCAL_DATE_TIME_TYPE` | `timestamp` | `java.time.LocalDateTime` | 不带时区的时间戳,p=0-12 | | TimestampTz | `timestamp_tz(p)` | `LocalTimeType.OFFSET_DATE_TIME_TYPE` | `timestamp_tz` | `java.time.OffsetDateTime` | 带时区的时间戳,p=0-12 | | Binary | `binary` | `PrimitiveByteArrayType.INSTANCE` | `bytes` | `byte[]` | 变长二进制数据 | | Fixed | `fixed(l)` | `PrimitiveByteArrayType.INSTANCE` | `bytes` | `byte[]` | 定长二进制数据 | | IntervalYear | `interval_year` | `BasicType.STRING_TYPE` | `string` | `java.lang.String` | 年-月间隔 | | IntervalDay | `interval_day` | `BasicType.STRING_TYPE` | `string` | `java.lang.String` | 日-时间隔 | ## 复杂类型映射 | Gravitino 类型 | Gravitino JSON 表示 | SeaTunnel 类型 | SeaTunnel 类型关键字 | 说明 | |:-------------|:------------------------------------------------------------------------------------|:------------------------|:------------------------------------|:--------------------------| | List | `{"type": "list", "elementType": type, "containsNull": boolean}` | `ArrayType` | `"array"` | T为元素类型 | | Map | `{"type": "map", "keyType": type, "valueType": type, "valueContainsNull": boolean}` | `MapType` | `"map"` | K为键类型,V为值类型 | | Struct | `{"type": "struct", "fields": [...]}` | `SeaTunnelRowType` | 
`{field1=type1, field2=type2, ...}` | 嵌套行类型 | | External | `{"type": "external", "catalogString": "user-defined"}` | `BasicType.STRING_TYPE` | `string` | 不支持的类型(如PostgreSQL的jsonb) | | Union | `{"type": "union", "types": [...]}` | 不支持 | - | 抛出转换错误 | ## 类型参数提取 转换器会提取类型参数作为列元数据: | 类型 | 参数 | 提取为 | 说明 | |:------------------|:-----------------|:------------------------------------|:------------| | `decimal(p, s)` | precision, scale | columnLength=precision, scale=scale | 两个值都会存储 | | `varchar(l)` | length | columnLength=length | 字符串最大长度 | | `char(l)` | length | columnLength=length | 定长字符串长度 | | `fixed(l)` | length | columnLength=length | 定长二进制长度 | | `timestamp(p)` | precision | columnLength=precision | 小数秒精度(0-12) | | `timestamp_tz(p)` | precision | columnLength=precision | 小数秒精度(0-12) | ## 索引和约束映射 Gravitino 索引映射到 SeaTunnel 约束: | Gravitino 索引类型 | SeaTunnel 约束类型 | 说明 | |:---------------|:---------------------------|:--------------------| | `PRIMARY_KEY` | `PrimaryKey` | 从 fieldNames 数组提取列名 | | `UNIQUE_KEY` | `ConstraintKey.UNIQUE_KEY` | 列排序顺序默认为 ASC | ## 注意事项和限制 1. **大小写不敏感**:类型匹配不区分大小写。`BOOLEAN`、`boolean` 和 `Boolean` 被视为相同。 2. **无符号类型**:数值类型的 `unsigned` 修饰符会被识别,但不影响转换后的 SeaTunnel 类型。SeaTunnel 内部使用有符号类型。 3. **外部类型**:当 Gravitino 遇到无法解析的类型(如 PostgreSQL 的 `jsonb`)时,会将其表示为 `external` 类型。SeaTunnel 会将其转换为 `string` 类型。 4. **联合类型**:Gravitino 的 `union` 类型目前不支持,会抛出转换错误。 5. **可空性**:Gravitino 列定义中的 `nullable` 属性会保留在 SeaTunnel `Column` 元数据中。 6. 
**Decimal 参数**:`decimal` 类型必须同时指定精度和小数位参数。没有参数或格式无效的 decimal 值会抛出错误。 ## 相关文档 - [Gravitino 列类型](https://gravitino.apache.org/docs/1.1.0/manage-relational-metadata-using-gravitino/#apache-gravitino-table-column-type) - [Schema 特性](./schema-feature.md) - [SeaTunnel 数据类型](../common-options.md) ================================================ FILE: docs/zh/introduction/concepts/incompatible-changes.md ================================================ # 不向前兼容的更新 本文档记录了各版本之间不兼容的更新内容。在升级到相关版本前,请检查本文档。 ## dev ### API 变更 - **破坏性变更:Engine REST 表级指标 key 格式变化** - **影响范围**:SeaTunnel Engine REST API(`/job-info` 返回的 job metrics 中的表级指标) - **变更说明**:为支持多个 Source/Sink/Transform 同时处理同一张表,表级指标的 key 格式从 `{tableName}` 变更为 `{VertexIdentifier}.{tableName}`(例如 `Sink[0].fake.user_table`)。 - **影响**:依赖旧 key 的 Grafana 仪表盘、Prometheus 告警规则以及自定义监控解析逻辑需要同步修改,否则升级后会出现指标查询/告警静默失效。 **变更前** ```json { "TableSinkWriteCount": { "fake.user_table": "15" } } ``` **变更后** ```json { "TableSinkWriteCount": { "Sink[0].fake.user_table": "10", "Sink[1].fake.user_table": "5" } } ``` ### 配置变更 ### 连接器变更 ### 转换变更 - **[BREAKING]** SQL Transform 的 `PARSEDATETIME`、`TO_DATE` 和 `IS_DATE` 函数现在只接受白名单中的日期时间格式模式。以前接受的自定义格式模式现在将在运行时失败。支持的模式有: - DateTime: `yyyy-MM-dd HH:mm:ss`, `yyyy-MM-dd HH:mm:ss.SSS`, `yyyy-MM-dd'T'HH:mm:ss`, `yyyy-MM-dd'T'HH:mm:ss.SSS`, `yyyy/MM/dd HH:mm:ss`, `yyyy/MM/dd HH:mm:ss.SSS`, `yyyyMMddHHmmss` - Date: `yyyy-MM-dd`, `yyyy/MM/dd`, `yyyyMMdd` - Time: `HH:mm:ss`, `HH:mm:ss.SSS`, `HHmmss` **异常类型变更**: 无效的日期时间格式模式现在会抛出 `SeaTunnelRuntimeException` 而不是 `TransformException`。如果您的错误处理或监控系统捕获 `TransformException` 来处理日期时间解析错误,您需要更新它们以处理 `SeaTunnelRuntimeException`。 **迁移指南**: 如果您在 `PARSEDATETIME`、`TO_DATE` 或 `IS_DATE` 函数中使用自定义日期时间格式模式,您必须更新查询以使用上述支持的模式之一。如果您的数据使用不同的格式,您可能需要预处理输入数据以匹配支持的格式,或使用字符串操作函数在解析之前转换格式。 - DataValidator 转换:当 `row_error_handle_way = ROUTE_TO_TABLE` 时,路由到错误表的行 `table_id` 现在会携带上游的 database/schema 前缀(例如从 `ffp` 变为 `db1.ffp` / `db1.schema1.ffp`)。 ### 引擎行为变更 ### 依赖升级 
================================================ FILE: docs/zh/introduction/concepts/schema-feature.md ================================================ # Schema 特性简介 ## 为什么我们需要Schema 某些NoSQL数据库或消息队列没有严格限制schema,因此无法通过api获取schema。 这时需要定义一个schema来转换为TableSchema并获取数据。 ## SchemaOptions 我们可以使用SchemaOptions定义schema, SchemaOptions包含了一些定义schema的配置。 例如:columns, primaryKey, constraintKeys。 ``` schema = { table = "database.schema.table" schema_first = false comment = "comment" partition_keys = ["dt"] columns = [ ... ] primaryKey { ... } constraintKeys { ... } } ``` ### table schema所属的表标识符的表全名,包含数据库、schema、表名。 例如 `database.schema.table`、`database.table`、`table`。 ### schema_url 通过restApi获取元数据信息的http url,比如:`http://localhost:8090/api/metalakes/laowang_test/catalogs/221-pgsql/schemas/ykw/tables/all_type` > 当使用 Gravitino 作为元数据源时,Gravitino 的列类型会自动转换为 SeaTunnel 数据类型。详细的类型映射信息请参考 [Gravitino 类型映射](./gravitino-type-mapping.md)。 #### schema_url 配置示例 **1. 单表配置,包含 table 和 schema_url 属性:** ```hocon source { LocalFile { path = "/tmp/data" file_format_type = "json" schema { table = "db.table2" schema_url = "http://gravitino:8090/api/metalakes/test_metalake/catalogs/test_catalog/schemas/test_schema/tables/table2" } } } ``` **2. 单表配置,仅使用 schema_url(不包含 table 属性):** ```hocon source { LocalFile { path = "/tmp/data" file_format_type = "json" schema { schema_url = "http://gravitino:8090/api/metalakes/test_metalake/catalogs/test_catalog/schemas/test_schema/tables/table2" } } } ``` **3. 
多表配置,包含 columns 和 schema_url:** ```hocon source { LocalFile { tables_configs = [ { path = "/tmp/data/table1" file_format_type = "json" schema { table = "db.table1" columns = [ { name = id, type = bigint, nullable = false }, { name = name, type = string }, { name = age, type = int } ] } }, { path = "/tmp/data/table2" file_format_type = "json" schema { table = "db.table2" schema_url = "http://gravitino:8090/api/metalakes/test_metalake/catalogs/test_catalog/schemas/test_schema/tables/table2" } } ] } } ``` ### schema_first 默认是false。 如果schema_first是true, schema会优先使用, 这意味着如果我们设置 `table = "a.b"`, `a` 会被解析为schema而不是数据库, 那么我们可以支持写入 `table = "schema.table"`. ### comment schema所属的 CatalogTable 的注释。 ### partition_keys schema 所属的 CatalogTable 的分区字段列表。 该元数据可以配合 sink 端占位符 `${partition_keys}` 使用(例如多表同步写入 Iceberg 时按表创建分区表)。 ### Columns Columns 是用于定义模式中的列的配置列表,每列可以包含名称(name)、类型(type)、是否可空(nullable)、默认值(defaultValue)、注释(comment)字段。 ``` columns = [ { name = id type = bigint nullable = false columnLength = 20 defaultValue = 0 comment = "primary key id" } ] ``` | 字段 | 是否必须 | 默认值 | 描述 | |:-------------|:-----|:-----|--------------------| | name | Yes | - | 列的名称 | | type | Yes | - | 列的数据类型 | | nullable | No | true | 列是否可空 | | columnLength | No | 0 | 列的长度,当您需要定义长度时将很有用 | | columnScale | No | - | 列的精度,当您需要定义精度时将很有用 | | defaultValue | No | null | 列的默认值 | | comment | No | null | 列的注释 | #### 目前支持哪些类型 | 数据类型 | Java中的值类型 | 描述 | |:-------------|:---------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | string | `java.lang.String` | 字符串 | | boolean | `java.lang.Boolean` | 布尔 | | tinyint | `java.lang.Byte` | 常规-128 至 127 。 0 到 255 无符号*。 指定括号中的最大位数。 | | smallint | `java.lang.Short` | 常规-32768 至 32767。 
0 到 65535 无符号*。 指定括号中的最大位数。 | | int | `java.lang.Integer` | 允许从 -2,147,483,648 到 2,147,483,647 的所有数字。 | | bigint | `java.lang.Long` | 允许 -9,223,372,036,854,775,808 和 9,223,372,036,854,775,807 之间的所有数字。 | | float | `java.lang.Float` | 从-1.79E+308 到 1.79E+308浮点精度数值数据。 | | double | `java.lang.Double` | 双精度浮点。 处理大多数小数。 | | decimal | `java.math.BigDecimal` | Double 类型存储为字符串,允许固定小数点。 | | null | `java.lang.Void` | null | | bytes | `byte[]` | 字节。 | | date | `java.time.LocalDate` | 仅存储日期。从0001年1月1日到9999 年 12 月 31 日。 | | time | `java.time.LocalTime` | 仅存储时间。精度为 100 纳秒。 | | timestamp | `java.time.LocalDateTime` | 存储不带时区的日期和时间信息,表示事件发生的本地时间。不包含任何偏移量或时区相关信息。 | | timestamp_tz | `java.time.OffsetDateTime` | 存储带有 UTC 偏移量的日期和时间信息,包含本地日期时间和 UTC 偏移量。在处理多时区场景时,可以提供更精确的时间信息。 | | row | `org.apache.seatunnel.api.table.type.SeaTunnelRowType` | 行类型,可以嵌套。 | | map | `java.util.Map` | Map 是将键映射到值的对象。 键类型包括: `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double` `decimal` `date` `time` `timestamp` `null` , and the value type includes `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double` `decimal` `date` `time` `timestamp` `null` `array` `map` `row`. | | array | `ValueType[]` | 数组是一种表示元素集合的数据类型。 元素类型包括: `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double`. 
| #### 如何声明支持的类型 SeaTunnel 提供了一种简单直接的方式来声明基本类型。基本类型的关键字包括:`string`, `boolean`, `tinyint`, `smallint`, `int`, `bigint`, `float`, `double`, `date`, `time`, `timestamp`, 和 `null`。基本类型的关键字名称可以直接用作类型声明,并且SeaTunnel对类型关键字不区分大小写。 例如,如果您需要声明一个整数类型的字段,您可以简单地将字段定义为`int`或`"int"`。 > null 类型声明必须用双引号引起来, 例如:`"null"`。 这种方法有助于避免与 [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md) 中表示未定义的对象的 `null` 类型混淆。 声明复杂类型(例如 **decimal**、**array**、**map** 和 **row**)时,请注意具体注意事项。 - 声明decimal类型时,需要设置精度(precision)和小数位数(scale),类型定义遵循“decimal(precision, scale)”格式。 需要强调的是,十进制类型的声明必须用 `"` 括起来;不能像基本类型一样直接使用类型名称。例如,当声明精度为 10、小数位数为 2 的十进制字段时,您可以指定字段类型为`"decimal(10,2)"`。 - 声明array类型时,需要指定元素类型,类型定义遵循 `array<T>` 格式,其中 `T` 代表元素类型。元素类型包括`int`,`string`,`boolean`,`tinyint`,`smallint`,`bigint`,`float` 和 `double`。与十进制类型声明类似,它也用 `"` 括起来。例如,在声明具有整数数组的字段时,将字段类型指定为 `"array<int>"`。 - 声明map类型时,需要指定键和值类型。map类型定义遵循`map<K,V>`格式,其中`K`表示键类型,`V`表示值类型。 `K`可以是任何基本类型和十进制类型,`V`可以是 SeaTunnel 支持的任何类型。 与之前的类型声明类似,map类型声明必须用双引号引起来。 例如,当声明一个map类型的字段时,键类型为字符串,值类型为整数,则可以将该字段声明为`"map<string, int>"`。 - 声明row类型时,需要定义一个 [HOCON](https://github.com/lightbend/config/blob/main/HOCON.md) 对象来描述字段及其类型。 字段类型可以是 SeaTunnel 支持的任何类型。 例如,当声明包含整数字段“a”和字符串字段“b”的行类型时,可以将其声明为“{a = int, b = string}”。 将定义作为字符串括在 `"` 中也是可以接受的,因此 `"{a = int, b = string}"` 相当于 `{a = int, b = string}`。由于 HOCON 与 JSON 兼容, `"{\"a\":\"int\", \"b\":\"string\"}"` 等价于 `"{a = int, b = string}"`。 以下是复杂类型声明的示例: ```hocon schema { fields { c_decimal = "decimal(10, 2)" c_array = "array<int>" c_row = { c_int = int c_string = string c_row = { c_int = int } } # 在泛型中Hocon风格声明行类型 map0 = "map<string, {c_int = int, c_string = string, c_row = {c_int = int}}>" # 在泛型中Json风格声明行类型 map1 = "map<string, {\"c_int\":\"int\", \"c_string\":\"string\", \"c_row\":{\"c_int\":\"int\"}}>" } } ``` ### 主键(PrimaryKey) 主键是用于定义模式中主键的配置,它包含name、columns字段。 ``` primaryKey { name = id columns = [id] } ``` | 字段 | 是否必须 | 默认值 | 描述 | |:--------|:-----|:----|---------| | name | 是 | - | 主键名称 | | columns | 是 | - | 主键中的列列表 | ### 约束键(constraintKeys) 约束键是用于定义模式中约束键的配置列表,它包含constraintName,constraintType,constraintColumns字段。 ``` constraintKeys = [ { constraintName = "id_index" constraintType = KEY
constraintColumns = [ { columnName = "id" sortType = ASC } ] }, ] ``` | 字段 | 是否必须 | 默认值 | 描述 | |:------------------|:-----|:----|------------------------------------------------------------------------| | constraintName | 是 | - | 约束键的名称 | | constraintType | 否 | KEY | 约束键的类型 | | constraintColumns | 是 | - | PrimaryKey中的列列表,每列应包含constraintType和sortType,sortType支持ASC和DESC,默认为ASC | #### 目前支持哪些约束类型 | 约束类型 | 描述 | |:-----------|:----| | INDEX_KEY | 键 | | UNIQUE_KEY | 唯一键 | ## 多表Schema ``` tables_configs = [ { schema { table = "database.schema.table1" schema_first = false comment = "comment" columns = [ ... ] primaryKey { ... } constraintKeys { ... } } }, { schema = { table = "database.schema.table2" schema_first = false comment = "comment" columns = [ ... ] primaryKey { ... } constraintKeys { ... } } } ] ``` ## 如何使用schema ### 推荐 ``` source { FakeSource { parallelism = 2 plugin_output = "fake" row.num = 16 schema { table = "FakeDatabase.FakeTable" columns = [ { name = id type = bigint nullable = false defaultValue = 0 comment = "primary key id" }, { name = name type = "string" nullable = true comment = "name" }, { name = age type = int nullable = true comment = "age" } ] primaryKey { name = "id" columnNames = [id] } constraintKeys = [ { constraintName = "unique_name" constraintType = UNIQUE_KEY constraintColumns = [ { columnName = "name" sortType = ASC } ] }, ] } } } ``` ### 已弃用 如果你只需要定义列,你可以使用字段来定义列,这是一种简单的方式,但将来会被删除。 ``` source { FakeSource { parallelism = 2 plugin_output = "fake" row.num = 16 schema = { fields { id = bigint c_map = "map" c_array = "array" c_string = string c_boolean = boolean c_tinyint = tinyint c_smallint = smallint c_int = int c_bigint = bigint c_float = float c_double = double c_decimal = "decimal(2, 1)" c_bytes = bytes c_date = date c_timestamp = timestamp } } } } ``` ## 我们什么时候应该使用它,什么时候不应该使用它 如果选项中有`schema`配置项目,则连接器可以自定义schema。 比如 `Fake` `Pulsar` `Http` 源连接器等。 ================================================ FILE: 
docs/zh/introduction/configuration/JobEnvConfig.md ================================================ # JobEnvConfig 本文档描述了env的配置信息,公共参数可以在所有引擎中使用。为了更好地区分引擎参数,其他引擎的附加参数需要携带前缀。 在flink引擎中,我们使用`flink.`作为前缀。在spark引擎中,我们不使用任何前缀来修改参数,因为官方的spark参数本身就是以`spark.`开头。 ## 公共参数 以下配置参数对所有引擎通用: ### job.name 该参数配置任务名称。 ### jars 第三方包可以通过`jars`加载,例如:`jars="file://local/jar1.jar;file://local/jar2.jar"` ### job.mode 通过`job.mode`你可以配置任务是在批处理模式还是流处理模式。例如:`job.mode = "BATCH"` 或者 `job.mode = "STREAMING"` ### checkpoint.interval 获取定时调度检查点的时间间隔(毫秒)。 在`STREAMING`模式下,检查点是必须的,如果不设置,将从应用程序配置文件`seatunnel.yaml`中获取。 在`BATCH`模式下,您可以通过不设置此参数来禁用检查点。在Zeta `STREAMING`模式下,默认值为30000毫秒。 ### checkpoint.timeout 检查点的超时时间(毫秒)。如果检查点在超时之前没有完成,作业将失败。在Zeta中,默认值为30000毫秒。 ### parallelism 该参数配置source和sink的并行度。 ### shade.identifier 指定加密方式,如果您没有加密或解密配置文件的需求,此选项可以忽略。 更多详细信息,您可以参考文档 [Config Encryption Decryption](./config-encryption-decryption.md) ## Zeta 引擎参数 ### job.retry.times 用于控制作业失败时的默认重试次数。默认值为3,并且仅适用于Zeta引擎。 ### job.retry.interval.seconds 用于控制作业失败时的默认重试间隔。默认值为3秒,并且仅适用于Zeta引擎。 ### savemode.execute.location 此参数用于指定在Zeta引擎中执行作业时SaveMode执行的时机。 默认值为`CLUSTER`,这意味着SaveMode在作业提交到集群上之后在集群上执行。 当值为`CLIENT`时,SaveMode操作在作业提交的过程中执行,使用shell脚本提交作业时,该过程在提交作业的shell进程中执行。使用rest api提交作业时,该过程在http请求的处理线程中执行。 请尽量使用`CLUSTER`模式,因为当`CLUSTER`模式没有问题时,我们将删除`CLIENT`模式。 ## Flink 引擎参数 这里列出了一些与 Flink 中名称相对应的 SeaTunnel 参数名称,并非全部,更多内容请参考官方 [Flink Documentation](https://flink.apache.org/)。 | Flink 配置名称 | SeaTunnel 配置名称 | |---------------------------------|---------------------------------------| | pipeline.max-parallelism | flink.pipeline.max-parallelism | | execution.checkpointing.mode | flink.execution.checkpointing.mode | | execution.checkpointing.timeout | flink.execution.checkpointing.timeout | | ... | ... | ## Spark 引擎参数 由于Spark配置项并无调整,这里就不列出来了,请参考官方 [Spark Documentation](https://spark.apache.org/)。
================================================ FILE: docs/zh/introduction/configuration/config-encryption-decryption.md ================================================ # 配置文件加密和解密 ## 介绍 在大多数生产环境中,需要对敏感的配置项(如密码)进行加密,不能以明文形式存储。SeaTunnel 为此提供了一个方便的一站式解决方案。 ## 如何使用 SeaTunnel 具备Base64编码和解码的功能,但不建议在生产环境中使用,SeaTunnel 建议用户根据自身需求,实现个性化的加密和解密逻辑。您可以参考本章节[如何实现用户自定义的加密和解密](#如何实现用户自定义的加密和解密)以获取更多相关细节。 Base64编码默认支持加密以下参数: - username - password - auth - token - access_key - secret_key 用户也可以在 `shade.options` 指定要用于加解密的参数. 接下来,将展示如何快速使用 SeaTunnel 自带的 `base64` 加密功能: 1. 在配置文件的环境变量(env)部分新增了选项 `shade.identifier` 和 `shade.options`。`shade.identifier`用于表示您想要使用的加密方法,`shade.options`用于指定您想加解密的参数。 2. 在这个示例中,我们在配置文件中添加了 `shade.identifier = base64`,如下所示: ```hocon # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# env { parallelism = 1 shade.identifier = "base64" shade.options = ["username", "password", "f1", "config.f1", "config2.list"] } source { MySQL-CDC { plugin_output = "fake" parallelism = 1 server-id = 5656 port = 56725 hostname = "127.0.0.1" username = "seatunnel" password = "seatunnel_password" database-name = "inventory_vwyw0n" table-name = "products" url = "jdbc:mysql://localhost:56725" f1 = "seatunnel" # custom shade options config1.f1 = "seatunnel" config2.list = ["seatunnel", "seatunnel", "seatunnel"] } } transform { } sink { # 将数据输出到 Clickhouse。 Clickhouse { host = "localhost:8123" database = "default" table = "fake_all" username = "seatunnel" password = "seatunnel_password" # cdc options primary_key = "id" support_upsert = true } } ``` 3. 通过Shell脚本调用不同的计算引擎来对配置文件进行加密操作。在本示例中,我们使用 Zeta 引擎对配置文件进行加密。 ```shell ${SEATUNNEL_HOME}/bin/seatunnel.sh --config config/v2.batch.template --encrypt ``` 然后,您可以在终端中看到加密后的配置文件。 ```log 2023-02-20 17:50:58,319 INFO org.apache.seatunnel.core.starter.command.ConfEncryptCommand - Encrypt config: { "env" : { "parallelism" : 1, "shade.identifier" : "base64" }, "source" : [ { "url" : "jdbc:mysql://localhost:56725", "hostname" : "127.0.0.1", "password" : "c2VhdHVubmVsX3Bhc3N3b3Jk", "port" : 56725, "database-name" : "inventory_vwyw0n", "parallelism" : 1, "plugin_output" : "fake", "table-name" : "products", "plugin_name" : "MySQL-CDC", "server-id" : 5656, "username" : "c2VhdHVubmVs", "f1" : "c2VhdHVubmVs", "config1.f1" : "c2VhdHVubmVs", "config2.list" : ["c2VhdHVubmVs","c2VhdHVubmVs","c2VhdHVubmVs"] } ], "transform" : [], "sink" : [ { "database" : "default", "password" : "c2VhdHVubmVsX3Bhc3N3b3Jk", "support_upsert" : true, "host" : "localhost:8123", "plugin_name" : "Clickhouse", "primary_key" : "id", "table" : "fake_all", "username" : "c2VhdHVubmVs" } ] } ``` 4. 
当然,不仅支持加密配置文件,还支持对配置文件的解密。如果用户想要查看解密后的配置文件,可以执行以下命令: ```shell ${SEATUNNEL_HOME}/bin/seatunnel.sh --config config/v2.batch.template --decrypt ``` ## 如何实现用户自定义的加密和解密 如果您希望自定义加密方法和加密配置,本章节将帮助您解决问题。 1. 创建一个 java maven 项目 2. 在 maven 依赖中添加 `seatunnel-api` 模块,如下所示: ```xml <dependency> <groupId>org.apache.seatunnel</groupId> <artifactId>seatunnel-api</artifactId> <version>${seatunnel.version}</version> <scope>provided</scope> </dependency> ``` 3. 创建一个 java 类并实现 `ConfigShade` 接口,该接口包含以下方法: ```java /** * The interface that provides the ability to encrypt and decrypt {@link * org.apache.seatunnel.shade.com.typesafe.config.Config} */ public interface ConfigShade { /** * The unique identifier of the current interface, used it to select the correct {@link * ConfigShade} */ String getIdentifier(); /** * Encrypt the content * * @param content The content to encrypt */ String encrypt(String content); /** * Decrypt the content * * @param content The content to decrypt */ String decrypt(String content); /** To expand the options that user want to encrypt */ default String[] sensitiveOptions() { return new String[0]; } } ``` 4. 在 `resources/META-INF/services` 目录下创建名为 `org.apache.seatunnel.api.configuration.ConfigShade`的文件, 文件内容是您在步骤 3 中定义的类的完全限定类名。 5. 将其打成 jar 包, 并添加到 `${SEATUNNEL_HOME}/lib` 目录下。 6. 在配置文件中,将选项 `shade.identifier` 的值更改为上面 `ConfigShade#getIdentifier` 中定义的值。 ### 在加密解密方法中使用自定义参数 如果您想要使用自定义参数进行加密和解密,可以按照以下步骤操作: 1. 在配置文件的env 中添加`shade.properties`配置,该配置的值是键值对形式(键的类型必须是字符串),如下所示: ```hocon env { shade.properties = { suffix = "666" } } ``` 2. 覆写 `ConfigShade` 接口的 `open` 方法,如下所示: ```java public static class ConfigShadeWithProps implements ConfigShade { private String suffix; private String identifier = "withProps"; @Override public void open(Map<String, Object> props) { this.suffix = String.valueOf(props.get("suffix")); } } ``` 3. 
在加密和解密方法中使用open 方法中传入的参数,如下所示: ```java @Override public String encrypt(String content) { return content + suffix; } @Override public String decrypt(String content) { return content.substring(0, content.length() - suffix.length()); } ``` ================================================ FILE: docs/zh/introduction/configuration/metalake.md ================================================ # METALAKE 由于Seatunnel在执行任务时,需要将数据库用户名与密码等隐私信息明文写在脚本中,可能会导致信息泄露;并且维护较为困难,数据源信息发生变更时可能需要手动更改。 因此引入了metalake,将数据源的信息存储于Apache Gravitino等metalake中,任务脚本采用`sourceId`和占位符的方法来代替原本的用户名和密码等信息,运行时seatunnel-engine通过http请求从metalake获取信息,根据占位符进行替换。 若要使用metalake,首先要修改**seatunnel-env.sh**中的环境变量: * `METALAKE_ENABLED` * `METALAKE_TYPE` * `METALAKE_URL` 将`METALAKE_ENABLED`设为`true`,`METALAKE_TYPE`当前仅支持设为`gravitino`。 对于Apache Gravitino,`METALAKE_URL`设为 ``` http://host:port/api/metalakes/your_metalake_name/catalogs/ ``` --- ## 使用示例: 用户需要先在Gravitino中创建catalog,如 ```bash curl -L 'http://localhost:8090/api/metalakes/test_metalake/catalogs' -H 'Content-Type: application/json' -H 'Accept: application/vnd.gravitino.v1+json' -d '{ "name": "test_catalog", "type": "relational", "provider": "jdbc-mysql", "comment": "for metalake test", "properties": { "jdbc-driver": "com.mysql.cj.jdbc.Driver", "jdbc-url": "not used", "jdbc-user": "root", "jdbc-password": "Abc!@#135_seatunnel" } }' ``` 这样便在`test_metalake`中创建了一个`test_catalog`(`metalake`需要提前创建) 于是`METALAKE_URL`可以设为 ``` http://localhost:8090/api/metalakes/test_metalake/catalogs/ ``` source可以写为 ``` source { Jdbc { url = "jdbc:mysql://mysql-e2e:3306/seatunnel?useSSL=false&serverTimezone=UTC&allowPublicKeyRetrieval=true" driver = "${jdbc-driver}" connection_check_timeout_sec = 100 sourceId = "test_catalog" user = "${jdbc-user}" password = "${jdbc-password}" query = "select * from source" } } ``` 其中`sourceId`指代catalog的名称,从而其他项可以使用`${}`占位符,运行时会自动替换。注意,在sink中使用时,同样叫`sourceId`;使用占位符时必须以`${`开头,以`}`结尾,每一项最多只能包含一个占位符,占位符以外也可以有内容 ================================================ 
FILE: docs/zh/introduction/configuration/schema-evolution.md ================================================ # 模式演进 模式演进是指数据表的Schema可以改变,数据同步任务可以自动适应新的表结构的变化而无需其他操作。 ## 已支持的引擎 - Zeta ## 已支持的模式变更事件类型 - `ADD COLUMN` - `DROP COLUMN` - `RENAME COLUMN` - `MODIFY COLUMN` ## 已支持的连接器 ### 源 [Mysql-CDC](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/source/MySQL-CDC.md) [Oracle-CDC](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/source/Oracle-CDC.md) ### 目标 [Jdbc-Mysql](https://github.com/apache/seatunnel/blob/dev/docs/zh/connectors/sink/Jdbc.md) [Jdbc-Oracle](https://github.com/apache/seatunnel/blob/dev/docs/zh/connectors/sink/Jdbc.md) [Jdbc-Postgres](https://github.com/apache/seatunnel/blob/dev/docs/zh/connectors/sink/Jdbc.md) [Jdbc-Dameng](https://github.com/apache/seatunnel/blob/dev/docs/zh/connectors/sink/Jdbc.md) [Jdbc-SqlServer](https://github.com/apache/seatunnel/blob/dev/docs/en/connectors/sink/Jdbc.md) [StarRocks](https://github.com/apache/seatunnel/blob/dev/docs/zh/connectors/sink/StarRocks.md) [Doris](https://github.com/apache/seatunnel/blob/dev/docs/zh/connectors/sink/Doris.md) [Paimon](https://github.com/apache/seatunnel/blob/dev/docs/zh/connectors/sink/Paimon.md#模式演变) [Elasticsearch](https://github.com/apache/seatunnel/blob/dev/docs/zh/connectors/sink/Elasticsearch.md#模式演变) 注意: * 目前模式演进不支持transform。不同类型数据库(Oracle-CDC -> Jdbc-Mysql)的模式演进目前不支持ddl中列的默认值。 * 当你使用Oracle-CDC时,你不能使用用户名`SYS`或`SYSTEM`来修改表结构,否则ddl事件将被过滤,这可能导致模式演进不起作用; 另外,如果你的表名以`ORA_TEMP_`开头,也会有相同的问题。 * 早期版本的`达梦`数据库不支持将`Varchar`类型字段更改为`Text`类型字段。 ## 启用Schema evolution功能 在CDC源连接器中模式演进默认是关闭的。你需要在CDC连接器中配置`schema-changes.enabled = true`来启用它。 ## 示例 ### Mysql-CDC -> Jdbc-Mysql ``` env { # You can set engine configuration here parallelism = 5 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = 
["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { jdbc { url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" driver = "com.mysql.cj.jdbc.Driver" user = "st_user_sink" password = "mysqlpw" generate_sink_sql = true database = shop table = mysql_cdc_e2e_sink_table_with_schema_change_exactly_once primary_keys = ["id"] is_exactly_once = true xa_data_source_class_name = "com.mysql.cj.jdbc.MysqlXADataSource" } } ``` ### Oracle-cdc -> Jdbc-Oracle ``` env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** Oracle-CDC { plugin_output = "customers" username = "dbzuser" password = "dbz" database-names = ["ORCLCDB"] schema-names = ["DEBEZIUM"] table-names = ["ORCLCDB.DEBEZIUM.FULL_TYPES"] url = "jdbc:oracle:thin:@oracle-host:1521/ORCLCDB" source.reader.close.timeout = 120000 connection.pool.size = 1 schema-changes.enabled = true } } sink { Jdbc { plugin_input = "customers" driver = "oracle.jdbc.driver.OracleDriver" url = "jdbc:oracle:thin:@oracle-host:1521/ORCLCDB" user = "dbzuser" password = "dbz" generate_sink_sql = true database = "ORCLCDB" table = "DEBEZIUM.FULL_TYPES_SINK" batch_size = 1 primary_keys = ["ID"] connection.pool.size = 1 } } ``` ### Oracle-cdc -> Jdbc-Mysql ``` env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { # This is a example source plugin **only for test and demonstrate the feature source plugin** Oracle-CDC { plugin_output = "customers" username = "dbzuser" password = "dbz" database-names = ["ORCLCDB"] schema-names = ["DEBEZIUM"] table-names = ["ORCLCDB.DEBEZIUM.FULL_TYPES"] url = "jdbc:oracle:thin:@oracle-host:1521/ORCLCDB" source.reader.close.timeout = 120000 connection.pool.size = 1 schema-changes.enabled = true } } sink { jdbc { plugin_input = "customers" url = 
"jdbc:mysql://oracle-host:3306/oracle_sink" driver = "com.mysql.cj.jdbc.Driver" user = "st_user_sink" password = "mysqlpw" generate_sink_sql = true # You need to configure both database and table database = oracle_sink table = oracle_cdc_2_mysql_sink_table primary_keys = ["ID"] } } ``` ### Mysql-cdc -> StarRocks ``` env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MySQL-CDC { username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { StarRocks { nodeUrls = ["starrocks_cdc_e2e:8030"] username = "root" password = "" database = "shop" table = "${table_name}" url = "jdbc:mysql://starrocks_cdc_e2e:9030/shop" max_retries = 3 enable_upsert_delete = true schema_save_mode="RECREATE_SCHEMA" data_save_mode="DROP_DATA" save_mode_create_template = """ CREATE TABLE IF NOT EXISTS shop.`${table_name}` ( ${rowtype_primary_key}, ${rowtype_fields} ) ENGINE=OLAP PRIMARY KEY (${rowtype_primary_key}) DISTRIBUTED BY HASH (${rowtype_primary_key}) PROPERTIES ( "replication_num" = "1", "in_memory" = "false", "enable_persistent_index" = "true", "replicated_storage" = "true", "compression" = "LZ4" ) """ } } ``` ### Mysql-CDC -> Doris ``` env { # You can set engine configuration here parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { Doris { fenodes = "doris_e2e:8030" username = "root" password = "" database = "shop" table = "products" sink.label-prefix = "test-cdc" sink.enable-2pc = "true" sink.enable-delete = "true" doris.config { format = "json" read_json_by_line = "true" } } } ``` ### Mysql-CDC -> Jdbc-Postgres ```hocon env { # You can set engine configuration here parallelism = 5 
job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { jdbc { url = "jdbc:postgresql://postgresql:5432/shop" driver = "org.postgresql.Driver" user = "postgres" password = "postgres" generate_sink_sql = true database = shop table = "public.sink_table_with_schema_change" primary_keys = ["id"] # Validate ddl update for sink writer multi replica multi_table_sink_replica = 2 } } ``` ### Mysql-CDC -> Jdbc-Dameng ```hocon env { # You can set engine configuration here parallelism = 5 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { jdbc { url = "jdbc:dm://e2e_dmdb:5236" driver = "dm.jdbc.driver.DmDriver" connection_check_timeout_sec = 1000 user = "SYSDBA" password = "SYSDBA" generate_sink_sql = true database = "DAMENG" table = "SYSDBA.sink_table_with_schema_change" primary_keys = ["id"] # Validate ddl update for sink writer multi replica multi_table_sink_replica = 2 } } ``` ### Mysql-CDC -> Jdbc-SqlServer ```hocon env { # You can set engine configuration here parallelism = 5 job.mode = "STREAMING" checkpoint.interval = 5000 read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { server-id = 5652-5657 username = "st_user_source" password = "mysqlpw" table-names = ["shop.products"] url = "jdbc:mysql://mysql_cdc_e2e:3306/shop" schema-changes.enabled = true } } sink { jdbc { url = "jdbc:sqlserver://e2e_sqlserver:1433" driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver" user = "sa" password = 
"paanssy1234$" generate_sink_sql = true database = master table = "dbo.sink_table_with_schema_change" primary_keys = ["id"] # Validate ddl update for sink writer multi replica multi_table_sink_replica = 2 } } ``` ================================================ FILE: docs/zh/introduction/configuration/sink-options-placeholders.md ================================================ # Sink 参数占位符 ## 介绍 SeaTunnel 提供了 Sink 参数占位符自动替换功能,可让您通过占位符获取上游表元数据。 当您需要动态获取上游表元数据(例如多表写入)时,此功能至关重要。 本文档将指导您如何使用这些占位符以及如何有效地利用它们。 ## 支持的引擎 > SeaTunnel Zeta
    > Flink
    > Spark
    ## 占位符变量 占位符主要通过以下表达式实现: - `${database_name}` - 用于获取上游表中的数据库名称 - 也可以通过表达式指定默认值:`${database_name:default_my_db}` - `${schema_name}` - 用于获取上游表中的 schema 名称 - 也可以通过表达式指定默认值:`${schema_name:default_my_schema}` - `${table_name}` - 用于获取上游表中的 table 名称 - 也可以通过表达式指定默认值:`${table_name:default_my_table}` - `${schema_full_name}` - 用于获取上游表中的 schema 全路径名称,包含 database/schema 名称 - `${table_full_name}` - 用于获取上游表中的 table 全路径名称,包含 database/schema/table 名称 - `${primary_key}` - 用于获取上游表中的主键字段名称列表 - `${unique_key}` - 用于获取上游表中的唯一键字段名称列表 - `${field_names}` - 用于获取上游表中的所有字段名称列表 - `${comment}` - 用于获取上游表中的表注释 - `${partition_keys}` - 用于获取上游表中的分区字段列表 ## 配置 *先决条件*: - 确认 Sink 连接器已经支持了 `TableSinkFactory` API ### 配置示例 1 ```hocon env { // ignore... } source { MySQL-CDC { // ignore... } } transform { // ignore... } sink { jdbc { url = "jdbc:mysql://localhost:3306" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "123456" database = "${database_name}_test" table = "${table_name}_test" primary_keys = ["${primary_key}"] } } ``` ### 配置示例 2 ```hocon env { // ignore... } source { Oracle-CDC { // ignore... } } transform { // ignore... } sink { jdbc { url = "jdbc:mysql://localhost:3306" driver = "com.mysql.cj.jdbc.Driver" user = "root" password = "123456" database = "${schema_name}_test" table = "${table_name}_test" primary_keys = ["${primary_key}"] } } ``` 占位符的替换将在连接器启动之前完成,确保 Sink 参数在使用前已准备就绪。 若该占位符变量没有被替换,则可能是上游表元数据缺少该选项,例如: - `mysql` source 连接器不包含 `${schema_name}` 元数据 - `oracle` source 连接器不包含 `${database_name}` 元数据 - ... ================================================ FILE: docs/zh/introduction/configuration/speed-limit.md ================================================ # 速度控制 ## 介绍 SeaTunnel提供了强大的速度控制功能允许你管理数据同步的速率。当你需要确保在系统之间数据传输的高效和可控这个功能是至关重要的。 速度控制主要由两个关键参数控制:`read_limit.rows_per_second` 和 `read_limit.bytes_per_second`。 本文档将指导您如何使用这些参数以及如何有效地利用它们。 ## 支持这些引擎 > SeaTunnel Zeta
    > Flink
    > Spark
    ## 配置 要使用速度控制功能,你需要在job配置中设置`read_limit.rows_per_second` 或 `read_limit.bytes_per_second`参数。 配置文件中env配置示例: ```hocon env { job.mode=STREAMING job.name=SeaTunnel_Job read_limit.bytes_per_second=7000000 read_limit.rows_per_second=400 } source { MySQL-CDC { // ignore... } } transform { } sink { Console { } } ``` 我们在`env`参数中放了`read_limit.bytes_per_second` 和 `read_limit.rows_per_second`来完成速度控制的配置。 你可以同时配置这两个参数,或者只配置其中一个。每个`value`的值代表每个线程被限制的最大速率。 因此,在配置各个值时,还需要同时考虑你任务的并行性。 ================================================ FILE: docs/zh/introduction/configuration/sql-config.md ================================================ # SQL配置文件 在编写`SQL`配置文件之前,请确保配置文件的名称应该以`.sql`结尾。 ## SQL配置文件结构 `SQL`配置文件类似下面这样: ### SQL ```sql /* config env { parallelism = 1 job.mode = "BATCH" } */ CREATE TABLE source_table WITH ( 'connector'='jdbc', 'type'='source', 'url' = 'jdbc:mysql://localhost:3306/seatunnel', 'driver' = 'com.mysql.cj.jdbc.Driver', 'user' = 'root', 'password' = '123456', 'query' = 'select * from source', 'properties'= '{ useSSL = false, rewriteBatchedStatements = true }' ); CREATE TABLE sink_table WITH ( 'connector'='jdbc', 'type'='sink', 'url' = 'jdbc:mysql://localhost:3306/seatunnel', 'driver' = 'com.mysql.cj.jdbc.Driver', 'user' = 'root', 'password' = '123456', 'generate_sink_sql' = 'true', 'database' = 'seatunnel', 'table' = 'sink' ); INSERT INTO sink_table SELECT id, name, age, email FROM source_table; ``` ## `SQL`配置文件说明 ### 通用配置 ```sql /* config env { parallelism = 1 job.mode = "BATCH" } */ ``` 在`SQL`文件中通过 `/* config */` 注释定义通用配置部分,内部可以使用`hocon`格式定义通用的配置,如`env`等。 ### SOURCE SQL语法 ```sql CREATE TABLE source_table WITH ( 'connector'='jdbc', 'type'='source', 'url' = 'jdbc:mysql://localhost:3306/seatunnel', 'driver' = 'com.mysql.cj.jdbc.Driver', 'user' = 'root', 'password' = '123456', 'query' = 'select * from source', 'properties' = '{ useSSL = false, rewriteBatchedStatements = true }' ); ``` * 使用 `CREATE TABLE ... 
WITH (...)` 语法可创建源端表映射, `TABLE`表名为源端映射的表名,`WITH`语法中为源端相关的配置参数 * 在WITH语法中有两个固定参数:`connector` 和 `type`,分别表示连接器插件名(如:`jdbc`、`FakeSource`等)和源端类型(固定为:`source`) * 其它参数名可以参考对应连接器插件的相关配置参数,但是格式需要改为`'key' = 'value',`的形式 * 如果`'value'`为一个子配置,可以直接使用`hocon`格式的字符串,注意:如果使用`hocon`格式的子配置,内部的属性项之间必须用`,`分隔!如: ```sql 'properties' = '{ useSSL = false, rewriteBatchedStatements = true }' ``` * 如果在`'value'`中使用到`'`,需要用`''`进行转义,如: ```sql 'query' = 'select * from source where name = ''Joy Ding''' ``` ### SINK SQL语法 ```sql CREATE TABLE sink_table WITH ( 'connector'='jdbc', 'type'='sink', 'url' = 'jdbc:mysql://localhost:3306/seatunnel', 'driver' = 'com.mysql.cj.jdbc.Driver', 'user' = 'root', 'password' = '123456', 'generate_sink_sql' = 'true', 'database' = 'seatunnel', 'table' = 'sink' ); ``` * 使用 `CREATE TABLE ... WITH (...)` 语法可创建目标端表映射, `TABLE`表名为目标端映射的表名,`WITH`语法中为目标端相关的配置参数 * 在WITH语法中有两个固定参数:`connector` 和 `type`,分别表示连接器插件名(如:`jdbc`、`console`等)和目标端类型(固定为:`sink`) * 其它参数名可以参考对应连接器插件的相关配置参数,但是格式需要改为`'key' = 'value',`的形式 ### INSERT INTO SELECT语法 ```sql INSERT INTO sink_table SELECT id, name, age, email FROM source_table; ``` * `SELECT FROM` 部分为源端映射表的表名,`SELECT` 部分的语法参考:[SQL-transform](../../transforms/sql.md) `query` 配置项。如果select的字段是关键字([参考](https://github.com/JSQLParser/JSqlParser/blob/master/src/main/jjtree/net/sf/jsqlparser/parser/JSqlParserCC.jjt)),你应该像这样使用\`fieldName\` ```sql INSERT INTO sink_table SELECT id, name, age, email,`output` FROM source_table; ``` * `INSERT INTO` 部分为目标端映射表的表名 * 注意:该语法**不支持**在 `INSERT` 中指定字段,如:`INSERT INTO sink_table (id, name, age, email) SELECT id, name, age, email FROM source_table;` ### INSERT INTO SELECT TABLE语法 ```sql INSERT INTO sink_table SELECT source_table; ``` * `SELECT` 部分直接使用源端映射表的表名,表示将源端表的所有数据插入到目标端表中 * 使用该语法不会生成`transform`的相关配置,这种语法一般用在多表同步的场景,示例: ```sql CREATE TABLE source_table WITH ( 'connector'='jdbc', 'type' = 'source', 'url' = 'jdbc:mysql://127.0.0.1:3306/seatunnel', 'driver' = 'com.mysql.cj.jdbc.Driver', 'user' = 'root', 'password' = 
'123456', 'table_list' = '[ { table_path = "source.table1" }, { table_path = "source.table2", query = "select * from source.table2" } ]' ); CREATE TABLE sink_table WITH ( 'connector'='jdbc', 'type' = 'sink', 'url' = 'jdbc:mysql://127.0.0.1:3306/seatunnel', 'driver' = 'com.mysql.cj.jdbc.Driver', 'user' = 'root', 'password' = '123456', 'generate_sink_sql' = 'true', 'database' = 'sink' ); INSERT INTO sink_table SELECT source_table; ``` ### CREATE TABLE AS语法 ```sql CREATE TABLE temp1 AS SELECT id, name, age, email FROM source_table; ``` * 该语法可以将一个`SELECT`查询结果作为一个临时表,用于后续的`INSERT INTO`操作 * `SELECT` 部分的语法参考:[SQL Transform](../../transforms/sql.md) `query` 配置项 ```sql CREATE TABLE temp1 AS SELECT id, name, age, email FROM source_table; INSERT INTO sink_table SELECT * FROM temp1; ``` ## SQL配置文件任务提交示例 ```bash ./bin/seatunnel.sh --config ./config/sample.sql ``` ================================================ FILE: docs/zh/introduction/how-it-works.md ================================================ --- sidebar_position: 2 --- # 工作原理 ## 概述 SeaTunnel 是一个分布式多模态数据集成工具,采用插件化架构。连接器层与执行引擎解耦,同一套连接器可在不同引擎上运行。 ``` ┌─────────────────────────────────────────────────────────────┐ │ 作业配置 │ │ (HOCON / SQL / Web UI) │ └─────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────┐ │ SeaTunnel 核心层 │ │ (作业解析器、协调器、调度器) │ └─────────────────────────────────────────────────────────────┘ │ ┌─────────────────────┼─────────────────────┐ ▼ ▼ ▼ ┌───────────────┐ ┌───────────────┐ ┌───────────────┐ │ Source │────▶│ Transform │────▶│ Sink │ │ 数据源连接器 │ │ (可选) │ │ 目标连接器 │ └───────────────┘ └───────────────┘ └───────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────┐ │ 执行引擎 │ │ SeaTunnel Engine (Zeta) / Flink / Spark │ └─────────────────────────────────────────────────────────────┘ ``` ## 核心组件 ### 1. 
Connector API 与引擎无关的统一 API,用于开发 Source、Transform、Sink 连接器。 | 组件 | 说明 | |------|------| | **Source** | 从外部系统读取数据(数据库、文件、消息队列) | | **Transform** | 数据转换(字段映射、过滤、类型转换) | | **Sink** | 将数据写入目标系统 | ### 2. 执行引擎 | 引擎 | 适用场景 | |------|---------| | **SeaTunnel Engine (Zeta)** | 数据同步、CDC、低资源消耗 | | **Apache Flink** | 复杂流处理、已有 Flink 基础设施 | | **Apache Spark** | 大规模批处理、已有 Spark 基础设施 | ### 3. 翻译层 将 SeaTunnel 统一 API 转换为引擎特定实现,实现连接器跨引擎复用。 ## 数据流 ``` Source ──▶ [分片] ──▶ Reader ──▶ Transform ──▶ Writer ──▶ Sink │ │ │ │ ▼ │ │ Checkpoint/状态 │ │ │ │ └──────────────────────┴────────────────────────┘ 容错机制 ``` **核心特性:** - 基于分片的并行读取 - 分布式快照实现精确一次语义 - 自动故障转移和恢复 ## 模块结构 ``` seatunnel/ ├── seatunnel-api/ # 核心 API 定义 ├── seatunnel-connectors-v2/ # Source & Sink 连接器 ├── seatunnel-transforms-v2/ # Transform 插件 ├── seatunnel-engine/ # SeaTunnel Engine (Zeta) ├── seatunnel-translation/ # 引擎适配器 (Flink/Spark) ├── seatunnel-core/ # 作业提交 & CLI ├── seatunnel-formats/ # 数据格式处理 └── seatunnel-e2e/ # 端到端测试 ``` ## 作业执行流程 1. **解析** - 读取并验证作业配置 2. **规划** - 生成带并行度的执行计划 3. **调度** - 将任务分发到 Worker 节点 4. **执行** - 运行 Source → Transform → Sink 管道 5. 
**监控** - 跟踪进度、指标和检查点 ## 下一步 - [引擎对比](../engines/overview.md) - [快速开始](../getting-started/locally/quick-start-seatunnel-engine.md) - [连接器列表](../connectors/overview.md) ================================================ FILE: docs/zh/tools/overview.md ================================================ --- sidebar_position: 1 --- # SeaTunnel 工具集概览 Apache SeaTunnel 工具集是一组面向开发者和运维人员的辅助工具,涵盖 LLM 集成、配置转换和 AI 辅助等功能。 ## 可用工具 | 工具 | 用途 | 状态 | |------|------|------| | [SeaTunnel Skill](seatunnel-skill) | Claude AI 集成,辅助 SeaTunnel 操作 | 可用 | | [SeaTunnel MCP 服务](seatunnel-mcp) | 面向 LLM 的模型上下文协议服务 | 可用 | | [x2seatunnel](x2seatunnel) | 配置转换工具(DataX → SeaTunnel) | 可用 | ## 源码仓库 所有工具均维护于 [SeaTunnel Tools](https://github.com/apache/seatunnel-tools) 仓库中。 ================================================ FILE: docs/zh/tools/seatunnel-mcp.md ================================================ --- sidebar_position: 3 --- # SeaTunnel MCP 服务 SeaTunnel MCP 服务实现了[模型上下文协议(Model Context Protocol)](https://modelcontextprotocol.io/),使 LLM 系统能够与 SeaTunnel 资源进行交互。 ## 概述 MCP 服务将 SeaTunnel 文档、连接器元数据和任务管理能力以 MCP 资源与工具的形式对外暴露,允许任意支持 MCP 协议的 LLM 客户端辅助完成 SeaTunnel 操作。 ## 快速开始 安装与配置说明请参阅 [SeaTunnel Tools 仓库](https://github.com/apache/seatunnel-tools/tree/main/seatunnel-mcp)。 ================================================ FILE: docs/zh/tools/seatunnel-skill.md ================================================ --- sidebar_position: 2 --- # SeaTunnel Skill SeaTunnel Skill 是 Claude Code 的 AI 集成技能,为 SeaTunnel 的操作、配置和故障排查提供即时帮助。 ## 功能特性 - **AI 助手**:即时获取 SeaTunnel 概念和配置相关帮助 - **知识集成**:查询官方文档和最佳实践 - **智能调试**:分析错误并给出修复建议 - **代码示例**:为您的用例自动生成配置示例 ## 安装 ```bash # 克隆仓库 git clone https://github.com/apache/seatunnel-tools.git cd seatunnel-tools # 复制技能文件到 Claude Code 技能目录 cp -r seatunnel-skill ~/.claude/skills/ ``` ## 使用方法 安装完成后,在 Claude Code 中使用: ```bash # 查询 SeaTunnel 文档 /seatunnel-skill "如何配置 MySQL 到 PostgreSQL 的数据同步?" 
# 获取连接器信息 /seatunnel-skill "列出所有可用的 Kafka 连接器选项" # 调试配置问题 /seatunnel-skill "为什么我的任务出现 OutOfMemoryError 错误?" # 生成配置示例 /seatunnel-skill "创建一个 MySQL 到 Elasticsearch 的任务配置" ``` ## 系统要求 - 已安装 [Claude Code](https://claude.ai/code) - Claude Code 技能目录位于 `~/.claude/skills/` ================================================ FILE: docs/zh/tools/x2seatunnel.md ================================================ --- sidebar_position: 4 --- # x2seatunnel x2seatunnel 是一款配置转换工具,可将 DataX 等数据集成工具的配置文件转换为 SeaTunnel 格式。 ## 支持的转换 | 源格式 | 目标格式 | |--------|---------| | DataX JSON | SeaTunnel HOCON | ## 快速开始 安装与使用说明请参阅 [x2seatunnel 仓库](https://github.com/apache/seatunnel-tools/tree/main/x2seatunnel)。 ================================================ FILE: docs/zh/transforms/common-options/common-options.md ================================================ # 转换常见选项 > 转换插件的常见参数 :::caution 警告 旧的配置名称 `result_table_name`/`source_table_name` 已经过时,请尽快迁移到新名称 `plugin_output`/`plugin_input`。 ::: | 参数名称 | 参数类型 | 是否必须 | 默认值 | |---------------|--------|------|-----| | plugin_output | string | no | - | | plugin_input | string | no | - | ### plugin_input [string] 当未指定 `plugin_input` 时,当前插件在配置文件中处理由前一个插件输出的数据集 `(dataset)` ; 当指定了 `plugin_input` 时,当前插件正在处理与该参数对应的数据集 ### plugin_output [string] 当未指定 `plugin_output` 时,此插件处理的数据不会被注册为其他插件可以直接访问的数据集,也不会被称为临时表 `(table)`; 当指定了 `plugin_output` 时,此插件处理的数据将被注册为其他插件可以直接访问的数据集 `(dataset)`,或者被称为临时表 `(table)`。在这里注册的数据集可以通过指定 `plugin_input` 被其他插件直接访问。 ## 示例 ================================================ FILE: docs/zh/transforms/copy.md ================================================ # 复制 > 复制转换插件 ## 描述 将字段复制到一个新字段。 ## 属性 | 名称 | 类型 | 是否必须 | 默认值 | |--------|--------|------|-----| | fields | Object | yes | | ### fields [config] 指定输入和输出之间的字段复制关系 ### 常见选项 [string] 转换插件的常见参数, 请参考 [Transform Plugin](common-options/common-options.md) 了解详情。 ## 示例 从源读取的数据是这样的一个表: | name | age | card | |----------|-----|------| | Joy Ding | 20 | 123 | | May Ding | 20 | 123 | | Kin Dom | 20 | 123 | | 
Joy Dom | 20 | 123 | 想要将字段 `name`、`age` 复制到新的字段 `name1`、`name2`、`age1`,我们可以像这样添加 `Copy` 转换: ``` transform { Copy { plugin_input = "fake" plugin_output = "fake1" fields { name1 = name name2 = name age1 = age } } } ``` 那么结果表 `fake1` 中的数据将会像这样: | name | age | card | name1 | name2 | age1 | |----------|-----|------|----------|----------|------| | Joy Ding | 20 | 123 | Joy Ding | Joy Ding | 20 | | May Ding | 20 | 123 | May Ding | May Ding | 20 | | Kin Dom | 20 | 123 | Kin Dom | Kin Dom | 20 | | Joy Dom | 20 | 123 | Joy Dom | Joy Dom | 20 | ## 更新日志 ### 新版本 - 添加复制转换连接器 - 支持将字段复制到新字段 ================================================ FILE: docs/zh/transforms/data-validator.md ================================================ # DataValidator > 数据验证转换插件 ## 描述 DataValidator 转换插件根据配置的规则验证字段值,并基于指定的错误处理策略处理验证失败的情况。它支持多种验证规则类型,包括空值检查、范围验证、长度验证和正则表达式模式匹配。 ## 选项 | 名称 | 类型 | 是否必需 | 默认值 | |-----------------|--------|----------|--------| | row_error_handle_way | enum | 否 | FAIL | | row_error_handle_way.error_table | string | 否 | | | field_rules | array | 是 | | ### row_error_handle_way [enum] 验证失败时的错误处理策略: - `FAIL`: 当验证错误发生时,整个任务失败 - `SKIP`: 跳过无效行并继续处理 - `ROUTE_TO_TABLE`: 将无效数据路由到指定的错误表 **注意**: `ROUTE_TO_TABLE` 模式仅适用于支持多表的 sink 连接器。sink 必须具备处理路由到不同表目标的数据的能力。 ### row_error_handle_way.error_table [string] 当 `row_error_handle_way` 设置为 `ROUTE_TO_TABLE` 时,用于路由无效数据的目标表名。使用 `ROUTE_TO_TABLE` 模式时此参数为必需。 #### 错误表Schema 当使用 `ROUTE_TO_TABLE` 模式时,DataValidator会自动创建一个具有固定schema的错误表来存储验证失败的数据。错误表包含以下字段: | 字段名 | 数据类型 | 描述 | |--------|----------|------| | source_table_id | STRING | 源表标识符,标识数据来源的表 | | source_table_path | STRING | 源表路径,完整的表路径信息 | | original_data | STRING | 原始数据的JSON表示,包含验证失败的完整行数据 | | validation_errors | STRING | 验证错误详情的JSON数组,包含所有验证失败的字段和错误信息 | | create_time | TIMESTAMP | 验证错误的创建时间 | **完整错误表记录示例**: ```json { "source_table_id": "users_table", "source_table_path": "database.users", "original_data": "{\"id\": 123, \"name\": null, \"age\": 200, \"email\": \"invalid-email\"}", "validation_errors": 
"[{\"field_name\": \"name\", \"error_message\": \"Field 'name' cannot be null\"}, {\"field_name\": \"age\", \"error_message\": \"Field 'age' value 200 is not within range [0, 150]\"}, {\"field_name\": \"email\", \"error_message\": \"Field 'email' does not match pattern '^[\\\\w-\\\\.]+@([\\\\w-]+\\\\.)+[\\\\w-]{2,4}$'\"}]", "create_time": "2024-01-15T10:30:45" } ``` **数据路由机制**: - 验证通过的数据会保持原始schema并路由到主输出表 - 验证失败的数据会被转换为上述错误表schema格式并路由到指定的错误表 - 每个验证失败的行都会在错误表中生成一条记录,包含完整的原始数据和详细的错误信息 ### field_rules [array] 字段验证规则数组。每个规则定义特定字段的验证条件。 #### 字段规则结构 每个字段规则包含: - `field_name`: 要验证的字段名称 - `rules`: 要应用的验证规则数组(嵌套格式),或单独的规则属性(扁平格式) #### 验证规则类型 ##### NOT_NULL 验证字段值不为空。 参数: - `rule_type`: "NOT_NULL" - `custom_message` (可选): 自定义错误消息 ##### RANGE 验证数值在指定范围内。 参数: - `rule_type`: "RANGE" - `min_value` (可选): 最小允许值 - `max_value` (可选): 最大允许值 - `min_inclusive` (可选): 最小值是否包含在内(默认: true) - `max_inclusive` (可选): 最大值是否包含在内(默认: true) - `custom_message` (可选): 自定义错误消息 ##### LENGTH 验证字符串、数组或集合值的长度。 参数: - `rule_type`: "LENGTH" - `min_length` (可选): 最小允许长度 - `max_length` (可选): 最大允许长度 - `exact_length` (可选): 精确要求的长度 - `custom_message` (可选): 自定义错误消息 ##### REGEX 验证字符串值匹配正则表达式模式。 参数: - `rule_type`: "REGEX" - `pattern`: 正则表达式模式(必需) - `case_sensitive` (可选): 模式匹配是否区分大小写(默认: true) - `custom_message` (可选): 自定义错误消息 ##### UDF (用户自定义函数) 使用自定义业务逻辑实现的用户自定义函数验证字段值。 参数: - `rule_type`: "UDF" - `function_name`: 要执行的UDF函数名称(必需) - `custom_message` (可选): 自定义错误消息 **内置UDF函数:** - `EMAIL`: 基于OWASP建议使用实用验证规则验证电子邮件地址 **创建自定义UDF函数:** 要创建自定义UDF函数: 1. 实现 `DataValidatorUDF` 接口 2. 使用 `@AutoService(DataValidatorUDF.class)` 注解 3. 提供唯一的 `functionName()` 4. 
实现包含自定义逻辑的 `validate()` 方法 ### 通用选项 [string] 转换插件通用参数,请参考 [Transform Plugin](common-options/common-options.md) 了解详情 ## 示例 ### 示例 1: 使用 FAIL 模式的基本验证 ```hocon transform { DataValidator { plugin_input = "source_table" plugin_output = "validated_table" row_error_handle_way = "FAIL" field_rules = [ { field_name = "name" rule_type = "NOT_NULL" }, { field_name = "age" rule_type = "RANGE" min_value = 0 max_value = 150 }, { field_name = "email" rule_type = "REGEX" pattern = "^[\\w-\\.]+@([\\w-]+\\.)+[\\w-]{2,4}$" } ] } } ``` ### 示例 2: 使用 SKIP 模式的验证 ```hocon transform { DataValidator { plugin_input = "source_table" plugin_output = "validated_table" row_error_handle_way = "SKIP" field_rules = [ { field_name = "name" rule_type = "NOT_NULL" }, { field_name = "name" rule_type = "LENGTH" min_length = 2 max_length = 50 } ] } } ``` ### 示例 3: 使用 ROUTE_TO_TABLE 模式的验证 ```hocon transform { DataValidator { plugin_input = "source_table" plugin_output = "validated_table" row_error_handle_way = "ROUTE_TO_TABLE" row_error_handle_way.error_table = "error_data" field_rules = [ { field_name = "name" rule_type = "NOT_NULL" }, { field_name = "age" rule_type = "RANGE" min_value = 0 max_value = 150 } ] } } ``` **注意**: 使用 `ROUTE_TO_TABLE` 时,请确保您的 sink 连接器支持多表。有效数据将发送到主输出表,而无效数据将路由到指定的错误表。 在此示例中: - 验证通过的数据将保持原始schema(包含name、age等字段)并发送到主输出表 - 验证失败的数据将被转换为错误表schema(包含source_table_id、source_table_path、original_data、validation_errors、create_time字段)并路由到"error_data"表 ### 示例 4: 嵌套规则格式 ```hocon transform { DataValidator { plugin_input = "source_table" plugin_output = "validated_table" row_error_handle_way = "FAIL" field_rules = [ { field_name = "name" rules = [ { rule_type = "NOT_NULL" custom_message = "姓名是必需的" }, { rule_type = "LENGTH" min_length = 2 max_length = 50 custom_message = "姓名长度必须在2到50个字符之间" } ] } ] } } ``` ### 示例 5: 使用内置UDF进行邮箱验证 ```hocon transform { DataValidator { plugin_input = "source_table" plugin_output = "validated_table" row_error_handle_way = "FAIL" field_rules = [ { field_name = 
"email" rule_type = "UDF" function_name = "EMAIL" custom_message = "邮箱地址格式无效" } ] } } ``` ## UDF开发指南 ### 创建自定义UDF函数 要创建自定义验证UDF函数,请按照以下步骤: #### 1. 实现DataValidatorUDF接口 ```java package com.example.validator; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import org.apache.seatunnel.transform.validator.ValidationContext; import org.apache.seatunnel.transform.validator.ValidationResult; import org.apache.seatunnel.transform.validator.udf.DataValidatorUDF; import com.google.auto.service.AutoService; @AutoService(DataValidatorUDF.class) public class PhoneValidator implements DataValidatorUDF { @Override public String functionName() { return "PHONE_VALIDATOR"; } @Override public ValidationResult validate( Object value, SeaTunnelDataType dataType, ValidationContext context) { if (value == null) { return ValidationResult.success(); } String phone = value.toString().trim(); // 自定义手机号验证逻辑 if (phone.matches("^\\+?[1-9]\\d{1,14}$")) { return ValidationResult.success(); } else { return ValidationResult.failure("手机号码格式无效: " + phone); } } @Override public String getDescription() { return "验证国际手机号码格式"; } } ``` #### 2. 注册UDF UDF通过 `@AutoService(DataValidatorUDF.class)` 注解自动注册。这使用Java的ServiceLoader机制在运行时发现和加载UDF实现。 #### 3. 打包和部署 1. 编译您的UDF类并将其打包到JAR文件中 2. 将JAR文件放置在SeaTunnel类路径中 3. 
UDF将被自动发现并可供使用 **使用示例**: ```hocon { field_name = "email" rule_type = "UDF" function_name = "EMAIL" custom_message = "请提供有效的邮箱地址" } ``` ================================================ FILE: docs/zh/transforms/define-sink-type.md ================================================ # Define Sink Type > Define sink type transform plugin ## Description 用于定义 sink 字段存储类型,对于 savemode 开启自动建表时有效 ## Options | name | type | required | default value | Description | |:-------:|---------------------------|----------|---------------|--------------------| | columns | list> | yes | | 需要定义的列,必须设置列的名称和类型 | ## Examples ### 指定部分字段的建表类型 ``` transform { DefineSinkType { columns = [ { column = "c1" type = "nvarchar2(10)" } { column = "c2" type = "datetime(6)" } { column = "c3" type = "your target type" } ] } } ``` ================================================ FILE: docs/zh/transforms/dynamic-compile.md ================================================ # DynamicCompile > 动态编译插件 ## 描述 :::tip 特别申明 您需要确保服务的安全性,并防止攻击者上传破坏性代码 ::: 提供一种可编程的方式来处理行,允许用户自定义任何业务行为,甚至基于现有行字段作为参数的RPC请求,或者通过从其他数据源检索相关数据来扩展字段。为了区分业务,您还可以定义多个转换进行组合, 如果转换过于复杂,可能会影响性能 ## 属性 | name | type | required | default value | |------------------|--------|----------|---------------| | source_code | string | no | | | compile_language | Enum | yes | | | compile_pattern | Enum | no | SOURCE_CODE | | absolute_path | string | no | | ### common options [string] 转换插件的常见参数, 请参考 [Transform Plugin](common-options/common-options.md) 了解详情。 ### compile_language [Enum] Java中的某些语法可能不受支持,请参阅https://github.com/janino-compiler/janino GROOVY,JAVA,SCALA(目前支持 Zeta) ### compile_pattern [Enum] SOURCE_CODE,ABSOLUTE_PATH 选择 SOURCE_CODE,SOURCE_CODE 属性必填;选择ABSOLUTE_PATH,ABSOLUTE_PATH属性必填。 ### absolute_path [string] 服务器上Java或Groovy文件的绝对路径 ### source_code [string] 源代码 #### 关于source_code 在代码中,你必须实现两个方法 - `Column[] getInlineOutputColumns(CatalogTable inputCatalogTable)` - `Object[] getInlineOutputFieldValues(SeaTunnelRowAccessor inputRow)` 
`getInlineOutputColumns`方法中,入参类型为`CatalogTable`,返回结果为`Column[]`。 你可以从入参的`CatalogTable`获取当前表的表结构。 在返回结果中,如果字段已经存在,则会根据返回结果进行覆盖,如果不存在,则会添加到现有表结构中。 `getInlineOutputFieldValues`方法,入参类型为`SeaTunnelRowAccessor`,返回结果为`Object[]` 你可以从`SeaTunnelRowAccessor`获取到当前行的数据,进行自己的定制化数据处理逻辑。 返回结果中,数组长度需要与`getInlineOutputColumns`方法返回的长度一致,并且里面的字段值顺序也需要保持一致。 如果有第三方依赖包,请将它们放在${SEATUNNEL_HOME}/lib中,如果您使用spark或flink,则需要将其放在相应服务的libs下。 你需要重启集群服务,才能重新加载这些依赖。 ## Example 源端数据读取的表格如下: | name | age | card | |----------|-----|------| | Joy Ding | 20 | 123 | | May Ding | 20 | 123 | | Kin Dom | 30 | 123 | | Joy Dom | 30 | 123 | 我们将使用`DynamicCompile`对数据进行修改,添加一列`compile_language`字段,并且将`age`字段更新,当`age=20`时将其更新为`40` - 使用groovy ```hocon transform { DynamicCompile { plugin_input = "fake" plugin_output = "groovy_out" compile_language="GROOVY" compile_pattern="SOURCE_CODE" source_code=""" import org.apache.seatunnel.api.table.catalog.Column import org.apache.seatunnel.api.table.type.SeaTunnelRowAccessor import org.apache.seatunnel.api.table.catalog.CatalogTable import org.apache.seatunnel.api.table.catalog.PhysicalColumn; import org.apache.seatunnel.api.table.type.*; import java.util.ArrayList; class demo { public Column[] getInlineOutputColumns(CatalogTable inputCatalogTable) { PhysicalColumn col1 = PhysicalColumn.of( "compile_language", BasicType.STRING_TYPE, 10L, true, "", ""); PhysicalColumn col2 = PhysicalColumn.of( "age", BasicType.INT_TYPE, 0L, false, false, "" ); return new Column[]{ col1, col2 }; } public Object[] getInlineOutputFieldValues(SeaTunnelRowAccessor inputRow) { Object[] fieldValues = new Object[2]; // get age Object ageField = inputRow.getField(1); fieldValues[0] = "GROOVY"; if (Integer.parseInt(ageField.toString()) == 20) { fieldValues[1] = 40; } else { fieldValues[1] = ageField; } return fieldValues; } };""" } } ``` - 使用java ```hocon transform { DynamicCompile { plugin_input = "fake" plugin_output = "java_out" compile_language="JAVA" compile_pattern="SOURCE_CODE" source_code=""" 
import org.apache.seatunnel.api.table.catalog.Column; import org.apache.seatunnel.api.table.type.SeaTunnelRowAccessor; import org.apache.seatunnel.api.table.catalog.*; import org.apache.seatunnel.api.table.type.*; import java.util.ArrayList; public Column[] getInlineOutputColumns(CatalogTable inputCatalogTable) { PhysicalColumn col1 = PhysicalColumn.of( "compile_language", BasicType.STRING_TYPE, 10L, true, "", ""); PhysicalColumn col2 = PhysicalColumn.of( "age", BasicType.INT_TYPE, 0L, false, false, "" ); return new Column[]{ col1, col2 }; } public Object[] getInlineOutputFieldValues(SeaTunnelRowAccessor inputRow) { Object[] fieldValues = new Object[2]; // get age Object ageField = inputRow.getField(1); fieldValues[0] = "JAVA"; if (Integer.parseInt(ageField.toString()) == 20) { fieldValues[1] = 40; } else { fieldValues[1] = ageField; } return fieldValues; } """ } } ``` - 指定源码文件路径 ```hacon transform { DynamicCompile { plugin_input = "fake" plugin_output = "groovy_out" compile_language="GROOVY" compile_pattern="ABSOLUTE_PATH" absolute_path="""/tmp/GroovyFile""" } } ``` 那么结果表 `groovy_out` 中的数据将会更新为: | name | age | card | compile_language | |----------|-----|------|------------------| | Joy Ding | 40 | 123 | GROOVY | | May Ding | 40 | 123 | GROOVY | | Kin Dom | 30 | 123 | GROOVY | | Joy Dom | 30 | 123 | GROOVY | 那么结果表 `java_out` 中的数据将会更新为: | name | age | card | compile_language | |----------|-----|------|------------------| | Joy Ding | 40 | 123 | JAVA | | May Ding | 40 | 123 | JAVA | | Kin Dom | 30 | 123 | JAVA | | Joy Dom | 30 | 123 | JAVA | 更多复杂例子可以参考 https://github.com/apache/seatunnel/tree/dev/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-2/src/test/resources/dynamic_compile/conf ## Changelog ================================================ FILE: docs/zh/transforms/embedding.md ================================================ # Embedding > Embedding Transform Plugin ## 描述 `Embedding` 转换插件利用 embedding 
模型将文本和多模态数据转换为向量化表示。此转换可以应用于各种字段,包括文本、图片和视频。该插件支持多种模型提供商,并且可以与不同的API集成。 > **重要提示:** 当前 embedding 精确度仅支持 float32 ## 配置选项 | 名称 | 类型 | 是否必填 | 默认值 | 描述 | |--------------------------------|--------|------|--------|------------------------------------------------------------------| | model_provider | enum | 是 | - | embedding模型的提供商。可选项包括 `AMAZON`、`QIANFAN`、`OPENAI` 等。 | | api_key | string | 是 | - | 用于验证embedding服务的API密钥。 | | secret_key | string | 是 | - | 用于额外验证的密钥。一些提供商可能需要此密钥进行安全的API请求。 | | aws_region | string | 否 | | 用于使用Amazon Bedrock 模型,需要指定模型请求区域. | | single_vectorized_input_number | int | 否 | 1 | 单次请求向量化的输入数量。默认值为1。 | | vectorization_fields | map | 是 | - | 输入字段和相应的输出向量字段之间的映射。 | | model | string | 是 | - | 要使用的具体embedding模型。例如,如果提供商为OPENAI,可以指定 `text-embedding-3-small`。 | | api_path | string | 否 | - | embedding服务的API。通常由模型提供商提供。 | | dimension | int | 否 | 2048 | 向量维度默认为 2048,Embedding-3模型支持自定义向量维度,建议选择256、512、1024或2048维度。 | | oauth_path | string | 否 | - | oauth 服务的 API 。 | | custom_config | map | 否 | | 模型的自定义配置。 | | custom_response_parse | string | 否 | | 使用 JsonPath 解析模型响应的方式。示例:`$.choices[*].message.content`。 | | custom_request_headers | map | 否 | | 发送到模型的请求的自定义头信息。 | | custom_request_body | map | 否 | | 请求体的自定义配置。支持占位符如 `${model}`、`${input}`。 | ## 精度支持 **重要:** 当前版本的 Embedding 插件仅支持 **float32** 精度的向量数据。 - 所有生成的 embedding 向量将以 float32 格式存储 - 如果您的模型或API返回其他精度格式(如 float64),插件会自动转换为 float32 ### model_provider 用于生成 embedding 的模型提供商。常见选项包括 `AMAZON`、 `DOUBAO`、`QIANFAN`、`OPENAI` 等,同时可选择 `CUSTOM` 实现自定义 embedding 模型的请求以及获取。 ### api_key 用于验证 embedding 服务请求的API密钥。通常由模型提供商在你注册他们的服务时提供,对于使用`AMAZON` 模型则对应IAM access key。 ### secret_key 用于额外验证的密钥。一些提供商可能要求此密钥以确保API请求的安全性。 ### single_vectorized_input_number 指定单次请求向量化的输入数量。默认值为1。根据处理能力和模型提供商的API限制进行调整。 ### vectorization_fields 输入字段和相应的输出向量字段之间的映射。这使得插件可以理解要向量化的字段以及如何存储生成的向量。插件通过允许您为每个字段指定模态类型来支持多模态数据。 **基本文本向量化:** ```hocon vectorization_fields { book_intro_vector = book_intro author_biography_vector = author_biography } ``` **多模态向量化:** 
```hocon vectorization_fields { # 基本文本字段 text_vector = text_field # 显式指定模态类型的配置 product_image_vector = { field = product_image_url modality = jpeg format = url } # 自动检测模态类型(根据文件后缀) thumbnail_vector = { field = thumbnail_image # 如果值为 "image.png",会自动检测为 PNG 模态 format = url } # 视频字段配置 demo_video_vector = { field = product_video_url modality = mp4 format = url } # 二进制数据配置 binary_image_vector = { field = image_data modality = jpeg format = binary } } ``` **字段规范格式:** **支持的模态类型:** - **图片:** `jpeg` (jpg, jpeg), `png` (png, apng), `gif`, `webp`, `bmp` (bmp, dib), `tiff` (tiff, tif), `ico`, `icns`, `sgi`, `jpeg2000` (j2c, j2k, jp2, jpc, jpf, jpx) - **视频:** `mp4`, `avi`, `mov` - **文本:** `text`(默认) **数据格式:** - `text` - 文本格式(默认) - `url` - URL 格式 - `binary` - 二进制数据格式 **自动模态检测:** 当未显式指定 `modality` 且 `format` 不是 `binary` 时,系统会根据字段值的文件后缀自动检测模态类型: > **重要:** 使用多模态字段(图片或视频)时,请确保您的模型提供商支持多模态 embedding。图片和视频字段必须包含有效的 URL 或二进制数据。目前,`DOUBAO` 提供商支持多模态数据处理。 ### model 要使用的具体 embedding 模型。这取决于`model_provider`。例如,如果使用 OPENAI ,可以指定 `text-embedding-3-small`。 ### api_path 用于向 embedding 服务发送请求的API。根据提供商和所用模型的不同可能有所变化。通常由模型提供商提供。 ### oauth_path 用于向oauth服务发送请求的API,获取对应的认证信息。根据提供商和所用模型的不同可能有所变化。通常由模型提供商提供。 ### custom_config `custom_config` 选项允许您为模型提供额外的自定义配置。这是一个映射,您可以在其中定义特定模型可能需要的各种设置。 ### custom_response_parse `custom_response_parse` 选项允许您指定如何解析模型的响应。您可以使用 JsonPath 从响应中提取所需的特定数据。例如,使用 `$.data[*].embedding` 提取如下json中的 `embedding` 字段 值,获取 `List` 嵌套 `List` 的结果。JsonPath 的使用请参考 [JsonPath 快速入门](https://github.com/json-path/JsonPath?tab=readme-ov-file#getting-started) ```json { "object": "list", "data": [ { "object": "embedding", "index": 0, "embedding": [ -0.006929283495992422, -0.005336422007530928, -0.00004547132266452536, -0.024047505110502243 ] } ], "model": "text-embedding-3-small", "usage": { "prompt_tokens": 5, "total_tokens": 5 } } ``` ### custom_request_headers `custom_request_headers` 选项允许您定义应包含在发送到模型 API 的请求中的自定义头信息。如果 API 需要标准头信息之外的额外头信息,例如授权令牌、内容类型等,这个选项会非常有用。 ### custom_request_body 
`custom_request_body` 选项支持占位符: - `${model}`:用于模型名称的占位符。 - `${input}`:用于确定输入值的占位符,同时根据 body value 的类型定义请求体请求类型。例如:`["${input}"]` -> ["input"] ( list)。 ### common options 转换插件的常见参数, 请参考 [Transform Plugin](common-options/common-options.md) 了解详情 ## 示例配置 ### 基本文本 Embedding ```hocon env { job.mode = "BATCH" } source { FakeSource { row.num = 5 schema = { fields { book_id = "int" book_name = "string" book_intro = "string" author_biography = "string" } } rows = [ {fields = [1, "To Kill a Mockingbird", "Set in the American South during the 1930s, To Kill a Mockingbird tells the story of young Scout Finch and her brother, Jem, who are growing up in a world of racial inequality and injustice. Their father, Atticus Finch, is a lawyer who defends a black man falsely accused of raping a white woman, teaching his children valuable lessons about morality, courage, and empathy.", "Harper Lee (1926–2016) was an American novelist best known for To Kill a Mockingbird, which won the Pulitzer Prize in 1961. Lee was born in Monroeville, Alabama, and the town served as inspiration for the fictional Maycomb in her novel. Despite the success of her book, Lee remained a private person and published only one other novel, Go Set a Watchman, which was written before To Kill a Mockingbird but released in 2015 as a sequel." ], kind = INSERT} {fields = [2, "1984", "1984 is a dystopian novel set in a totalitarian society governed by Big Brother. The story follows Winston Smith, a man who works for the Party rewriting history. Winston begins to question the Party’s control and seeks truth and freedom in a society where individuality is crushed. The novel explores themes of surveillance, propaganda, and the loss of personal autonomy.", "George Orwell (1903–1950) was the pen name of Eric Arthur Blair, an English novelist, essayist, journalist, and critic. Orwell is best known for his works 1984 and Animal Farm, both of which are critiques of totalitarian regimes. 
His writing is characterized by lucid prose, awareness of social injustice, opposition to totalitarianism, and support of democratic socialism. Orwell’s work remains influential, and his ideas have shaped contemporary discussions on politics and society." ], kind = INSERT} {fields = [3, "Pride and Prejudice", "Pride and Prejudice is a romantic novel that explores the complex relationships between different social classes in early 19th century England. The story centers on Elizabeth Bennet, a young woman with strong opinions, and Mr. Darcy, a wealthy but reserved gentleman. The novel deals with themes of love, marriage, and societal expectations, offering keen insights into human behavior.", "Jane Austen (1775–1817) was an English novelist known for her sharp social commentary and keen observations of the British landed gentry. Her works, including Sense and Sensibility, Emma, and Pride and Prejudice, are celebrated for their wit, realism, and biting critique of the social class structure of her time. Despite her relatively modest life, Austen’s novels have gained immense popularity, and she is considered one of the greatest novelists in the English language." ], kind = INSERT} {fields = [4, "The Great GatsbyThe Great Gatsby", "The Great Gatsby is a novel about the American Dream and the disillusionment that can come with it. Set in the 1920s, the story follows Nick Carraway as he becomes entangled in the lives of his mysterious neighbor, Jay Gatsby, and the wealthy elite of Long Island. Gatsby's obsession with the beautiful Daisy Buchanan drives the narrative, exploring themes of wealth, love, and the decay of the American Dream.", "F. Scott Fitzgerald (1896–1940) was an American novelist and short story writer, widely regarded as one of the greatest American writers of the 20th century. Born in St. Paul, Minnesota, Fitzgerald is best known for his novel The Great Gatsby, which is often considered the quintessential work of the Jazz Age. 
His works often explore themes of youth, wealth, and the American Dream, reflecting the turbulence and excesses of the 1920s." ], kind = INSERT} {fields = [5, "Moby-Dick", "Moby-Dick is an epic tale of obsession and revenge. The novel follows the journey of Captain Ahab, who is on a relentless quest to kill the white whale, Moby Dick, that once maimed him. Narrated by Ishmael, a sailor aboard Ahab’s ship, the story delves into themes of fate, humanity, and the struggle between man and nature. The novel is also rich with symbolism and philosophical musings.", "Herman Melville (1819–1891) was an American novelist, short story writer, and poet of the American Renaissance period. Born in New York City, Melville gained initial fame with novels such as Typee and Omoo, but it was Moby-Dick, published in 1851, that would later be recognized as his masterpiece. Melville’s work is known for its complexity, symbolism, and exploration of themes such as man’s place in the universe, the nature of evil, and the quest for meaning. Despite facing financial difficulties and critical neglect during his lifetime, Melville’s reputation soared posthumously, and he is now considered one of the great American authors." 
], kind = INSERT} ] plugin_output = "fake" } } transform { Embedding { plugin_input = "fake" model_provider = QIANFAN model = bge_large_en api_key = xxxxxxxxxx secret_key = xxxxxxxxxx api_path = xxxxxxxxxx vectorization_fields { book_intro_vector = book_intro author_biography_vector = author_biography } plugin_output = "embedding_output" } } sink { Assert { plugin_input = "embedding_output" rules = { field_rules = [ { field_name = book_id field_type = int field_value = [ { rule_type = NOT_NULL } ] }, { field_name = book_name field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = book_intro field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = author_biography field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = book_intro_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] }, { field_name = author_biography_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] } ] } } } ``` ### 多模态 Embedding(火山引擎豆包) 多模态 Embedding 支持输入可访问 URL 或 二进制数据格式处理多模态数据 #### 可访问 URL ```hocon env { job.mode = "BATCH" } source { FakeSource { row.num = 5 schema = { fields { id = "int" product_name = "string" description = "string" product_image_url = "string" product_video_url = "string" thumbnail_image = "string" promotional_video = "string" category = "string" price = "decimal(10,2)" created_at = "timestamp" } } rows = [ { fields = [ 1, "iPhone 15 Pro", "Latest iPhone with advanced camera system and A17 Pro chip", "https://example.com/images/iphone15pro.jpg", "https://example.com/videos/iphone15pro_demo.mp4", "https://example.com/thumbnails/iphone15pro_thumb.png", "https://example.com/videos/iphone15pro_promo.mov", "Electronics", 999.99, "2024-01-15T10:30:00" ], kind = INSERT }, { fields = [ 2, "MacBook Air M3", "Ultra-thin laptop with M3 chip for incredible performance", "https://example.com/images/macbook_air_m3.jpeg", 
"https://example.com/videos/macbook_air_review.avi", "https://example.com/thumbnails/macbook_thumb.webp", "https://example.com/videos/macbook_commercial.mp4", "Computers", 1299.99, "2024-02-20T14:15:00" ], kind = INSERT } ] plugin_output = "fake" } } transform { Embedding { plugin_input = "fake" model_provider = DOUBAO model = "doubao-embedding-vision" api_key = "your-api-key" api_path = "https://ark.cn-beijing.volces.com/api/v3/embeddings/multimodal" single_vectorized_input_number = 1 vectorization_fields { # 文本字段 - 默认文本模态 description_vector = description # 显式指定图片模态 product_image_vector = { field = product_image_url modality = jpeg format = url } thumbnail_vector = { field = thumbnail_image format = url } # 视频字段 demo_video_vector = { field = product_video_url modality = mp4 format = url } promo_video_vector = { field = promotional_video # 如果值为 "promo.mov",自动检测为 MOV format = url } product_name_vector = product_name } plugin_output = "multimodal_embedding_output" } } sink { Assert { plugin_input = "multimodal_embedding_output" rules = { field_rules = [ { field_name = id field_type = int field_value = [ { rule_type = NOT_NULL } ] }, { field_name = description_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] }, { field_name = product_image_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] }, { field_name = thumbnail_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] }, { field_name = demo_video_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] } ] } } } ``` #### 二进制格式 ```hocon env { job.mode = "BATCH" } source { LocalFile { path = "/seatunnel/read/binary/" file_format_type = "binary" binary_complete_file_mode = false binary_chunk_size = 1024 plugin_output = "binary_source" } } transform { Embedding { plugin_input = "binary_source" model_provider = DOUBAO model = "doubao-embedding-vision-250615" api_key = "test-api-key" api_path = 
"http://mockserver:1080/api/v3/embeddings/multimodal" single_vectorized_input_number = 1 vectorization_fields = { image_embedding = { field = "data" modality = "jpeg" format = "binary" } } plugin_output = "binary_embedding_output" } } sink { Assert { plugin_input = "binary_embedding_output" rules = { row_rules = [ { rule_type = MAX_ROW rule_value = 1 } ], field_rules = [ { field_name = image_embedding field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] }, { field_name = relativePath field_type = string field_value = [ { rule_type = NOT_NULL } ] } ] } } } ``` ### Customize the embedding model ```hocon env { job.mode = "BATCH" } source { FakeSource { row.num = 5 schema = { fields { book_id = "int" book_name = "string" book_intro = "string" author_biography = "string" } } rows = [ {fields = [1, "To Kill a Mockingbird", "Set in the American South during the 1930s, To Kill a Mockingbird tells the story of young Scout Finch and her brother, Jem, who are growing up in a world of racial inequality and injustice. Their father, Atticus Finch, is a lawyer who defends a black man falsely accused of raping a white woman, teaching his children valuable lessons about morality, courage, and empathy.", "Harper Lee (1926–2016) was an American novelist best known for To Kill a Mockingbird, which won the Pulitzer Prize in 1961. Lee was born in Monroeville, Alabama, and the town served as inspiration for the fictional Maycomb in her novel. Despite the success of her book, Lee remained a private person and published only one other novel, Go Set a Watchman, which was written before To Kill a Mockingbird but released in 2015 as a sequel." ], kind = INSERT} {fields = [2, "1984", "1984 is a dystopian novel set in a totalitarian society governed by Big Brother. The story follows Winston Smith, a man who works for the Party rewriting history. Winston begins to question the Party’s control and seeks truth and freedom in a society where individuality is crushed. 
The novel explores themes of surveillance, propaganda, and the loss of personal autonomy.", "George Orwell (1903–1950) was the pen name of Eric Arthur Blair, an English novelist, essayist, journalist, and critic. Orwell is best known for his works 1984 and Animal Farm, both of which are critiques of totalitarian regimes. His writing is characterized by lucid prose, awareness of social injustice, opposition to totalitarianism, and support of democratic socialism. Orwell’s work remains influential, and his ideas have shaped contemporary discussions on politics and society." ], kind = INSERT} {fields = [3, "Pride and Prejudice", "Pride and Prejudice is a romantic novel that explores the complex relationships between different social classes in early 19th century England. The story centers on Elizabeth Bennet, a young woman with strong opinions, and Mr. Darcy, a wealthy but reserved gentleman. The novel deals with themes of love, marriage, and societal expectations, offering keen insights into human behavior.", "Jane Austen (1775–1817) was an English novelist known for her sharp social commentary and keen observations of the British landed gentry. Her works, including Sense and Sensibility, Emma, and Pride and Prejudice, are celebrated for their wit, realism, and biting critique of the social class structure of her time. Despite her relatively modest life, Austen’s novels have gained immense popularity, and she is considered one of the greatest novelists in the English language." ], kind = INSERT} {fields = [4, "The Great GatsbyThe Great Gatsby", "The Great Gatsby is a novel about the American Dream and the disillusionment that can come with it. Set in the 1920s, the story follows Nick Carraway as he becomes entangled in the lives of his mysterious neighbor, Jay Gatsby, and the wealthy elite of Long Island. Gatsby's obsession with the beautiful Daisy Buchanan drives the narrative, exploring themes of wealth, love, and the decay of the American Dream.", "F. 
Scott Fitzgerald (1896–1940) was an American novelist and short story writer, widely regarded as one of the greatest American writers of the 20th century. Born in St. Paul, Minnesota, Fitzgerald is best known for his novel The Great Gatsby, which is often considered the quintessential work of the Jazz Age. His works often explore themes of youth, wealth, and the American Dream, reflecting the turbulence and excesses of the 1920s." ], kind = INSERT} {fields = [5, "Moby-Dick", "Moby-Dick is an epic tale of obsession and revenge. The novel follows the journey of Captain Ahab, who is on a relentless quest to kill the white whale, Moby Dick, that once maimed him. Narrated by Ishmael, a sailor aboard Ahab’s ship, the story delves into themes of fate, humanity, and the struggle between man and nature. The novel is also rich with symbolism and philosophical musings.", "Herman Melville (1819–1891) was an American novelist, short story writer, and poet of the American Renaissance period. Born in New York City, Melville gained initial fame with novels such as Typee and Omoo, but it was Moby-Dick, published in 1851, that would later be recognized as his masterpiece. Melville’s work is known for its complexity, symbolism, and exploration of themes such as man’s place in the universe, the nature of evil, and the quest for meaning. Despite facing financial difficulties and critical neglect during his lifetime, Melville’s reputation soared posthumously, and he is now considered one of the great American authors." 
], kind = INSERT} ] plugin_output = "fake" } } transform { Embedding { plugin_input = "fake" model_provider = CUSTOM model = text-embedding-3-small api_key = xxxxxxxx api_path = "http://mockserver:1080/v1/doubao/embedding" single_vectorized_input_number = 2 vectorization_fields { book_intro_vector = book_intro author_biography_vector = author_biography } custom_config={ custom_response_parse = "$.data[*].embedding" custom_request_headers = { "Content-Type"= "application/json" "Authorization"= "Bearer xxxxxxx" } custom_request_body ={ modelx = "${model}" inputx = ["${input}"] } } plugin_output = "embedding_output_1" } } sink { Assert { plugin_input = "embedding_output_1" rules = { field_rules = [ { field_name = book_id field_type = int field_value = [ { rule_type = NOT_NULL } ] }, { field_name = book_name field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = book_intro field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = author_biography field_type = string field_value = [ { rule_type = NOT_NULL } ] }, { field_name = book_intro_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] }, { field_name = author_biography_vector field_type = float_vector field_value = [ { rule_type = NOT_NULL } ] } ] } } } ``` ================================================ FILE: docs/zh/transforms/encrypt.md ================================================ # Encrypt > 加密 Transform 插件 ## 描述 Encrypt Transform 插件用于使用对称加密算法,对记录中指定的字段进行加密或解密。 ## 参数说明 | 参数名 | 类型 | 是否必填 | 默认值 | 描述 | |-------------|--------|------|-----------|----------------------------| | `fields` | Array | 是 | - | 需要加密或解密的字段列表 | | `algorithm` | String | 否 | `AES_CBC` | 加密算法 | | `key` | String | 是 | - | Base64 编码的加密密钥 | | `mode` | String | 否 | `ENCRYPT` | 操作模式:`ENCRYPT` 或 `DECRYPT` | ### algorithm [string] 用于指定该 transform 所使用的加密算法。 支持的值: - `AES_GCM`:默认值。采用 GCM 模式并包含认证标签(Authentication Tag)的 AES 加密。 - `AES_CBC`:采用 CBC 模式及 PKCS5 填充(Padding)的 AES 加密。
`AES_GCM` 提供认证加密(Authenticated Encryption),安全性更高,推荐使用。 如果未明确指定,系统将默认使用 `AES_GCM`。 ### key [string] 加密密钥必须以 Base64 编码格式提供。 请确保密钥长度符合所选加密算法的要求。 对于 `AES_GCM` 和 `AES_CBC`,支持的密钥长度为 16、24 或 32 字节 (分别对应 AES-128、AES-192 和 AES-256)。 **示例** - `base64:AAAAAAAAAAAAAAAAAAAAAA==` - `AAAAAAAAAAAAAAAAAAAAAA==` ### common options [string] Transform 插件的通用参数,请参考 [Transform Plugin](common-options.md)。 ## 示例 ### 字段加密 ```hocon transform { FieldEncrypt { fields = ["name"] key = "base64:AAAAAAAAAAAAAAAAAAAAAA==" algorithm = "AES_CBC" mode = "encrypt" } } ``` ### 字段解密 ```hocon transform { FieldEncrypt { fields = ["name"] key = "base64:AAAAAAAAAAAAAAAAAAAAAA==" algorithm = "AES_CBC" mode = "decrypt" } } ``` ================================================ FILE: docs/zh/transforms/field-mapper.md ================================================ # 字段映射 > 字段映射转换插件 ## 描述 添加输入模式和输出模式映射 ## 属性 | 名称 | 类型 | 是否必须 | 默认值 | |--------------|--------|------|-----| | field_mapper | Object | yes | | ### field_mapper [config] 指定输入和输出之间的字段映射关系 ### common options [config] 转换插件的常见参数, 请参考 [Transform Plugin](common-options/common-options.md) 了解详情 ## 示例 源端数据读取的表格如下: | id | name | age | card | |----|----------|-----|------| | 1 | Joy Ding | 20 | 123 | | 2 | May Ding | 20 | 123 | | 3 | Kin Dom | 20 | 123 | | 4 | Joy Dom | 20 | 123 | 我们想要删除 `age` 字段,并更新字段顺序为 `id`、`card`、`name`,同时将 `name` 重命名为 `new_name`。我们可以像这样添加 `FieldMapper` 转换: ``` transform { FieldMapper { plugin_input = "fake" plugin_output = "fake1" field_mapper = { id = id card = card name = new_name } } } ``` 那么结果表 `fake1` 中的数据将会像这样: | id | card | new_name | |----|------|----------| | 1 | 123 | Joy Ding | | 2 | 123 | May Ding | | 3 | 123 | Kin Dom | | 4 | 123 | Joy Dom | ## 更新日志 ### 新版本 - 添加复制转换连接器 ================================================ FILE: docs/zh/transforms/field-rename.md ================================================ # 字段重命名 > FieldRename 转换插件 ## 描述 FieldRename 用于批量重命名字段名。 ## 选项 | 参数 | 类型 | 必选 | 默认值 | 说明 | 
|:-----------------------:|--------|------|--------|---------------------------------------------------------------------------------------------------------| | convert_case | string | 否 | | 字母大小写转换类型,可选 `UPPER`、`LOWER` | | prefix | string | 否 | | 追加到字段名前的前缀 | | suffix | string | 否 | | 追加到字段名后的后缀 | | replacements_with_regex | array | 否 | | 替换规则数组,元素为包含 `replace_from`、`replace_to` 以及可选 `is_regex`(默认 `true`)的映射;当 `is_regex=false` 时,`replace_from` 按字段名精确匹配(全匹配) | | specific | array | 否 | | 指定字段重命名规则,元素为包含 `field_name` 和 `target_name` 的映射;命中后会直接重命名并跳过其他规则 | ## 示例 ### 将字段名转为大写 ``` env { parallelism = 1 job.mode = "STREAMING" } source { MySQL-CDC { plugin_output = "customers_mysql_cdc" username = "root" password = "123456" table-names = ["source.user_shop", "source.user_order"] url = "jdbc:mysql://localhost:3306/source" } } transform { FieldRename { plugin_input = "customers_mysql_cdc" plugin_output = "trans_result" convert_case = "UPPER" prefix = "F_" suffix = "_S" replacements_with_regex = [ { replace_from = "create_time" replace_to = "SOURCE_CREATE_TIME" } ] } } sink { Jdbc { plugin_input = "trans_result" driver="oracle.jdbc.OracleDriver" url="jdbc:oracle:thin:@oracle-host:1521/ORCLCDB" user="myuser" password="mypwd" generate_sink_sql = true database = "ORCLCDB" table = "${database_name}.${table_name}" primary_keys = ["${primary_key}"] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ### 指定字段重命名 ``` transform { FieldRename { plugin_input = "input" plugin_output = "output" specific = [ { field_name = "InvoiceNum", target_name = "invoice_num" } ] } } ``` ### 将字段名转为小写 ``` env { parallelism = 1 job.mode = "STREAMING" } source { Oracle-CDC { plugin_output = "customers_oracle_cdc" url = "jdbc:oracle:thin:@localhost:1521/ORCLCDB" username = "dbzuser" password = "dbz" database-names = ["ORCLCDB"] schema-names = ["DEBEZIUM"] table-names = ["SOURCE.USER_SHOP", "SOURCE.USER_ORDER"] } } transform { FieldRename { plugin_input = 
"customers_oracle_cdc" plugin_output = "trans_result" convert_case = "LOWER" prefix = "f_" suffix = "_s" replacements_with_regex = [ { replace_from = "CREATE_TIME" replace_to = "source_create_time" } ] } } sink { Jdbc { plugin_input = "trans_result" url = "jdbc:mysql://localhost:3306/test" driver = "com.mysql.cj.jdbc.Driver" user = "st_user_sink" password = "mysqlpw" generate_sink_sql = true database = "${schema_name}" table = "${table_name}" primary_keys = ["${primary_key}"] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ================================================ FILE: docs/zh/transforms/filter-rowkind.md ================================================ # 行类型过滤 > 行类型转换插件 ## 描述 按行类型过滤数据 ## 操作 | 名称 | 类型 | 是否必须 | 默认值 | |---------------|-------|------|-----| | include_kinds | array | yes | | | exclude_kinds | array | yes | | ### include_kinds [array] 要包含的行类型 ### exclude_kinds [array] 要排除的行类型。 您只能配置 `include_kinds` 和 `exclude_kinds` 中的一个。 ### common options [string] 转换插件的常见参数, 请参考 [Transform Plugin](common-options/common-options.md) 了解详情 ## 示例 FakeSource 生成的数据的行类型是 `INSERT`。如果我们使用 `FilterRowKink` 转换并排除 `INSERT` 数据,我们将不会向接收器写入任何行。 ```yaml env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { id = "int" name = "string" age = "int" } } } } transform { FilterRowKind { plugin_input = "fake" plugin_output = "fake1" exclude_kinds = ["INSERT"] } } sink { Console { plugin_input = "fake1" } } ``` ================================================ FILE: docs/zh/transforms/filter.md ================================================ # 过滤器 > 过滤器转换插件 ## 描述 过滤字段 ## 属性 | 名称 | 类型 | 是否必须 | 默认值 | |----------------|-------|------|-----| | include_fields | array | no | | | exclude_fields | array | no | | ### include_fields [array] 需要保留的字段列表。不在列表中的字段将被删除。 ### exclude_fields [array] 需要删除的字段列表。不在列表中的字段将被保留。 注意,`include_fields` 和 `exclude_fields` 两个属性中,必须设置一个且只能设置一个 ### common options [string] 
转换插件的常见参数, 请参考 [Transform Plugin](common-options/common-options.md) 了解详情 ## 示例 源端数据读取的表格如下: | name | age | card | |----------|-----|------| | Joy Ding | 20 | 123 | | May Ding | 20 | 123 | | Kin Dom | 20 | 123 | | Joy Dom | 20 | 123 | 我们想要保留字段 `name`, `card`,我们可以像这样添加 `Filter` 转换: ``` transform { Filter { plugin_input = "fake" plugin_output = "fake1" include_fields = [name, card] } } ``` 我们也可以通过删除字段 `age` 来实现, 我们可以添加一个 `Filter` 转换,并设置exclude_fields: ``` transform { Filter { plugin_input = "fake" plugin_output = "fake1" exclude_fields = [age] } } ``` 那么结果表 `fake1` 中的数据将会像这样: | name | card | |----------|------| | Joy Ding | 123 | | May Ding | 123 | | Kin Dom | 123 | | Joy Dom | 123 | ## 更新日志 ### 新版本 - 添加过滤转换连接器 ================================================ FILE: docs/zh/transforms/jsonpath.md ================================================ # JsonPath > JSONPath 转换插件 ## 描述 > 支持使用 JSONPath 选择数据 ## 属性 | 名称 | 类型 | 是否必须 | 默认值 | |----------------------|-------|------|------| | columns | Array | Yes | | | row_error_handle_way | Enum | No | FAIL | ### common options [string] 转换插件的常见参数, 请参考 [Transform Plugin](common-options/common-options.md) 了解详情 ### row_error_handle_way [Enum] 该选项用于指定当该行发生错误时的处理方式,默认值为 `FAIL`。 - FAIL:选择`FAIL`时,数据格式错误会阻塞并抛出异常。 - SKIP:选择`SKIP`时,数据格式错误会跳过该行数据。 ### columns [array] #### 属性 | 名称 | 类型 | 是否必须 | 默认值 | |-------------------------|--------|------|--------| | src_field | String | Yes | | | dest_field | String | Yes | | | path | String | Yes | | | dest_type | String | No | String | | column_error_handle_way | Enum | No | | #### src_field > 要解析的 JSON 源字段 支持的Seatunnel数据类型 * STRING * BYTES * ARRAY * MAP * ROW #### dest_field > 使用 JSONPath 后的输出字段 #### dest_type > 目标字段的类型 #### path > Jsonpath #### column_error_handle_way [Enum] 该选项用于指定当列发生错误时的处理方式。 - FAIL:选择`FAIL`时,数据格式错误会阻塞并抛出异常。 - SKIP:选择`SKIP`时,数据格式错误会跳过此列数据。 - SKIP_ROW:选择`SKIP_ROW`时,数据格式错误会跳过此行数据。 ## 读取 JSON 示例 从源读取的数据是像这样的 JSON ```json { "data": { "c_string": "this is a string", "c_boolean": true,
"c_integer": 42, "c_float": 3.14, "c_double": 3.14, "c_decimal": 10.55, "c_date": "2023-10-29", "c_datetime": "16:12:43.459", "c_array":["item1", "item2", "item3"], "c_map_array": [{"c_string_1":"c_string_1","c_string_2":"c_string_2","c_string_3":"c_string_3"},{"c_string_1":"c_string_1","c_string_2":"c_string_2","c_string_3":"c_string_3"}] } } ``` 假设我们想要使用 JsonPath 提取属性。 ```json transform { JsonPath { plugin_input = "fake" plugin_output = "fake1" columns = [ { "src_field" = "data" "path" = "$.data.c_string" "dest_field" = "c1_string" }, { "src_field" = "data" "path" = "$.data.c_boolean" "dest_field" = "c1_boolean" "dest_type" = "boolean" }, { "src_field" = "data" "path" = "$.data.c_integer" "dest_field" = "c1_integer" "dest_type" = "int" }, { "src_field" = "data" "path" = "$.data.c_float" "dest_field" = "c1_float" "dest_type" = "float" }, { "src_field" = "data" "path" = "$.data.c_double" "dest_field" = "c1_double" "dest_type" = "double" }, { "src_field" = "data" "path" = "$.data.c_decimal" "dest_field" = "c1_decimal" "dest_type" = "decimal(4,2)" }, { "src_field" = "data" "path" = "$.data.c_date" "dest_field" = "c1_date" "dest_type" = "date" }, { "src_field" = "data" "path" = "$.data.c_datetime" "dest_field" = "c1_datetime" "dest_type" = "time" }, { "src_field" = "data" "path" = "$.data.c_array" "dest_field" = "c1_array" "dest_type" = "array<string>" }, { "src_field" = "data" "path" = "$.data.c_map_array" "dest_field" = "c1_map_array" "dest_type" = "array<map<string, string>>" } ] } } ``` 使用批量字段提取功能可以用更简洁的数组格式配置实现相同的结果: ```hocon transform { JsonPath { plugin_input = "fake" plugin_output = "fake1" columns = [ { "src_field" = "data" "path" = ["$.data.c_string", "$.data.c_boolean", "$.data.c_integer", "$.data.c_float", "$.data.c_double", "$.data.c_decimal", "$.data.c_date", "$.data.c_datetime", "$.data.c_array", "$.data.c_map_array"] "dest_field" = ["c1_string", "c1_boolean", "c1_integer", "c1_float", "c1_double", "c1_decimal", "c1_date", "c1_datetime", "c1_array", "c1_map_array"] "dest_type" =
["string", "boolean", "int", "float", "double", "decimal(4,2)", "date", "time", "array<string>", "array<map<string, string>>"] } ] } } ``` **重要提示:** 当使用批量字段提取(多个 paths、dest_fields 和 dest_types)时,`dest_type` 参数是必填的,不能省略。每个提取的字段都必须指定一个对应的类型。数组格式提供了更好的可读性,比基于字符串的配置更不容易出错。 那么数据结果表 `fake1` 将会像这样 | data | c1_string | c1_boolean | c1_integer | c1_float | c1_double | c1_decimal | c1_date | c1_datetime | c1_array | |------------------------------|------------------|------------|------------|----------|-----------|------------|------------|--------------|-----------------------------| | too much content not to show | this is a string | true | 42 | 3.14 | 3.14 | 10.55 | 2023-10-29 | 16:12:43.459 | ["item1", "item2", "item3"] | ## 读取 SeatunnelRow 示例 假设数据行中的一列的类型是 SeatunnelRow,列的名称为 col
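<table>
    <tr><td colspan="2">SeatunnelRow(col)</td><td>other</td></tr>
    <tr><td>name</td><td>age</td><td>....</td></tr>
    <tr><td>a</td><td>18</td><td>....</td></tr>
</table>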
    JsonPath 转换将 seatunnel 的值转换为一个数组。 ```hocon transform { JsonPath { plugin_input = "fake" plugin_output = "fake1" row_error_handle_way = FAIL columns = [ { "src_field" = "col" "path" = "$[0]" "dest_field" = "name" "dest_type" = "string" }, { "src_field" = "col" "path" = "$[1]" "dest_field" = "age" "dest_type" = "int" } ] } } ``` 那么数据结果表 `fake1` 将会像这样: | name | age | col | other | |------|-----|----------|-------| | a | 18 | ["a",18] | ... | ## 配置异常数据处理策略 您可以配置 `row_error_handle_way` 与 `column_error_handle_way` 来处理异常数据,两者都是非必填项。 `row_error_handle_way` 配置对行数据内所有数据异常进行处理,`column_error_handle_way` 配置对某列数据异常进行处理,优先级高于 `row_error_handle_way`。 ### 跳过异常数据行 配置跳过任意列有异常的整行数据 ```hocon transform { JsonPath { row_error_handle_way = SKIP columns = [ { "src_field" = "json_data" "path" = "$.f1" "dest_field" = "json_data_f1" }, { "src_field" = "json_data" "path" = "$.f2" "dest_field" = "json_data_f2" } ] } } ``` ### 跳过部分异常数据列 配置仅对 `json_data_f1` 列数据异常跳过,填充空值,其他列数据异常继续抛出异常中断处理程序 ```hocon transform { JsonPath { row_error_handle_way = FAIL columns = [ { "src_field" = "json_data" "path" = "$.f1" "dest_field" = "json_data_f1" "column_error_handle_way" = "SKIP" }, { "src_field" = "json_data" "path" = "$.f2" "dest_field" = "json_data_f2" } ] } } ``` ### 部分列异常跳过整行 配置仅对 `json_data_f1` 列数据异常跳过整行数据,其他列数据异常继续抛出异常中断处理程序 ```hocon transform { JsonPath { row_error_handle_way = FAIL columns = [ { "src_field" = "json_data" "path" = "$.f1" "dest_field" = "json_data_f1" "column_error_handle_way" = "SKIP_ROW" }, { "src_field" = "json_data" "path" = "$.f2" "dest_field" = "json_data_f2" } ] } } ``` ## 更新日志 * 添加 JsonPath 转换 ================================================ FILE: docs/zh/transforms/llm.md ================================================ # LLM > LLM 转换插件 ## 描述 利用大型语言模型 (LLM) 的强大功能来处理数据,方法是将数据发送到 LLM 并接收生成的结果。利用 LLM 的功能来标记、清理、丰富数据、执行数据推理等。 ## 属性 | 名称 | 类型 | 是否必须 | 默认值 | |------------------------| ------ | -------- |-------------| | model_provider | enum | yes | | | output_data_type | enum | 
no | String | | output_column_name | string | no | llm_output | | prompt | string | yes | | | inference_columns | list | no | | | model | string | yes | | | api_key | string | yes | | | api_path | string | no | | | custom_config | map | no | | | custom_response_parse | string | no | | | custom_request_headers | map | no | | | custom_request_body | map | no | | ### model_provider 要使用的模型提供者。可用选项为: OPENAI,DOUBAO,DEEPSEEK,KIMIAI,MICROSOFT, ZHIPU, CUSTOM > tips: 如果使用 Microsoft, 请确保 api_path 配置不能为空 ### output_data_type 输出数据的数据类型。可用选项为: STRING,INT,BIGINT,DOUBLE,BOOLEAN. 默认值为 STRING。 ### output_column_name 自定义输出数据字段名称。自定义字段名称与现有字段名称相同时,将替换为`llm_output`。 ### prompt 发送到 LLM 的提示。此参数定义 LLM 将如何处理和返回数据,例如: 从源读取的数据是这样的表格: | name | age | |---------------|-----| | Jia Fan | 20 | | Hailin Wang | 20 | | Eric | 20 | | Guangdong Liu | 20 | 我们可以使用以下提示: ``` Determine whether someone is Chinese or American by their name ``` 这将返回: | name | age | llm_output | |---------------|-----|------------| | Jia Fan | 20 | Chinese | | Hailin Wang | 20 | Chinese | | Eric | 20 | American | | Guangdong Liu | 20 | Chinese | ### inference_columns `inference_columns`选项允许您指定应该将输入数据中的哪些列用作LLM的输入。默认情况下,所有列都将用作输入。 For example: ```hocon transform { LLM { model_provider = OPENAI model = gpt-4o-mini api_key = sk-xxx inference_columns = ["name", "age"] prompt = "Determine whether someone is Chinese or American by their name" } } ``` ### model 要使用的模型。不同的模型提供者有不同的模型。例如,OpenAI 模型可以是 `gpt-4o-mini`。 如果使用 OpenAI 模型,请参考 https://platform.openai.com/docs/models/model-endpoint-compatibility 文档的`/v1/chat/completions` 端点。 ### api_key 用于模型提供者的 API 密钥。 如果使用 OpenAI 模型,请参考 https://platform.openai.com/docs/api-reference/api-keys 文档的如何获取 API 密钥。 ### api_path 用于模型提供者的 API 路径。在大多数情况下,您不需要更改此配置。如果使用 API 代理的服务,您可能需要将其配置为代理的 API 地址。 ### custom_config `custom_config` 选项允许您为模型提供额外的自定义配置。这是一个 Map,您可以在其中定义特定模型可能需要的各种设置。 ### custom_response_parse `custom_response_parse` 选项允许您指定如何解析模型的响应。您可以使用 JsonPath 从响应中提取所需的特定数据。例如,使用 
`$.choices[*].message.content` 提取如下json中的 `content` 字段值。JsonPath 的使用请参考 [JsonPath 快速入门](https://github.com/json-path/JsonPath?tab=readme-ov-file#getting-started) ```json { "id": "chatcmpl-9s4hoBNGV0d9Mudkhvgzg64DAWPnx", "object": "chat.completion", "created": 1722674828, "model": "gpt-4o-mini", "choices": [ { "index": 0, "message": { "role": "assistant", "content": "[\"Chinese\"]" }, "logprobs": null, "finish_reason": "stop" } ], "usage": { "prompt_tokens": 107, "completion_tokens": 3, "total_tokens": 110 }, "system_fingerprint": "fp_0f03d4f0ee", "code": 0, "msg": "ok" } ``` ### custom_request_headers `custom_request_headers` 选项允许您定义应包含在发送到模型 API 的请求中的自定义头信息。如果 API 需要标准头信息之外的额外头信息,例如授权令牌、内容类型等,这个选项会非常有用。 ### custom_request_body `custom_request_body` 选项支持占位符: - `${model}`:用于模型名称的占位符。 - `${input}`:用于确定输入值的占位符,同时根据 body value 的类型定义请求体请求类型。例如:`"${input}"` -> "input"。 - `${prompt}`:用于 LLM 模型提示的占位符。 ### common options [string] 转换插件的常见参数, 请参考 [Transform Plugin](common-options/common-options.md) 了解详情 ## tips 大模型API接口通常会有速率限制,可以配合Seatunnel的限速配置,以确保任务顺利运行。 Seatunnel限速配置,请参考[speed-limit](../introduction/concepts/speed-limit.md)了解详情 ## 示例 OPENAI 通过 LLM 确定用户所在的国家。 ```hocon env { parallelism = 1 job.mode = "BATCH" read_limit.rows_per_second = 10 } source { FakeSource { row.num = 5 schema = { fields { id = "int" name = "string" } } rows = [ {fields = [1, "Jia Fan"], kind = INSERT} {fields = [2, "Hailin Wang"], kind = INSERT} {fields = [3, "Tomas"], kind = INSERT} {fields = [4, "Eric"], kind = INSERT} {fields = [5, "Guangdong Liu"], kind = INSERT} ] } } transform { LLM { model_provider = OPENAI model = gpt-4o-mini api_key = sk-xxx prompt = "Determine whether someone is Chinese or American by their name" } } sink { console { } } ``` ## 示例 KIMIAI 通过 LLM 判断人名是否中国历史上的帝王 ```hocon env { parallelism = 1 job.mode = "BATCH" read_limit.rows_per_second = 10 } source { FakeSource { row.num = 5 schema = { fields { id = "int" name = "string" } } rows = [ {fields = [1, "诸葛亮"], kind = INSERT}
{fields = [2, "李世民"], kind = INSERT} {fields = [3, "孙悟空"], kind = INSERT} {fields = [4, "朱元璋"], kind = INSERT} {fields = [5, "乔治·华盛顿"], kind = INSERT} ] } } transform { LLM { model_provider = KIMIAI model = moonshot-v1-8k api_key = sk-xxx prompt = "判断是否是中国历史上的帝王" output_data_type = boolean } } sink { console { } } ``` ### Customize the LLM model ```hocon env { job.mode = "BATCH" } source { FakeSource { row.num = 5 schema = { fields { id = "int" name = "string" } } rows = [ {fields = [1, "Jia Fan"], kind = INSERT} {fields = [2, "Hailin Wang"], kind = INSERT} {fields = [3, "Tomas"], kind = INSERT} {fields = [4, "Eric"], kind = INSERT} {fields = [5, "Guangdong Liu"], kind = INSERT} ] plugin_output = "fake" } } transform { LLM { plugin_input = "fake" model_provider = CUSTOM model = gpt-4o-mini api_key = sk-xxx prompt = "Determine whether someone is Chinese or American by their name" openai.api_path = "http://mockserver:1080/v1/chat/completions" custom_config={ custom_response_parse = "$.choices[*].message.content" custom_request_headers = { Content-Type = "application/json" Authorization = "Bearer xxxxxxxx" } custom_request_body ={ model = "${model}" messages = [ { role = "system" content = "${prompt}" }, { role = "user" content = "${input}" }] } } plugin_output = "llm_output" } } sink { Assert { plugin_input = "llm_output" rules = { field_rules = [ { field_name = llm_output field_type = string field_value = [ { rule_type = NOT_NULL } ] } ] } } } ``` ================================================ FILE: docs/zh/transforms/metadata.md ================================================ # Metadata > Metadata 转换插件 ## 描述 Metadata 转换插件用于将数据行中的元数据信息提取并转换为普通字段,方便后续处理和分析。 **核心功能:** - 将元数据(如数据库名、表名、行类型等)提取为可见字段 - 支持自定义输出字段名称 - 不改变原有数据字段,只是新增元数据字段 **典型应用场景:** - CDC 数据同步时需要记录数据来源(库名、表名) - 需要追踪数据变更类型(INSERT、UPDATE、DELETE) - 需要记录数据的事件时间和延迟信息 - 多表合并时需要标识数据来源 ## 支持的元数据字段 | 元数据Key | 输出类型 | 说明 | 数据来源 | |:---------:|:--------:|:-----------------------------:|:----:| | Database | string | 
数据所属的数据库名称 | 所有连接器 | | Table | string | 数据所属的表名称 | 所有连接器 | | RowKind | string | 行的变更类型,值为:+I(插入)、-U(更新前)、+U(更新后)、-D(删除) | 所有连接器 | | EventTime | long | 数据变更的事件时间戳(毫秒) | CDC 连接器;Kafka 源(ConsumerRecord.timestamp) | | Delay | long | 数据采集延迟时间(毫秒),即数据抽取时间与数据库变更时间的差值 | CDC 连接器 | | Partition | string | 数据所属的分区信息,多个分区字段使用逗号分隔 | 支持分区的连接器 | ### 重要说明 1. **元数据字段区分大小写**:配置时必须严格按照上表中的 Key 名称(如 `Database`、`Table`、`RowKind` 等)。 2. **时间相关字段**:`Delay` 仅在 CDC 连接器有效(TiDB-CDC 除外);`EventTime` 由 CDC 连接器写入,也会在 Kafka 源中使用 `ConsumerRecord.timestamp`(毫秒,非负时)写入。 3. **Kafka 事件时间**:Kafka 源会在 `ConsumerRecord.timestamp` 非负时写入 `EventTime`,可通过 Metadata 转换将其暴露为普通字段。 ## 配置选项 | 参数名 | 类型 | 是否必填 | 默认值 | 说明 | |:---------------:|------|:--------:|:-------------:|-------------------| | metadata_fields | map | 否 | 空映射 | 元数据字段与输出字段的映射关系,格式为 `元数据Key = 输出字段名` | ### metadata_fields [map] 定义元数据字段到输出字段的映射关系。 **配置格式:** ```hocon metadata_fields { <元数据Key> = <输出字段名> <元数据Key> = <输出字段名> ... } ``` **配置示例:** ```hocon metadata_fields { Database = source_db # 将数据库名映射到 source_db 字段 Table = source_table # 将表名映射到 source_table 字段 RowKind = op_type # 将行类型映射到 op_type 字段 EventTime = event_ts # 将事件时间映射到 event_ts 字段 Delay = sync_delay # 将延迟时间映射到 sync_delay 字段 Partition = partition_info # 将分区信息映射到 partition_info 字段 } ``` **注意事项:** - 左侧必须是支持的元数据 Key(见上表),且严格区分大小写 - 右侧是自定义的输出字段名,不能与原有字段重名 - 可以只选择需要的元数据字段,不必全部配置 ## 完整示例 ### 示例 1:MySQL CDC 数据同步,提取所有元数据 从 MySQL 数据库同步数据,并提取所有可用的元数据信息。 ```yaml env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { MySQL-CDC { plugin_output = "mysql_cdc_source" server-id = 5652 username = "root" password = "your_password" table-names = ["mydb.users"] url = "jdbc:mysql://localhost:3306/mydb" } } transform { Metadata { plugin_input = "mysql_cdc_source" plugin_output = "metadata_added" metadata_fields { Database = source_database # 提取数据库名 Table = source_table # 提取表名 RowKind = change_type # 提取变更类型 EventTime = event_timestamp # 提取事件时间 Delay = sync_delay_ms # 提取同步延迟 } } } sink { 
Console { plugin_input = "metadata_added" } } ``` **输入数据示例:** ``` 原始数据行(来自 mydb.users 表): id=1, name="张三", age=25 RowKind: +I (INSERT) ``` **输出数据示例:** ``` 转换后的数据行: id=1, name="张三", age=25, source_database="mydb", source_table="users", change_type="+I", event_timestamp=1699000000000, sync_delay_ms=100 ``` --- ### 示例 2:只提取部分元数据 只提取数据来源信息(库名和表名),用于多表合并场景。 ```yaml env { parallelism = 1 job.mode = "STREAMING" } source { MySQL-CDC { plugin_output = "multi_table_source" server-id = 5652 username = "root" password = "your_password" table-names = ["db1.orders", "db2.orders"] url = "jdbc:mysql://localhost:3306" } } transform { Metadata { plugin_input = "multi_table_source" plugin_output = "with_source_info" metadata_fields { Database = db_name Table = table_name } } } sink { Jdbc { plugin_input = "with_source_info" url = "jdbc:mysql://localhost:3306/target_db" table = "merged_orders" # 目标表会包含 db_name 和 table_name 字段,用于标识数据来源 } } ``` ### 示例 3:Kafka 写入时间用于分区 将 Kafka `ConsumerRecord.timestamp`(写入到 `EventTime` 元数据)暴露为普通字段,再生成分区字段并写入 Hive,适合回放或补数场景。 ```hocon env { execution.parallelism = 4 job.mode = "STREAMING" checkpoint.interval = 60000 } source { Kafka { plugin_output = "kafka_raw" schema = { fields { id = bigint customer_type = string data = string } } format = text field_delimiter = "|" topic = "push_report_event" bootstrap.servers = "kafka-broker-1:9092,kafka-broker-2:9092" consumer.group = "seatunnel_event_backfill" kafka.config = { max.poll.records = 100 auto.offset.reset = "earliest" enable.auto.commit = "false" } } } transform { Metadata { plugin_input = "kafka_raw" plugin_output = "kafka_with_meta" metadata_fields = { EventTime = "kafka_ts" } } Sql { plugin_input = "kafka_with_meta" plugin_output = "source_table" query = "select id, customer_type, data, FROM_UNIXTIME(kafka_ts/1000, 'yyyy-MM-dd', 'Asia/Shanghai') as pt from kafka_with_meta where kafka_ts >= 0" } } sink { Hive { table_name = "example_db.ods_sys_event_report" metastore_uri = 
"thrift://metastore-1:9083,thrift://metastore-2:9083" hdfs_site_path = "/path/to/hdfs-site.xml" hive_site_path = "/path/to/hive-site.xml" krb5_path = "/path/to/krb5.conf" kerberos_principal = "hive/metastore-1@EXAMPLE.COM" kerberos_keytab_path = "/path/to/hive.keytab" overwrite = false plugin_input = "source_table" # compress_codec = "SNAPPY" } } ``` 上面的 `pt` 字段由 Kafka 事件时间转换而来,可在 Hive 中作为分区列使用,便于补数和校准分区。 ================================================ FILE: docs/zh/transforms/regexextract.md ================================================ # 正则提取 > 正则提取转换插件 ## 描述 `RegexExtract` 转换插件使用正则表达式从指定字段中提取数据,并将提取的值输出到新字段中。它支持正则表达式中的捕获组,并允许在模式不匹配时为每个输出字段设置默认值。 ## 属性 | 名称 | 类型 | 是否必须 | 默认值 | |-----------------|----------|----------|-------| | source_field | string | yes | | | regex_pattern | string | yes | | | output_fields | array | yes | | | default_values | array | no | | ### source_field [string] 要提取数据的源字段名称。 ### regex_pattern [string] 带有捕获组的正则表达式模式。捕获组的数量必须与输出字段的数量匹配。 ### output_fields [array] 提取值的输出字段名称。大小必须与正则表达式模式中的捕获组数量匹配。 ### default_values [array] 当正则表达式模式不匹配或源字段为 null 时,输出字段的默认值。如果提供,大小必须与输出字段数量匹配。 ## 示例 源端数据读取的表格如下: | id | email | log_entry | |----|--------------------|------------------------------------------------------| | 1 | user1@example.com | 2023-12-01 10:30:45 INFO User login successful | | 2 | admin@test.org | 2023-12-01 11:15:22 ERROR Database connection failed | | 3 | guest@domain.net | 2023-12-01 12:00:00 WARN Memory usage high | 我们想要从 `email` 字段中提取用户名、域名和顶级域名: ``` transform { RegexExtract { plugin_input = "fake" plugin_output = "regex_result" source_field = "email" regex_pattern = "([^@]+)@([^.]+)\\.(.+)" output_fields = ["username", "domain", "tld"] default_values = ["unknown", "unknown", "unknown"] } } ``` 那么结果表 `regex_result` 中的数据将会更新为: | id | email | log_entry | username | domain | tld | |----|--------------------|------------------------------------------------------|----------|---------|-----| | 1 | user1@example.com | 2023-12-01 10:30:45 
INFO User login successful | user1 | example | com | | 2 | admin@test.org | 2023-12-01 11:15:22 ERROR Database connection failed | admin | test | org | | 3 | guest@domain.net | 2023-12-01 12:00:00 WARN Memory usage high | guest | domain | net | ## 作业配置示例 ``` env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { id = "int" email = "string" log_entry = "string" } } rows = [ { kind = INSERT, fields = [1, "user1@example.com", "2023-12-01 10:30:45 INFO User login successful"] }, { kind = INSERT, fields = [2, "admin@test.org", "2023-12-01 11:15:22 ERROR Database connection failed"] }, { kind = INSERT, fields = [3, "guest@domain.net", "2023-12-01 12:00:00 WARN Memory usage high"] } ] } } transform { RegexExtract { plugin_input = "fake" plugin_output = "regex_result" source_field = "email" regex_pattern = "([^@]+)@([^.]+)\\.(.+)" output_fields = ["username", "domain", "tld"] default_values = ["unknown", "unknown", "unknown"] } } sink { Console { plugin_input = "regex_result" } } ``` ## 更新日志 ================================================ FILE: docs/zh/transforms/replace.md ================================================ # 替换 > 替换转换插件 ## 描述 检查给定字段中的字符串值,并用给定的替换项替换与给定字符串字面量或正则表达式匹配的字符串值的子字符串。 ## 属性 | 名称 | 类型 | 是否必须 | 默认值 | |---------------|---------|------|-------| | replace_field | string | yes | | | pattern | string | yes | - | | replacement | string | yes | - | | is_regex | boolean | no | false | | replace_first | boolean | no | false | ### replace_field [string] 需要替换的字段 ### pattern [string] 将被替换的旧字符串 ### replacement [string] 用于替换的新字符串 ### is_regex [boolean] 使用正则表达式进行字符串匹配 ### replace_first [boolean] 是否替换第一个匹配字符串。仅在 `is_regex = true` 时使用。 ### common options [string] 转换插件的常见参数, 请参考 [Transform Plugin](common-options/common-options.md) 了解详情 ## 示例 源端数据读取的表格如下: | name | age | card | |----------|-----|------| | Joy Ding | 20 | 123 | | May Ding | 20 | 123 | | Kin Dom | 20 | 123 | | Joy Dom | 20 | 123 | 我们想要将 `name` 字段中的字符 
` `(空格)替换为 `_`。然后我们可以添加一个 `Replace` 转换,像这样: ``` transform { Replace { plugin_input = "fake" plugin_output = "fake1" replace_field = "name" pattern = " " replacement = "_" is_regex = true } } ``` 那么结果表 `fake1` 中的数据将会更新为: | name | age | card | |----------|-----|------| | Joy_Ding | 20 | 123 | | May_Ding | 20 | 123 | | Kin_Dom | 20 | 123 | | Joy_Dom | 20 | 123 | ## 作业配置示例 ``` env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { id = "int" name = "string" } } } } transform { Replace { plugin_input = "fake" plugin_output = "fake1" replace_field = "name" pattern = ".+" replacement = "b" is_regex = true } } sink { Console { plugin_input = "fake1" } } ``` ## 更新日志 ### 新版本 - 添加替换转换连接器 ================================================ FILE: docs/zh/transforms/rowkind-extractor.md ================================================ # RowKindExtractor > RowKindExtractor 转换插件 ## 描述 RowKindExtractor 转换插件用于将 CDC(Change Data Capture)数据流转换为 Append-Only(仅追加)模式,同时将原始的 RowKind 信息提取为一个新的字段。 **核心功能:** - 将所有数据行的 RowKind 统一改为 `+I`(INSERT),实现 Append-Only 模式 - 将原始的 RowKind 信息(INSERT、UPDATE_BEFORE、UPDATE_AFTER、DELETE)保存到新增的字段中 - 支持短格式和完整格式两种输出方式 **为什么需要这个插件?** 在 CDC 数据同步场景中,数据行带有 RowKind 标记(+I、-U、+U、-D),表示不同的变更类型。但某些下游系统(如数据湖、分析系统)只支持 Append-Only 模式,不支持 UPDATE 和 DELETE 操作。此时需要: 1. 将所有数据转换为 INSERT 类型(Append-Only) 2.
将原始的变更类型保存为普通字段,供后续分析使用 **转换示例:** ``` 输入(CDC 数据): RowKind: -D (DELETE) 数据: id=1, name="test1", age=20 输出(Append-Only 数据): RowKind: +I (INSERT) 数据: id=1, name="test1", age=20, row_kind="DELETE" ``` **典型应用场景:** - 将 CDC 数据写入只支持 Append 的数据湖 - 需要在数据仓库中保留完整的变更历史记录 - 需要对不同类型的变更进行统计分析 ## 配置选项 | 参数名 | 类型 | 是否必填 | 默认值 | 说明 | |-------------------|--------|----------|---------------|------| | custom_field_name | string | 否 | row_kind | 新增字段的名称,用于存储原始的 RowKind 信息 | | transform_type | enum | 否 | SHORT | RowKind 的输出格式,可选值:SHORT(短格式)或 FULL(完整格式) | ### custom_field_name [string] 指定新增字段的名称,该字段用于存储原始的 RowKind 信息。 **默认值:** `row_kind` **注意事项:** - 字段名不能与原有字段重名,否则会报错 - 建议使用有意义的名称,如 `operation_type`、`change_type`、`cdc_op` 等 **示例:** ```hocon custom_field_name = "operation_type" # 使用自定义字段名 ``` ### transform_type [enum] 指定 RowKind 字段值的输出格式。 **可选值:** | 格式 | 说明 | 输出值 | |------|------|--------| | SHORT | 短格式(符号表示) | `+I`、`-U`、`+U`、`-D` | | FULL | 完整格式(英文名称) | `INSERT`、`UPDATE_BEFORE`、`UPDATE_AFTER`、`DELETE` | **默认值:** `SHORT` **各值含义:** | RowKind 类型 | SHORT 格式 | FULL 格式 | 说明 | |-------------|-----------|----------|-------| | INSERT | +I | INSERT | 插入操作 | | UPDATE_BEFORE | -U | UPDATE_BEFORE | 更新前的值 | | UPDATE_AFTER | +U | UPDATE_AFTER | 更新后的值 | | DELETE | -D | DELETE | 删除操作 | **选择建议:** - **SHORT 格式**:节省存储空间,适合对存储敏感的场景 - **FULL 格式**:可读性更好,适合需要人工查看或分析的场景 **示例:** ```hocon transform_type = FULL # 使用完整格式 ``` ## 完整示例 ### 示例 1:使用默认配置(SHORT 格式) 使用默认配置,将 CDC 数据转换为 Append-Only 模式,RowKind 以短格式保存。 ```yaml env { parallelism = 1 job.mode = "STREAMING" } source { MySQL-CDC { plugin_output = "cdc_source" server-id = 5652 username = "root" password = "your_password" table-names = ["mydb.users"] url = "jdbc:mysql://localhost:3306/mydb" } } transform { RowKindExtractor { plugin_input = "cdc_source" plugin_output = "append_only_data" # 使用默认配置: # custom_field_name = "row_kind" # transform_type = SHORT } } sink { Console { plugin_input = "append_only_data" } } ``` **数据转换过程:** ``` 输入数据(CDC 格式): 1. 
RowKind=+I, id=1, name="张三", age=25 2. RowKind=-U, id=1, name="张三", age=25 3. RowKind=+U, id=1, name="张三", age=26 4. RowKind=-D, id=1, name="张三", age=26 输出数据(Append-Only 格式): 1. RowKind=+I, id=1, name="张三", age=25, row_kind="+I" 2. RowKind=+I, id=1, name="张三", age=25, row_kind="-U" 3. RowKind=+I, id=1, name="张三", age=26, row_kind="+U" 4. RowKind=+I, id=1, name="张三", age=26, row_kind="-D" ``` --- ### 示例 2:使用 FULL 格式和自定义字段名 使用完整格式输出 RowKind,并自定义字段名称。 ```yaml env { parallelism = 1 job.mode = "STREAMING" } source { MySQL-CDC { plugin_output = "cdc_source" server-id = 5652 username = "root" password = "your_password" table-names = ["mydb.orders"] url = "jdbc:mysql://localhost:3306/mydb" } } transform { RowKindExtractor { plugin_input = "cdc_source" plugin_output = "append_only_data" custom_field_name = "operation_type" # 自定义字段名 transform_type = FULL # 使用完整格式 } } sink { Iceberg { plugin_input = "append_only_data" catalog_name = "iceberg_catalog" database = "mydb" table = "orders_history" # Iceberg 表会包含 operation_type 字段,记录每条数据的变更类型 } } ``` **数据转换过程:** ``` 输入数据(CDC 格式): 1. RowKind=+I, order_id=1001, amount=100.00 2. RowKind=-U, order_id=1001, amount=100.00 3. RowKind=+U, order_id=1001, amount=150.00 4. RowKind=-D, order_id=1001, amount=150.00 输出数据(Append-Only 格式,FULL 格式): 1. RowKind=+I, order_id=1001, amount=100.00, operation_type="INSERT" 2. RowKind=+I, order_id=1001, amount=100.00, operation_type="UPDATE_BEFORE" 3. RowKind=+I, order_id=1001, amount=150.00, operation_type="UPDATE_AFTER" 4. 
RowKind=+I, order_id=1001, amount=150.00, operation_type="DELETE" ``` --- ### 示例 3:完整的测试示例(使用 FakeSource) 使用 FakeSource 生成测试数据,演示各种 RowKind 的转换效果。 ```yaml env { parallelism = 1 job.mode = "BATCH" } source { FakeSource { plugin_output = "fake_cdc_data" schema = { fields { pk_id = bigint name = string score = int } primaryKey { name = "pk_id" columnNames = [pk_id] } } rows = [ { kind = INSERT fields = [1, "A", 100] }, { kind = INSERT fields = [2, "B", 100] }, { kind = UPDATE_BEFORE fields = [1, "A", 100] }, { kind = UPDATE_AFTER fields = [1, "A_updated", 95] }, { kind = UPDATE_BEFORE fields = [2, "B", 100] }, { kind = UPDATE_AFTER fields = [2, "B_updated", 98] }, { kind = DELETE fields = [1, "A_updated", 95] } ] } } transform { RowKindExtractor { plugin_input = "fake_cdc_data" plugin_output = "transformed_data" custom_field_name = "change_type" transform_type = FULL } } sink { Console { plugin_input = "transformed_data" } } ``` **预期输出:** ``` +I, pk_id=1, name="A", score=100, change_type="INSERT" +I, pk_id=2, name="B", score=100, change_type="INSERT" +I, pk_id=1, name="A", score=100, change_type="UPDATE_BEFORE" +I, pk_id=1, name="A_updated", score=95, change_type="UPDATE_AFTER" +I, pk_id=2, name="B", score=100, change_type="UPDATE_BEFORE" +I, pk_id=2, name="B_updated", score=98, change_type="UPDATE_AFTER" +I, pk_id=1, name="A_updated", score=95, change_type="DELETE" ``` ================================================ FILE: docs/zh/transforms/split.md ================================================ # 拆分 > 拆分转换插件 ## 描述 拆分一个字段为多个字段。 ## 属性 | 名称 | 类型 | 是否必须 | 默认值 | |---------------|--------|------|-----| | separator | string | yes | | | split_field | string | yes | | | output_fields | array | yes | | ### separator [string] 拆分内容的分隔符 ### split_field [string] 需要拆分的字段 ### output_fields [array] 拆分后的结果字段 ### common options [string] 转换插件的常见参数, 请参考 [Transform Plugin](common-options/common-options.md) 了解详情 ## 示例 源端数据读取的表格如下: | name | age | card | |----------|-----|------| | Joy 
Ding | 20 | 123 | | May Ding | 20 | 123 | | Kin Dom | 20 | 123 | | Joy Dom | 20 | 123 | 我们想要将 `name` 字段拆分为 `first_name` 和 `second_name`,我们可以像这样添加 `Split` 转换: ``` transform { Split { plugin_input = "fake" plugin_output = "fake1" separator = " " split_field = "name" output_fields = [first_name, second_name] } } ``` 那么结果表 `fake1` 中的数据将会像这样: | name | age | card | first_name | second_name | |----------|-----|------|------------|-------------| | Joy Ding | 20 | 123 | Joy | Ding | | May Ding | 20 | 123 | May | Ding | | Kin Dom | 20 | 123 | Kin | Dom | | Joy Dom | 20 | 123 | Joy | Dom | ## 更新日志 ### 新版本 - 添加拆分转换连接器 ================================================ FILE: docs/zh/transforms/sql-functions.md ================================================ # SQL函数 > SQL函数转换插件功能 ## 字符串函数 ### ASCII ```ASCII(string) -> INT``` 返回字符串中第一个字符的ASCII值。 示例: ASCII('Hi') ### BIT_LENGTH ```BIT_LENGTH(bytes) -> LONG``` 返回二进制字符串中的位数。 示例: BIT_LENGTH(NAME) ### CHAR_LENGTH / LENGTH ```CHAR_LENGTH | LENGTH (string) -> LONG``` 这个方法返回一个字符串中字符的数量。 示例: CHAR_LENGTH(NAME) ### OCTET_LENGTH ```OCTET_LENGTH(bytes) -> LONG``` 返回二进制字符串中字节的数量。 示例: OCTET_LENGTH(NAME) ### CHAR / CHR ```CHAR | CHR (int) -> STRING``` 返回表示ASCII值的字符。 示例: CHAR(65) ### CONCAT ```CONCAT(string, string[, string ...] ) -> STRING``` 组合字符串。与运算符 `||` 不同,**NULL** 参数会被忽略,不会导致结果变为 **NULL**。如果所有参数都是 NULL,则结果是一个空字符串。 示例: CONCAT(NAME, '_') ### CONCAT_WS ```CONCAT_WS(separatorString, string, string[, string ...]
) -> STRING``` 使用分隔符组合字符串。如果分隔符为 **NULL**,则会被视为空字符串。其他 **NULL** 参数会被忽略。剩余的 **非NULL** 参数(如果有)将用指定的分隔符连接起来。如果没有剩余参数,则结果是一个空字符串。 示例: CONCAT_WS(',', NAME, '_') ### HEXTORAW ```HEXTORAW(string) -> STRING``` 将字符串的十六进制表示转换为字符串。每个字符串字符使用4个十六进制字符。 示例: HEXTORAW(DATA) ### RAWTOHEX ```RAWTOHEX(string | bytes) -> STRING``` 将字符串或字节转换为十六进制表示。每个字符串字符使用4个十六进制字符。 示例: RAWTOHEX(DATA) ### INSERT ```INSERT(originalString, startInt, lengthInt, addString) -> STRING``` 在原始字符串的指定起始位置插入额外的字符串。长度参数指定在原始字符串的起始位置删除的字符数。 示例: INSERT(NAME, 1, 1, ' ') ### LOWER / LCASE ```LOWER | LCASE (string) -> STRING``` 将字符串转换为小写形式。 示例: LOWER(NAME) ### UPPER / UCASE ```UPPER | UCASE (string) -> STRING``` 将字符串转换为大写形式。 示例: UPPER(NAME) ### LEFT ```LEFT(string, int) -> STRING``` 返回最左边的一定数量的字符。 示例: LEFT(NAME, 3) ### RIGHT ```RIGHT(string, int) -> STRING``` 返回最右边的一定数量的字符。 示例: RIGHT(NAME, 3) ### LOCATE / INSTR / POSITION ```LOCATE(searchString, string[, startInt]) -> INT``` ```INSTR(string, searchString[, startInt]) -> INT``` ```POSITION(searchString, string) -> INT``` 返回字符串中搜索字符串的位置。如果使用了起始位置参数,则忽略它之前的字符。如果位置参数是负数,则返回最右边的位置。如果未找到搜索字符串,则返回 0。请注意,即使参数不区分大小写,此函数也区分大小写。 示例: LOCATE('.', NAME) ### LPAD ```LPAD(string ,int[, string]) -> STRING``` 将字符串左侧填充到指定的长度。如果长度比字符串短,则字符串将在末尾被截断。如果未设置填充字符串,则使用空格填充。 示例: LPAD(AMOUNT, 10, '*') ### RPAD ```RPAD(string, int[, string]) -> STRING``` 将字符串右侧填充到指定的长度。如果长度比字符串短,则字符串将被截断。如果未设置填充字符串,则使用空格填充。 示例: RPAD(TEXT, 10, '-') ### LTRIM ```LTRIM(string[, characterToTrimString]) -> STRING``` 移除字符串中所有前导空格或其他指定的字符。 示例: LTRIM(NAME) ### RTRIM ```RTRIM(string[, characterToTrimString]) -> STRING``` 移除字符串中所有尾随空格或其他指定的字符。 示例: RTRIM(NAME) ### TRIM ```TRIM(string[, characterToTrimString]) -> STRING``` 移除字符串中所有前导空格和尾随空格或其他指定的字符。 示例: TRIM(NAME) ### REGEXP_REPLACE ```REGEXP_REPLACE(inputString, regexString, replacementString[, flagsString]) -> STRING``` 替换与正则表达式匹配的每个子字符串。详情请参阅 Java String.replaceAll() 方法。如果任何参数为 null(除了可选的 flagsString 参数),则结果为 null。 标志值限于 'i'、'c'、'n'、'm'。其他符号会引发异常。可以在一个 flagsString 
参数中使用多个符号(例如 'im')。后面的标志会覆盖前面的标志,例如 'ic' 等同于区分大小写匹配 'c'。 'i' 启用不区分大小写匹配(Pattern.CASE_INSENSITIVE) 'c' 禁用不区分大小写匹配(Pattern.CASE_INSENSITIVE) 'n' 允许句点匹配换行符(Pattern.DOTALL) 'm' 启用多行模式(Pattern.MULTILINE) 示例: REGEXP_REPLACE('Hello World', ' +', ' ') REGEXP_REPLACE('Hello WWWWorld', 'w+', 'W', 'i') ### REGEXP_LIKE ```REGEXP_LIKE(inputString, regexString[, flagsString]) -> BOOLEAN``` 将字符串与正则表达式匹配。详情请参阅 Java Matcher.find() 方法。如果任何参数为 null(除了可选的 flagsString 参数),则结果为 null。 标志值限于 'i'、'c'、'n'、'm'。其他符号会引发异常。可以在一个 flagsString 参数中使用多个符号(例如 'im')。后面的标志会覆盖前面的标志,例如 'ic' 等同于区分大小写匹配 'c'。 'i' 启用不区分大小写匹配(Pattern.CASE_INSENSITIVE) 'c' 禁用不区分大小写匹配(Pattern.CASE_INSENSITIVE) 'n' 允许句点匹配换行符(Pattern.DOTALL) 'm' 启用多行模式(Pattern.MULTILINE) 示例: REGEXP_LIKE('Hello World', '[A-Z ]*', 'i') ### REGEXP_SUBSTR ```REGEXP_SUBSTR(inputString, regexString[, positionInt, occurrenceInt, flagsString, groupInt]) -> STRING``` 将字符串与正则表达式匹配,并返回匹配的子字符串。详情请参阅 java.util.regex.Pattern 和相关功能。 参数 position 指定匹配应该从 inputString 的哪里开始。Occurrence 指示在 inputString 中搜索 pattern 的哪个出现。 标志值限于 'i'、'c'、'n'、'm'。其他符号会引发异常。可以在一个 flagsString 参数中使用多个符号(例如 'im')。后面的标志会覆盖前面的标志,例如 'ic' 等同于区分大小写匹配 'c'。 'i' 启用不区分大小写匹配(Pattern.CASE_INSENSITIVE) 'c' 禁用不区分大小写匹配(Pattern.CASE_INSENSITIVE) 'n' 允许句点匹配换行符(Pattern.DOTALL) 'm' 启用多行模式(Pattern.MULTILINE) 如果模式具有组,则可以使用 group 参数指定要返回的组。 示例: REGEXP_SUBSTR('2020-10-01', '\d{4}') REGEXP_SUBSTR('2020-10-01', '(\d{4})-(\d{2})-(\d{2})', 1, 1, NULL, 2) ### REPEAT ```REPEAT(string, int) -> STRING``` 将字符串按指定次数重复后返回。 示例: REPEAT(NAME || ' ', 10) ### REPLACE ```REPLACE(string, searchString[, replacementString]) -> STRING``` 在文本中替换所有出现的搜索字符串为另一个字符串。如果没有指定替换字符串,则从原始字符串中移除搜索字符串。如果任何参数为 null,则结果为 null。 示例: REPLACE(NAME, ' ') ### SPLIT ```SPLIT(string, delimiterString) -> ARRAY``` 将字符串切分成数组。 示例: select SPLIT(test,';') as arrays ### MURMUR64 ```MURMUR64(string) -> LONG``` 计算输入字符串的 MurmurHash 128 哈希值,并返回低 64 位作为长整型值。MurmurHash 是一种非加密哈希函数,适用于一般的基于哈希的查找。此方法返回一个长整型值,如果输入参数为 null,则返回 null。 示例: MURMUR64('hello world') 
MURMUR64(NAME) ### SOUNDEX ```SOUNDEX(string) -> STRING``` 表示字符串发音。此方法返回一个字符串,如果参数为 null,则返回 null。有关更多信息,请参阅 https://en.wikipedia.org/wiki/Soundex 。 示例: SOUNDEX(NAME) ### SPACE ```SPACE(int) -> STRING``` 返回由一定数量的空格组成的字符串。 示例: SPACE(80) ### SUBSTRING / SUBSTR ```SUBSTRING | SUBSTR (string, startInt[, lengthInt ]) -> STRING``` 返回从指定位置开始的字符串的子串。如果起始索引为负数,则相对于字符串的末尾计算起始索引。长度是可选的。 示例: CALL SUBSTRING('[Hello]', 2); CALL SUBSTRING('hour', 3, 2); ### TO_CHAR ```TO_CHAR(value[, formatString]) -> STRING``` Oracle 兼容的 TO_CHAR 函数可用于格式化时间戳、数字或文本。 示例: CALL TO_CHAR(SYS_TIME, 'yyyy-MM-dd HH:mm:ss') ### TRANSLATE ```TRANSLATE(value, searchString, replacementString) -> STRING``` Oracle 兼容的 TRANSLATE 函数用于将字符串中的一系列字符替换为另一组字符。 示例: CALL TRANSLATE('Hello world', 'eo', 'EO') ## Numeric Functions ### ABS ```ABS(numeric) -> NUMERIC (same type)``` 返回指定值的绝对值。返回的值与参数的数据类型相同。 请注意,TINYINT、SMALLINT、INT 和 BIGINT 数据类型无法表示它们的最小负值的绝对值,因为它们的负值比正值多。例如,对于 INT 数据类型,允许的值范围是从 -2147483648 到 2147483647。ABS(-2147483648) 应该是 2147483648,但是这个值对于这个数据类型是不允许的。这会导致异常。为了避免这种情况,请将此函数的参数转换为更高的数据类型。 示例: ABS(I) ### ACOS ```ACOS(numeric) -> DOUBLE``` 计算反余弦值。另请参阅 Java Math.acos。 示例: ACOS(D) ### ARRAY_MAX ```ARRAY_MAX(ARRAY) -> type(array element)``` MAX 函数返回表达式的最大值。 示例: ARRAY_MAX(I) ### ARRAY_MIN ```ARRAY_MIN(ARRAY) -> type(array element)``` MIN 函数返回表达式的最小值。 示例: ARRAY_MIN(I) ### ASIN ```ASIN(numeric) -> DOUBLE``` 计算反正弦值。另请参阅 Java Math.asin。 示例: ASIN(D) ### ATAN ```ATAN(numeric) -> DOUBLE``` 计算反正切值。另请参阅 Java Math.atan。 示例: ATAN(D) ### COS ```COS(numeric) -> DOUBLE``` 计算三角余弦值。另请参阅 Java Math.cos。 示例: COS(ANGLE) ### COSH ```COSH(numeric) -> DOUBLE``` 计算双曲余弦值。另请参阅 Java Math.cosh。 示例: COSH(X) ### COT ```COT(numeric) -> DOUBLE``` 计算三角余切值(1/TAN(角度))。另请参阅 Java Math.* 函数。 示例: COT(ANGLE) ### SIN ```SIN(numeric) -> DOUBLE``` 计算三角正弦值。另请参阅 Java Math.sin。 示例: SIN(ANGLE) ### SINH ```SINH(numeric) -> DOUBLE``` 计算双曲正弦值。另请参阅 Java Math.sinh。 示例: SINH(ANGLE) ### TAN ```TAN(numeric) -> DOUBLE``` 计算三角正切值。另请参阅 Java Math.tan。 示例: TAN(ANGLE) ### 
TANH ```TANH(numeric) -> DOUBLE``` 计算双曲正切值。另请参阅 Java Math.tanh。 示例: TANH(X) ### MOD ```MOD(dividendNumeric, divisorNumeric ) -> type(divisorNumeric)``` 取模运算表达式。 结果与除数的类型相同。如果任一参数为 NULL,则结果为 NULL。如果除数为 0,则会引发异常。结果与被除数的符号相同,或者等于 0。 通常情况下,参数应具有标度 0,但 H2 并不要求。 示例: MOD(A, B) ### CEIL / CEILING ```CEIL | CEILING (numeric) -> NUMERIC (same type, scale 0)``` 返回大于或等于参数的最小整数值。该方法返回与参数相同类型的值,但标度设置为 0,并且如果适用,则调整精度。 示例: CEIL(A) ### EXP ```EXP(numeric) -> DOUBLE``` 请参阅 Java Math.exp。 示例: EXP(A) ### FLOOR ```FLOOR(numeric) -> NUMERIC (same type, scale 0)``` 返回小于或等于参数的最大整数值。该方法返回与参数相同类型的值,但标度设置为 0,并且如果适用,则调整精度。 示例: FLOOR(A) ### LN ```LN(numeric) -> DOUBLE``` 计算自然对数(以 e 为底)的双精度浮点数值。参数必须是一个正数值。 示例: LN(A) ### LOG ```LOG(baseNumeric, numeric) -> DOUBLE``` 计算以指定底数的对数,返回一个双精度浮点数。参数和底数必须是正数值。底数不能等于1。 默认底数是 e(自然对数),在 PostgreSQL 模式下,默认底数是 10。在 MSSQLServer 模式下,可选的底数在参数之后指定。 LOG 函数的单参数变体已被弃用,请使用 LN 或 LOG10 替代。 示例: LOG(2, A) ### LOG10 ```LOG10(numeric) -> DOUBLE``` 计算以 10 为底的对数,返回一个双精度浮点数。参数必须是一个正数值。 示例: LOG10(A) ### RADIANS ```RADIANS(numeric) -> DOUBLE``` 请参阅 Java Math.toRadians。 示例: RADIANS(A) ### SQRT ```SQRT(numeric) -> DOUBLE``` 请参阅 Java Math.sqrt。 示例: SQRT(A) ### PI ```PI() -> DOUBLE``` 请参阅 Java Math.PI。 示例: PI() ### POWER ```POWER(numeric, numeric) -> DOUBLE``` 请参阅 Java Math.pow。 示例: POWER(A, B) ### RAND / RANDOM ```RAND | RANDOM([ int ]) -> DOUBLE``` 如果不带参数调用该函数,则返回下一个伪随机数。如果带有参数调用,则将会给该会话的随机数生成器设定种子。该方法返回一个介于 0(包括)和 1(不包括)之间的双精度浮点数。 示例: RAND() ### ROUND ```ROUND(numeric[, digitsInt]) -> NUMERIC (same type)``` 四舍五入到指定的小数位数。该方法返回与参数相同类型的值,但如果适用,则调整精度和标度。 示例: ROUND(N, 2) ### SIGN ```SIGN(numeric) -> INT``` 如果值小于 0,则返回 -1;如果值为零或 NaN,则返回 0;否则返回 1。 示例: SIGN(N) ### TRUNC ```TRUNC | TRUNCATE(numeric[, digitsInt]) -> NUMERIC (same type)``` 当指定了一个数值参数时,将其截断为指定的数字位数(接近0的下一个值),并返回与参数相同类型的值,但如果适用,则调整精度和标度。 示例: TRUNC(N, 2) ### TRIM_SCALE ```TRIM_SCALE(numeric) -> NUMERIC (same type)``` 通过删除尾数部分的零来降低值的刻度(小数位数),并调整小数位数。 示例: TRIM_SCALE(N) ## Time and Date Functions ### CURRENT_DATE ```CURRENT_DATE 
[()] -> DATE``` 返回当前日期。 这些函数在事务(默认)或命令内部返回相同的值,具体取决于数据库模式。 示例: CURRENT_DATE ### CURRENT_TIME ```CURRENT_TIME [()] -> TIME``` 返回带有系统时区的当前时间。实际可用的最大精度取决于操作系统和 JVM,可以是 3(毫秒)或更高。在 Java 9 之前不支持更高的精度。 示例: CURRENT_TIME ### CURRENT_TIMESTAMP / NOW ```CURRENT_TIMESTAMP[()] | NOW() -> TIMESTAMP``` 返回带有系统时区的当前时间戳。实际可用的最大精度取决于操作系统和 JVM,可以是 3(毫秒)或更高。在 Java 9 之前不支持更高的精度。 示例: CURRENT_TIMESTAMP ### DATEADD / TIMESTAMPADD ```DATEADD | TIMESTAMPADD(dateAndTime, addIntLong, datetimeFieldString) -> type(dateAndTime)``` 将单位添加到日期时间值中。datetimeFieldString 表示单位。使用负值来减去单位。当操作毫秒、微秒或纳秒时,addIntLong 可能是一个 long 值,否则其范围被限制为 int。如果单位与指定值兼容,则此方法返回与指定值相同类型的值。如果指定的字段是 HOUR、MINUTE、SECOND、MILLISECOND 等,而值是 DATE 值,DATEADD 返回组合的 TIMESTAMP。对于 TIME 值,不允许使用 DAY、MONTH、YEAR、WEEK 等字段。 示例: DATEADD(CREATED, 1, 'MONTH') ### DATEDIFF ```DATEDIFF(aDateAndTime, bDateAndTime, datetimeFieldString) -> LONG``` 返回两个日期时间值之间跨越的单位边界数。datetimeField 表示单位。 示例: DATEDIFF(T1.CREATED, T2.CREATED, 'MONTH') ### DATE_TRUNC ```DATE_TRUNC (dateAndTime, datetimeFieldString) -> dateAndTime (same type)``` 将指定的日期时间值截断到指定的字段。 示例: DATE_TRUNC(CREATED, 'DAY'); ### DAYNAME ```DAYNAME(dateAndTime) -> STRING``` 返回星期几的名称(英文)。 示例: DAYNAME(CREATED) ### DAY_OF_MONTH ```DAY_OF_MONTH(dateAndTime) -> INT``` 返回月份中的日期(1-31)。 示例: DAY_OF_MONTH(CREATED) ### DAY_OF_WEEK ```DAY_OF_WEEK(dateAndTime) -> INT``` 返回星期几的数值(1-7)(星期一至星期日),根据本地化设置。 示例: DAY_OF_WEEK(CREATED) ### DAY_OF_YEAR ```DAY_OF_YEAR(dateAndTime) -> INT``` 返回一年中的日期(1-366)。 示例: DAY_OF_YEAR(CREATED) ### EXTRACT ```EXTRACT ( datetimeField FROM dateAndTime) -> INT | NUMERIC``` 从日期/时间值中返回特定时间单位的值。该方法对于 EPOCH 字段返回一个数值,对于其他字段返回一个整数。 EXTRACT函数支持以下字段名: - `CENTURY`:世纪;对于interval值,年份字段除以100 - `DAY`:月份中的日期(1-31);对于interval值,表示天数 - `DECADE`:年份字段除以10 - `DOW` 或 `DAYOFWEEK`:星期几,从周日(0)到周六(6) - `DOY`:一年中的第几天(1-365/366) - `EPOCH`:对于timestamp值,表示自1970-01-01 00:00:00以来的秒数;对于interval值,表示总秒数 - `HOUR`:小时字段(0-23) - `ISODOW`:星期几,从周一(1)到周日(7),符合ISO 8601标准 - `ISOYEAR`:ISO 8601周编号年份 - `MICROSECONDS`:秒字段(包括小数部分)乘以1,000,000 - 
`MILLENNIUM`:千年;对于interval值,年份字段除以1000 - `MILLISECONDS`:秒字段(包括小数部分)乘以1,000 - `MINUTE`:分钟字段(0-59) - `MONTH`:年份中的月份(1-12);对于interval值,月份对12取模(0-11) - `QUARTER`:日期所在的季度(1-4) - `SECOND`:秒字段,包括任何小数秒 - `WEEK`:ISO 8601周编号年份中的周数(1-53) - `YEAR`:年份字段 EXTRACT函数支持以下四种DateTime字面量类型: - `DATE`:用于从日期字面量中提取日期组件 ```sql EXTRACT(YEAR FROM DATE '2025-05-21') ``` - `TIME`:用于从时间字面量中提取时间组件 ```sql EXTRACT(HOUR FROM TIME '17:57:40') ``` - `TIMESTAMP`:用于从时间戳字面量中提取日期和时间组件 ```sql EXTRACT(YEAR FROM TIMESTAMP '2025-05-21T17:57:40') ``` - `TIMESTAMP WITH TIMEZONE`:用于从带时区的时间戳字面量中提取组件 ```sql EXTRACT(HOUR FROM TIMESTAMPTZ '2025-05-21T17:57:40+08:00') ``` 示例: ```sql EXTRACT(YEAR FROM TIMESTAMP '2001-02-16 20:38:40') EXTRACT(HOUR FROM TIMESTAMP '2001-02-16 20:38:40') EXTRACT(DOW FROM TIMESTAMP '2001-02-16 20:38:40') EXTRACT(YEAR FROM eventTime) EXTRACT(HOUR FROM eventTime) EXTRACT(DOW FROM eventTime) ``` ### FORMATDATETIME ```FORMATDATETIME (dateAndTime, formatString) -> STRING``` 将日期、时间或时间戳格式化为字符串。最重要的格式字符包括:y(年)、M(月)、d(日)、H(时)、m(分)、s(秒)。有关格式的详细信息,请参阅 java.time.format.DateTimeFormatter。 示例: CALL FORMATDATETIME(CREATED, 'yyyy-MM-dd HH:mm:ss') ### HOUR ```HOUR(dateAndTime) -> INT``` 从日期/时间值中返回小时(0-23)。 示例: HOUR(CREATED) ### MINUTE ```MINUTE(dateAndTime) -> INT``` 从日期/时间值中返回分钟(0-59)。 该函数已经被弃用,请使用 EXTRACT 替代。 示例: MINUTE(CREATED) ### MONTH ```MONTH(dateAndTime) -> INT``` 从日期/时间值中返回月份(1-12)。 该函数已经被弃用,请使用 EXTRACT 替代。 示例: MONTH(CREATED) ### MONTHNAME ```MONTHNAME(dateAndTime) -> STRING``` 返回月份的名称(英文)。 示例: MONTHNAME(CREATED) ### IS_DATE ```IS_DATE(string, formatString) -> BOOLEAN``` 验证字符串是否可以使用指定的格式模式解析为日期/时间值。 **支持的格式模式:** 日期时间格式: - `yyyy-MM-dd HH:mm:ss` - 标准日期时间格式 - `yyyy-MM-dd HH:mm:ss.SSS` - 带毫秒的日期时间 - `yyyy-MM-dd'T'HH:mm:ss` - ISO 8601 日期时间格式 - `yyyy-MM-dd'T'HH:mm:ss.SSS` - 带毫秒的 ISO 8601 日期时间 - `yyyy/MM/dd HH:mm:ss` - 带斜杠分隔符的日期时间 - `yyyy/MM/dd HH:mm:ss.SSS` - 带斜杠分隔符和毫秒的日期时间 - `yyyyMMddHHmmss` - 紧凑日期时间格式 日期格式: - `yyyy-MM-dd` - ISO 8601 日期格式 - `yyyy/MM/dd` - 带斜杠分隔符的日期 - `yyyyMMdd` - 紧凑日期格式 时间格式: - 
`HH:mm:ss` - 标准时间格式 - `HH:mm:ss.SSS` - 带毫秒的时间 - `HHmmss` - 紧凑时间格式 示例: ```sql CALL IS_DATE('2021-04-08 13:34:45', 'yyyy-MM-dd HH:mm:ss') -- 返回 true CALL IS_DATE('2021/04/08', 'yyyy/MM/dd') -- 返回 true CALL IS_DATE('20210408', 'yyyyMMdd') -- 返回 true -- 与 TO_DATE 保持一致 SELECT CASE WHEN IS_DATE(date_string, 'yyyy-MM-dd HH:mm:ss') THEN TO_DATE(date_string, 'yyyy-MM-dd HH:mm:ss') ELSE NULL END as parsed_date ``` ### PARSEDATETIME / TO_DATE ```PARSEDATETIME | TO_DATE(string, formatString) -> TIMESTAMP | DATE | TIME``` 使用指定的格式模式将字符串解析为日期/时间值 **支持的格式模式:** 日期时间格式 (返回 TIMESTAMP): - `yyyy-MM-dd HH:mm:ss` - 标准日期时间格式 - `yyyy-MM-dd HH:mm:ss.SSS` - 带毫秒的日期时间 - `yyyy-MM-dd'T'HH:mm:ss` - ISO 8601 日期时间格式 - `yyyy-MM-dd'T'HH:mm:ss.SSS` - 带毫秒的 ISO 8601 日期时间 - `yyyy/MM/dd HH:mm:ss` - 带斜杠分隔符的日期时间 - `yyyy/MM/dd HH:mm:ss.SSS` - 带斜杠分隔符和毫秒的日期时间 - `yyyyMMddHHmmss` - 紧凑日期时间格式 日期格式 (返回 DATE): - `yyyy-MM-dd` - ISO 8601 日期格式 - `yyyy/MM/dd` - 带斜杠分隔符的日期 - `yyyyMMdd` - 紧凑日期格式 时间格式 (返回 TIME): - `HH:mm:ss` - 标准时间格式 - `HH:mm:ss.SSS` - 带毫秒的时间 - `HHmmss` - 紧凑时间格式 **注意:** 在格式模式中使用单引号 (`'`) 时(例如 ISO 8601 的 'T' 分隔符),必须在 SQL 中转义为 `''`。 示例: ```sql -- 日期时间示例 CALL PARSEDATETIME('2021-04-08 13:34:45', 'yyyy-MM-dd HH:mm:ss') CALL TO_DATE('2021-04-08T13:34:45', 'yyyy-MM-dd''T''HH:mm:ss') CALL PARSEDATETIME('2024-06-15 14:30:45.123', 'yyyy-MM-dd HH:mm:ss.SSS') CALL PARSEDATETIME('2021/04/08 13:34:45', 'yyyy/MM/dd HH:mm:ss') CALL PARSEDATETIME('20210408133445', 'yyyyMMddHHmmss') -- 日期示例 CALL TO_DATE('2021-04-08', 'yyyy-MM-dd') CALL TO_DATE('2021/04/08', 'yyyy/MM/dd') CALL TO_DATE('20210408', 'yyyyMMdd') -- 时间示例 CALL PARSEDATETIME('14:30:45', 'HH:mm:ss') CALL PARSEDATETIME('14:30:45.123', 'HH:mm:ss.SSS') CALL PARSEDATETIME('143045', 'HHmmss') ``` ### QUARTER ```QUARTER(dateAndTime) -> INT``` 从日期/时间值中返回季度(1-4)。 示例: QUARTER(CREATED) ### SECOND ```SECOND(dateAndTime) -> INT``` 从日期/时间值中返回秒数(0-59)。 该函数已经被弃用,请使用 EXTRACT 替代。 示例: SECOND(CREATED) ### WEEK ```WEEK(dateAndTime) -> INT``` 返回日期/时间值中的周数(1-53)。 该函数使用当前系统的区域设置。 示例: 
WEEK(CREATED) ### YEAR ```YEAR(dateAndTime) -> INT``` 返回日期/时间值中的年份。 示例: YEAR(CREATED) ### FROM_UNIXTIME ```FROM_UNIXTIME (unixtime, formatString,timeZone) -> STRING``` 将从 UNIX 纪元(1970-01-01 00:00:00 UTC)开始的秒数转换为表示该时刻时间戳的字符串。 最重要的格式字符包括:y(年)、M(月)、d(日)、H(时)、m(分)、s(秒)。有关格式的详细信息,请参阅 `java.time.format.DateTimeFormatter`。 `timeZone` 是可选的,默认值为系统的时区。`timeZone` 的值可以是一个 `UTC+ 时区偏移`,例如,`UTC+8` 表示亚洲/上海时区,请参阅 https://en.wikipedia.org/wiki/List_of_tz_database_time_zones 。 示例: // 使用默认时区 CALL FROM_UNIXTIME(1672502400, 'yyyy-MM-dd HH:mm:ss') 或 // 使用指定时区 CALL FROM_UNIXTIME(1672502400, 'yyyy-MM-dd HH:mm:ss','UTC+6') ### AT TIME ZONE ```dateAndTime AT TIME ZONE 'timeZone' -> TIMESTAMP_TZ``` 转换一个时间戳值为指定时区的带时区时间戳值。 `timeZone` 的值可以是一个时区偏移(例如,`+08:00` 表示亚洲/上海时区),也可以是一个时区名称(例如 `Pacific/Honolulu`),请参阅 https://en.wikipedia.org/wiki/List_of_tz_database_time_zones 。 示例: local_date_time AT TIME ZONE '+09:00' offset_date_time AT TIME ZONE 'Pacific/Honolulu' ## System Functions ### CAST ```CAST(value as dataType) -> dataType``` 将一个值转换为另一个数据类型。 支持的数据类型有:STRING | VARCHAR,TINYINT,SMALLINT,INT | INTEGER,LONG | BIGINT,BYTE,FLOAT,DOUBLE,DECIMAL(p,s),TIMESTAMP,DATE,TIME,BYTES,BOOLEAN 示例: CAST(NAME AS INT) CAST(FLAG AS BOOLEAN) 注意:将值转换为布尔数据类型时,遵循以下规则: 1. 如果值可以被解释为布尔字符串('true' 或 'false'),则返回相应的布尔值。 2. 如果值可以被解释为数值(1 或 0),则对于 1 返回 true,对于 0 返回 false。 3. 
如果值无法根据以上规则进行解释,则抛出 TransformException 异常。 ### TRY_CAST ```TRY_CAST(value as dataType) -> dataType | NULL``` 该函数类似于 CAST,但当转换失败时,它返回 NULL 而不是抛出异常。 支持的数据类型有:STRING | VARCHAR,TINYINT,SMALLINT,INT | INTEGER,LONG | BIGINT,BYTE,FLOAT,DOUBLE,DECIMAL(p,s),TIMESTAMP,DATE,TIME,BYTES 示例: TRY_CAST(NAME AS INT) ### COALESCE ```COALESCE(aValue, bValue [,...]) -> type(of first non-null arg)``` 返回第一个非空值。如果后续参数与第一个参数的数据类型不同,则会自动转换为第一个参数的类型。 示例: COALESCE(A, B, C) 类型转换示例: ``` -- 如果A是字符串类型而B是整数类型 -- 当A为空时,B会被转换为字符串类型 SELECT COALESCE(A, B) as result FROM my_table ``` ### IFNULL ```IFNULL(aValue, bValue) -> type(common of args)``` 返回第一个非空值。如果后续参数与第一个参数的数据类型不同,则会自动转换为第一个参数的类型。 示例: IFNULL(A, B) ### NULLIF ```NULLIF(aValue, bValue) -> type(aValue) | NULL``` 如果 'a' 等于 'b',则返回 NULL,否则返回 'a'。 示例: NULLIF(A, B) ### MULTI_IF ```MULTI_IF(condition1, value1, condition2, value2, ... conditionN, valueN, bValue) -> type(of values)``` 返回第一个满足相应条件的值。如果所有条件均为假,则返回最后一个值。 示例: MULTI_IF(A > 1, 'A', B > 1, 'B', C > 1, 'C', 'D') ### CASE WHEN ```CASE WHEN THEN [WHEN ...] [ELSE ] END -> type(of result expressions)``` ``` select case when c_string in ('c_string') then 1 else 0 end as c_string_1, case when c_string not in ('c_string') then 1 else 0 end as c_string_0, case when c_tinyint = 117 and TO_CHAR(c_boolean) = 'true' then 1 else 0 end as c_tinyint_boolean_1, case when c_tinyint != 117 and TO_CHAR(c_boolean) = 'true' then 1 else 0 end as c_tinyint_boolean_0, case when c_tinyint != 117 or TO_CHAR(c_boolean) = 'true' then 1 else 0 end as c_tinyint_boolean_or_1, case when c_int > 1 and c_bigint > 1 and c_float > 1 and c_double > 1 and c_decimal > 1 then 1 else 0 end as c_number_1, case when c_tinyint <> 117 then 1 else 0 end as c_number_0 from fake ``` 用于确定条件是否有效,并根据不同的判断返回不同的值 示例: case when c_string in ('c_string') then 1 else 0 end case when c_string in ('c_string') then true else false end ### UUID ```UUID() -> STRING``` 通过java函数生成uuid 示例: select UUID() as seatunnel_uuid ### ARRAY ```ARRAY array(T, ...) 
-> ARRAY``` 创建一个由可变参数元素组成的数组并返回它。这里,T 可以是“列”或“常量”。 示例: select ARRAY(1,2,3) as arrays select ARRAY('c_1',2,3.12) as arrays select ARRAY(column1,column2,column3) as arrays 注意:目前仅支持string、double、long、int几种类型 ### LATERAL VIEW #### EXPLODE ```EXPLODE(array of T) -> rows(value: T)``` ```OUTER EXPLODE(array of T) -> rows(value: T | NULL)``` 用于将数组列展开成多行。它通过对数组应用 EXPLODE 函数,为数组中的每个元素生成一个新行。 EXPLODE:将数组列转换为多行。如果数组为 NULL 或为空,则不生成行。 OUTER EXPLODE:当数组为 NULL 或为空时返回 NULL,确保至少生成一行。 EXPLODE(SPLIT(字段名, 分隔符)):使用指定的分隔符将字符串拆分为数组,然后将其展开为多行。 EXPLODE(ARRAY(值1, 值2, ...)):将自定义数组展开为多行。 示例: ``` SELECT * FROM dual LATERAL VIEW EXPLODE ( SPLIT ( NAME, ',' ) ) AS NAME LATERAL VIEW EXPLODE ( SPLIT ( pk_id, ';' ) ) AS pk_id LATERAL VIEW OUTER EXPLODE ( age ) AS age LATERAL VIEW OUTER EXPLODE ( ARRAY(1,1) ) AS num ``` ## 向量函数 ### VECTOR_DIMS ```VECTOR_DIMS(vector) -> INT``` 返回一个INT值,表示向量中的维数(元素)。 示例: VECTOR_DIMS(vector) ### VECTOR_NORM ```VECTOR_NORM(vector) -> DOUBLE``` 计算向量的L2范数(欧几里得范数),表示向量的长度或大小。 示例: VECTOR_NORM(vector) ### INNER_PRODUCT ```INNER_PRODUCT(vector1, vector2) -> DOUBLE``` 计算两个向量的内积(点积),用于测量向量之间的相似性和投影。 示例: INNER_PRODUCT(vector1, vector2) ### COSINE_DISTANCE ```COSINE_DISTANCE(vector1, vector2) -> DOUBLE``` 返回介于 0 和 1 之间的 DOUBLE 值: 0:相同的向量(完全相似) 1:正交向量(完全不同) 示例: COSINE_DISTANCE(vector1, vector2) ### L1_DISTANCE ```L1_DISTANCE(vector1, vector2) -> DOUBLE``` 计算两个向量之间的曼哈顿(L1)距离。 示例: L1_DISTANCE(vector1, vector2) ### L2_DISTANCE ```L2_DISTANCE(vector1, vector2) -> DOUBLE``` 计算两个向量之间的欧几里得(L2)距离。 示例: L2_DISTANCE(vector1, vector2) ### VECTOR_REDUCE ```VECTOR_REDUCE(vector_field, target_dimension, method)``` 通用向量降维函数,支持多种降维方法。 **参数:** - `vector_field`: 要降维的向量字段 (VECTOR 类型) - `target_dimension`: 目标维度 (INTEGER,必须小于源维度) - `method`: 降维方法 (STRING): - **'TRUNCATE'**: 截断法,通过保留前N个元素来缩减向量维度。这是最简单、最快速的降维方法,但可能会丢失被截断维度中的重要信息。 - **'RANDOM_PROJECTION'**: 随机投影法,使用高斯随机投影和正态分布的随机矩阵。该方法在降维的同时保持向量间的相对距离,遵循Johnson-Lindenstrauss引理。 - **'SPARSE_RANDOM_PROJECTION'**: 稀疏随机投影法,矩阵元素大多为零(±√3, 
0)。比常规随机投影在计算上更高效,同时保持相似的距离保持特性。 **返回值:** 降维后的 VECTOR 类型 **示例:** ```sql SELECT id, VECTOR_REDUCE(embedding, 256, 'TRUNCATE') as reduced_embedding FROM table SELECT id, VECTOR_REDUCE(embedding, 128, 'RANDOM_PROJECTION') as reduced_embedding FROM table SELECT id, VECTOR_REDUCE(embedding, 64, 'SPARSE_RANDOM_PROJECTION') as reduced_embedding FROM table ``` ### VECTOR_NORMALIZE ```VECTOR_NORMALIZE(vector_field)``` 将向量归一化为单位长度(模长 = 1)。这对于计算余弦相似度很有用。 **参数:** - `vector_field`: 要归一化的向量字段 (VECTOR 类型) **返回值:** VECTOR 类型 - 归一化后的向量 **示例:** ```sql SELECT id, VECTOR_NORMALIZE(embedding) as normalized_embedding FROM table ``` ================================================ FILE: docs/zh/transforms/sql-udf.md ================================================ # SQL用户定义函数 > SQL 转换插件的用户定义函数 (UDF) ## 描述 使用UDF SPI扩展SQL转换函数库。 ## UDF API ```java package org.apache.seatunnel.transform.sql.zeta; public interface ZetaUDF { /** * Function name * * @return function name */ String functionName(); /** * The type of function result * * @param argsType input arguments type * @return result type */ SeaTunnelDataType<?> resultType(List<SeaTunnelDataType<?>> argsType); /** * Evaluate * * @param args input arguments * @return result value */ Object evaluate(List<Object> args); /** * 是否需要行级上下文。 */ default boolean requiresContext() { return false; } /** * 带上下文执行。 */ default Object evaluateWithContext(List<Object> args, ZetaUDFContext context) { return evaluate(args); } /** * 初始化 UDF 资源。 */ default void open() throws Exception {} /** * 释放 UDF 资源。 */ default void close() {} } ``` `ZetaUDFContext` 提供运行时行级元数据与字段: - `getRawTableId()` - `getDatabase()` - `getSchema()` - `getTable()` - `getRowKind()` - `getAllFields()` 说明: - `database/schema/table` 的解析语义与 `TablePath.of(tableId)` 保持一致。 - 如果 `tableId` 格式不被支持,访问 `database/schema/table` 时会抛出 `IllegalArgumentException`。 - 已有 UDF 保持向后兼容,仍可只实现 `evaluate(List<Object> args)`。 ## UDF 实现示例 将这些依赖项添加到您的 Maven 项目,并使用 provided 作用域。**依赖版本应与运行环境一致。** ```xml <dependencies> <dependency> <groupId>org.apache.seatunnel</groupId> <artifactId>seatunnel-transforms-v2</artifactId> <version>${seatunnel.version}</version> 
<scope>provided</scope> </dependency> <dependency> <groupId>org.apache.seatunnel</groupId> <artifactId>seatunnel-api</artifactId> <version>${seatunnel.version}</version> <scope>provided</scope> </dependency> <dependency> <groupId>com.google.auto.service</groupId> <artifactId>auto-service</artifactId> <version>1.0.1</version> <scope>provided</scope> </dependency> </dependencies> ``` 添加一个 Java 类来实现 ZetaUDF,类似于以下的方式: ```java @AutoService(ZetaUDF.class) public class ExampleUDF implements ZetaUDF { @Override public String functionName() { return "EXAMPLE"; } @Override public SeaTunnelDataType<?> resultType(List<SeaTunnelDataType<?>> argsType) { return BasicType.STRING_TYPE; } @Override public Object evaluate(List<Object> args) { String arg = (String) args.get(0); if (arg == null) return null; return "UDF: " + arg; } } ``` 打包UDF项目并将jar文件复制到路径:${SEATUNNEL_HOME}/lib ## 支持上下文与生命周期的 UDF 示例 ```java @AutoService(ZetaUDF.class) public class ContextLifecycleUdf implements ZetaUDF { private transient String prefix; @Override public String functionName() { return "CTX_LIFE"; } @Override public SeaTunnelDataType<?> resultType(List<SeaTunnelDataType<?>> argsType) { return BasicType.STRING_TYPE; } @Override public boolean requiresContext() { return true; } @Override public void open() { this.prefix = "OPENED"; } @Override public Object evaluateWithContext(List<Object> args, ZetaUDFContext context) { String arg = args.get(0) == null ? 
null : String.valueOf(args.get(0)); if (arg == null) { return null; } return prefix + ":" + context.getRowKind().shortString() + ":" + arg; } @Override public void close() { this.prefix = null; } } ``` ## 示例 源端数据读取的表格如下: | id | name | age | |----|----------|-----| | 1 | Joy Ding | 20 | | 2 | May Ding | 21 | | 3 | Kin Dom | 24 | | 4 | Joy Dom | 22 | 我们使用SQL查询中的UDF来转换源数据,类似于以下方式: ``` transform { Sql { plugin_input = "fake" plugin_output = "fake1" query = "select id, example(name) as name, age from dual" } } ``` 那么结果表 `fake1` 中的数据将会更新为 | id | name | age | |----|---------------|-----| | 1 | UDF: Joy Ding | 20 | | 2 | UDF: May Ding | 21 | | 3 | UDF: Kin Dom | 24 | | 4 | UDF: Joy Dom | 22 | ## 更新日志 ### 新版本 - 添加SQL转换连接器的UDF ================================================ FILE: docs/zh/transforms/sql.md ================================================ # SQL > SQL 转换插件 ## 描述 使用 SQL 来转换给定的输入行。 SQL 转换使用内存中的 SQL 引擎,我们可以通过 SQL 函数和 SQL 引擎的能力来实现转换任务。 ## 属性 | 名称 | 类型 | 是否必须 | 默认值 | |-------------------|--------|------|-----| | plugin_input | string | yes | - | | plugin_output | string | yes | - | | query | string | yes | - | ### plugin_input [string] 源表名称,查询 SQL 表名称必须与此字段匹配。 ### query [string] 查询 SQL,它是一个简单的 SQL,支持基本的函数和条件过滤操作。但是,复杂的 SQL 尚不支持,包括:多源表/行连接和聚合操作等。 查询表达式可以是`select [table_name.]column_a`,这时会去查询列为`column_a`的列,`table_name`为可选项 也可以是`select c_row.c_inner_row.column_b`,这时会去查询列`c_row`下的`c_inner_row`的`column_b`。**嵌套结构查询中,不能存在`table_name`** ## 示例 源端数据读取的表格如下: | id | name | age | |----|----------|-----| | 1 | Joy Ding | 20 | | 2 | May Ding | 21 | | 3 | Kin Dom | 24 | | 4 | Joy Dom | 22 | 我们使用 SQL 查询来转换源数据,类似这样: ``` transform { Sql { plugin_input = "fake" plugin_output = "fake1" query = "select id, concat(name, '_') as name, age+1 as age from dual where id>0" } } ``` 那么结果表 `fake1` 中的数据将会更新为: | id | name | age | |----|-----------|-----| | 1 | Joy Ding_ | 21 | | 2 | May Ding_ | 22 | | 3 | Kin Dom_ | 25 | | 4 | Joy Dom_ | 23 | ### 嵌套结构查询 例如你的上游数据结构是这样: ```hocon source { FakeSource 
{ plugin_output = "fake" row.num = 100 string.template = ["innerQuery"] schema = { fields { name = "string" c_date = "date" c_row = { c_inner_row = { c_inner_int = "int" c_inner_string = "string" c_inner_timestamp = "timestamp" c_map_1 = "map<string, string>" c_map_2 = "map<string, map<string,string>>" } c_string = "string" } } } } } ``` 那么下列所有的查询表达式都是有效的 ```sql select name, c_date, c_row, c_row.c_inner_row, c_row.c_string, c_row.c_inner_row.c_inner_int, c_row.c_inner_row.c_inner_string, c_row.c_inner_row.c_inner_timestamp, c_row.c_inner_row.c_map_1, c_row.c_inner_row.c_map_1.some_key ``` 但是这个查询语句是无效的 ```sql select c_row.c_inner_row.c_map_2.some_key.inner_map_key ``` 当查询map结构时,map结构应该为最后一个数据结构,不能查询嵌套map ## 作业配置示例 ``` env { job.mode = "BATCH" } source { FakeSource { plugin_output = "fake" row.num = 100 schema = { fields { id = "int" name = "string" age = "int" } } } } transform { Sql { plugin_input = "fake" plugin_output = "fake1" query = "select id, concat(name, '_') as name, age+1 as age from dual where id>0" } } sink { Console { plugin_input = "fake1" } } ``` ## 更新日志 ### 新版本 - 添加SQL转换连接器 ================================================ FILE: docs/zh/transforms/table-filter.md ================================================ # TableFilter > TableFilter transform plugin ## Description 表过滤 transform,用于正向或者反向过滤部分表 ## Options | name | type | required | default value | Description | |:----------------:|--------|----------|---------------|--------------------------------------------------------| | database_pattern | string | no | | 指定数据库过滤模式,默认值为 null,表示不过滤。如果要过滤数据库名称,请将其设置为正则表达式。 | | schema_pattern | string | no | | 指定 schema 过滤模式,默认值为 null,表示不过滤。如果要过滤架构名称,请将其设置为正则表达式。 | | table_pattern | string | no | | 指定表过滤模式,默认值为 null,表示不过滤。如果要过滤表名称,请将其设置为正则表达式。 | | pattern_mode | string | no | INCLUDE | 指定过滤模式,默认值为 INCLUDE,表示包含匹配的表。如果要排除匹配的表,请将其设置为 EXCLUDE。 | ## Examples ### 包含表过滤 在数据库 "test" 中包含名称与正则表达式 "user_\d+" 匹配的过滤表。 ```hocon transform { TableFilter { plugin_input = "source1" plugin_output = "transform_a_1" 
database_pattern = "test" table_pattern = "user_\\d+" } } ``` ### 排除表过滤 排除数据库 "test" 中名称与正则表达式 "user_\d+" 匹配的过滤表。 ```hocon transform { TableFilter { plugin_input = "source1" plugin_output = "transform_a_1" database_pattern = "test" table_pattern = "user_\\d+" pattern_mode = "EXCLUDE" } } ``` ================================================ FILE: docs/zh/transforms/table-merge.md ================================================ # TableMerge > TableMerge transform plugin ## Description 表合并插件,用于分库分表合并为一个表。 ## Options | name | type | required | default value | Description | |:--------:|--------|----------|---------------|------------------| | database | string | no | | 指定新的 database 名称 | | schema | string | no | | 指定新的 schema 名称 | | table | string | yes | | 指定新的 table 名称 | ## Examples ### 合并分库分表为一个表 ```hocon env { parallelism = 1 job.mode = "STREAMING" } source { MySQL-CDC { plugin_output = "customers_mysql_cdc" username = "root" password = "123456" table-names = ["source.user_1", "source.user_2", "source.shop"] url = "jdbc:mysql://localhost:3306/source" } } transform { TableMerge { plugin_input = "customers_mysql_cdc" plugin_output = "trans_result" table_match_regex = "source.user_.*" database = "user_db" table = "user_all" } } sink { Jdbc { plugin_input = "trans_result" driver="com.mysql.cj.jdbc.Driver" url="jdbc:mysql://localhost:3306/sink" user="myuser" password="mypwd" generate_sink_sql = true database = "${database_name}" table = "${table_name}" primary_keys = ["${primary_key}"] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ================================================ FILE: docs/zh/transforms/table-rename.md ================================================ # 表重命名 > TableRename 转换插件 ## 描述 TableRename 转换插件用于重命名表名。 ## 选项 | 参数 | 类型 | 必选 | 默认值 | 说明 | |:-----------------------:|--------|------|--------|---------------------------------------------------------------------------------------------------------| | convert_case 
| string | 否 | | 字母大小写转换类型,可选 `UPPER`、`LOWER` | | prefix | string | 否 | | 追加到表名前的前缀 | | suffix | string | 否 | | 追加到表名后的后缀 | | replacements_with_regex | array | 否 | | 正则替换规则数组,元素为包含 `replace_from`、`replace_to` 的映射,用于批量替换表名 | ## 示例 ### 将表名转为大写 ``` env { parallelism = 1 job.mode = "STREAMING" } source { MySQL-CDC { plugin_output = "customers_mysql_cdc" username = "root" password = "123456" table-names = ["source.user_shop", "source.user_order"] url = "jdbc:mysql://localhost:3306/source" } } transform { TableRename { plugin_input = "customers_mysql_cdc" plugin_output = "trans_result" convert_case = "UPPER" prefix = "CDC_" suffix = "_TABLE" replacements_with_regex = [ { replace_from = "user" replace_to = "U" } ] } } sink { Jdbc { plugin_input = "trans_result" driver="oracle.jdbc.OracleDriver" url="jdbc:oracle:thin:@oracle-host:1521/ORCLCDB" user="myuser" password="mypwd" generate_sink_sql = true database = "ORCLCDB" table = "${database_name}.${table_name}" primary_keys = ["${primary_key}"] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = "APPEND_DATA" } } ``` ### 将表名转为小写 ``` env { parallelism = 1 job.mode = "STREAMING" } source { Oracle-CDC { plugin_output = "customers_oracle_cdc" url = "jdbc:oracle:thin:@localhost:1521/ORCLCDB" username = "dbzuser" password = "dbz" database-names = ["ORCLCDB"] schema-names = ["DEBEZIUM"] table-names = ["SOURCE.USER_SHOP", "SOURCE.USER_ORDER"] } } transform { TableRename { plugin_input = "customers_oracle_cdc" plugin_output = "trans_result" convert_case = "LOWER" prefix = "cdc_" suffix = "_table" replacements_with_regex = [ { replace_from = "USER" replace_to = "u" } ] } } sink { Jdbc { plugin_input = "trans_result" url = "jdbc:mysql://localhost:3306/test" driver = "com.mysql.cj.jdbc.Driver" user = "st_user_sink" password = "mysqlpw" generate_sink_sql = true database = "${schema_name}" table = "${table_name}" primary_keys = ["${primary_key}"] schema_save_mode = "CREATE_SCHEMA_WHEN_NOT_EXIST" data_save_mode = 
"APPEND_DATA" } } ``` ================================================ FILE: docs/zh/transforms/transform-multi-table.md ================================================ --- sidebar_position: 2 --- # Transform的多表转换 SeaTunnel transform支持多表转换,在上游插件输出多个表的时候特别有用,能够在一个transform中完成所有的转换操作。目前SeaTunnel很多Connectors支持多表输出,比如`JDBCSource`、`MySQL-CDC` 等。所有的Transform都可以通过如下配置实现多表转换。 :::tip 多表Transform没有对Transform能力的限制,任何Transform的配置都可以在多表Transform中使用。多表Transform的作用针对数据流中的多个表进行单独的处理,并将多个表的Transform配置合并到一个Transform中,方便用户管理。 ::: ## 属性 | Name | Type | Required | Default | Description | |----------------------------|--------|----------|---------|--------------------------------------------------------------------------------------------------| | table_match_regex | String | No | .* | 表名的正则表达式,通过正则表达式来匹配需要进行转换的表,默认匹配所有的表。注意这个表名是上游的真正表名,不是`plugin_output`。 | | table_transform | List | No | - | 可以通过table_transform列表来指定部分表的规则,当在table_transform中配置某个表的转换规则后,外层针对当前表的规则不会生效,以table_transform中的为准 | | table_transform.table_path | String | No | - | 当在table_transform中配置某个表的转换规则后,需要使用table_path字段指定表名,表名需要包含`databaseName[.schemaName].tableName`。 | ## 匹配逻辑 假设我们从上游读取了5张表,分别为`test.abc`,`test.abcd`,`test.xyz`,`test.xyzxyz`,`test.www`。他们的表结构一致,都有`id`、`name`、`age`三个字段。 | id | name | age | 现在我们想通过Copy transform将这5张表的数据进行复制,具体需求是,`test.abc`,`test.abcd`表需要将`name`复制为`name1`, `test.xyz`表需要复制为`name2`,`test.xyzxyz`表需要复制为`name3`,`test.www`数据结构不变。那么我们可以通过如下配置来实现: ```hocon transform { Copy { plugin_input = "fake" // 可选的读取数据集名 plugin_output = "fake1" // 可选的输出数据集名 table_match_regex = "test.a.*" // 1. 通过正则表达式匹配需要进行转换的表,test.a.*表示匹配test.abc和test.abcd src_field = "name" // 源字段 dest_field = "name1" // 目标字段 table_transform = [{ table_path = "test.xyz" // 2. 指定表名进行转换 src_field = "name" // 源字段 dest_field = "name2" // 目标字段 }, { table_path = "test.xyzxyz" src_field = "name" dest_field = "name3" }] } } ``` ### 解释 1. 通过第一层的正则表达式,和对应的Copy transform options配置,我们可以匹配到`test.abc`和`test.abcd`表,将`name`字段复制为`name1`。 2. 
通过`table_transform`配置,我们可以指定`test.xyz`表,将`name`字段复制为`name2`。 这样我们就可以通过一个transform完成对多个表的转换操作。 对于每个表来说,配置的优先级是:`table_transform` > `table_match_regex`。如果所有的规则都没有匹配到,那么该表将不会进行任何转换操作。 针对每个表来说,他们的Transform配置是: - **test.abc**和**test.abcd** ```hocon transform { Copy { src_field = "name" dest_field = "name1" } } ``` 输出表结构: | id | name | age | name1 | - **test.xyz** ```hocon transform { Copy { src_field = "name" dest_field = "name2" } } ``` 输出表结构: | id | name | age | name2 | - **test.xyzxyz** ```hocon transform { Copy { src_field = "name" dest_field = "name3" } } ``` 输出表结构: | id | name | age | name3 | - **test.www** ```hocon transform { // 无需转换 } ``` 输出表结构: | id | name | age | 我们使用了Copy Transform作为了示例,实际上所有的Transform都支持多表转换,只需要在对应的Transform中配置即可。 ================================================ FILE: mvnw ================================================ #!/bin/sh # ---------------------------------------------------------------------------- # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
# ---------------------------------------------------------------------------- # ---------------------------------------------------------------------------- # Maven Start Up Batch script # # Required ENV vars: # ------------------ # JAVA_HOME - location of a JDK home dir # # Optional ENV vars # ----------------- # M2_HOME - location of maven2's installed home dir # MAVEN_OPTS - parameters passed to the Java VM when running Maven # e.g. to debug Maven itself, use # set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 # MAVEN_SKIP_RC - flag to disable loading of mavenrc files # ---------------------------------------------------------------------------- if [ -z "$MAVEN_SKIP_RC" ] ; then if [ -f /usr/local/etc/mavenrc ] ; then . /usr/local/etc/mavenrc fi if [ -f /etc/mavenrc ] ; then . /etc/mavenrc fi if [ -f "$HOME/.mavenrc" ] ; then . "$HOME/.mavenrc" fi fi # OS specific support. $var _must_ be set to either true or false. cygwin=false; darwin=false; mingw=false case "`uname`" in CYGWIN*) cygwin=true ;; MINGW*) mingw=true;; Darwin*) darwin=true # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home # See https://developer.apple.com/library/mac/qa/qa1170/_index.html if [ -z "$JAVA_HOME" ]; then if [ -x "/usr/libexec/java_home" ]; then export JAVA_HOME="`/usr/libexec/java_home`" else export JAVA_HOME="/Library/Java/Home" fi fi ;; esac if [ -z "$JAVA_HOME" ] ; then if [ -r /etc/gentoo-release ] ; then JAVA_HOME=`java-config --jre-home` fi fi if [ -z "$M2_HOME" ] ; then ## resolve links - $0 may be a link to maven's home PRG="$0" # need this for relative symlinks while [ -h "$PRG" ] ; do ls=`ls -ld "$PRG"` link=`expr "$ls" : '.*-> \(.*\)$'` if expr "$link" : '/.*' > /dev/null; then PRG="$link" else PRG="`dirname "$PRG"`/$link" fi done saveddir=`pwd` M2_HOME=`dirname "$PRG"`/.. 
# make it fully qualified M2_HOME=`cd "$M2_HOME" && pwd` cd "$saveddir" # echo Using m2 at $M2_HOME fi # For Cygwin, ensure paths are in UNIX format before anything is touched if $cygwin ; then [ -n "$M2_HOME" ] && M2_HOME=`cygpath --unix "$M2_HOME"` [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"` fi # For Mingw, ensure paths are in UNIX format before anything is touched if $mingw ; then [ -n "$M2_HOME" ] && M2_HOME="`(cd "$M2_HOME"; pwd)`" [ -n "$JAVA_HOME" ] && JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`" fi if [ -z "$JAVA_HOME" ]; then javaExecutable="`which javac`" if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then # readlink(1) is not available as standard on Solaris 10. readLink=`which readlink` if [ ! `expr "$readLink" : '\([^ ]*\)'` = "no" ]; then if $darwin ; then javaHome="`dirname \"$javaExecutable\"`" javaExecutable="`cd \"$javaHome\" && pwd -P`/javac" else javaExecutable="`readlink -f \"$javaExecutable\"`" fi javaHome="`dirname \"$javaExecutable\"`" javaHome=`expr "$javaHome" : '\(.*\)/bin'` JAVA_HOME="$javaHome" export JAVA_HOME fi fi fi if [ -z "$JAVACMD" ] ; then if [ -n "$JAVA_HOME" ] ; then if [ -x "$JAVA_HOME/jre/sh/java" ] ; then # IBM's JDK on AIX uses strange locations for the executables JAVACMD="$JAVA_HOME/jre/sh/java" else JAVACMD="$JAVA_HOME/bin/java" fi else JAVACMD="`\\unset -f command; \\command -v java`" fi fi if [ ! -x "$JAVACMD" ] ; then echo "Error: JAVA_HOME is not defined correctly." >&2 echo " We cannot execute $JAVACMD" >&2 exit 1 fi if [ -z "$JAVA_HOME" ] ; then echo "Warning: JAVA_HOME environment variable is not set." 
fi CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher # traverses directory structure from process work directory to filesystem root # first directory with .mvn subdirectory is considered project base directory find_maven_basedir() { if [ -z "$1" ] then echo "Path not specified to find_maven_basedir" return 1 fi basedir="$1" wdir="$1" while [ "$wdir" != '/' ] ; do if [ -d "$wdir"/.mvn ] ; then basedir=$wdir break fi # workaround for JBEAP-8937 (on Solaris 10/Sparc) if [ -d "${wdir}" ]; then wdir=`cd "$wdir/.."; pwd` fi # end of workaround done echo "${basedir}" } # concatenates all lines of a file concat_lines() { if [ -f "$1" ]; then echo "$(tr -s '\n' ' ' < "$1")" fi } BASE_DIR=`find_maven_basedir "$(pwd)"` if [ -z "$BASE_DIR" ]; then exit 1; fi ########################################################################################## # Extension to allow automatically downloading the maven-wrapper.jar from Maven-central # This allows using the maven wrapper in projects that prohibit checking in binary data. ########################################################################################## if [ -r "$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" ]; then if [ "$MVNW_VERBOSE" = true ]; then echo "Found .mvn/wrapper/maven-wrapper.jar" fi else if [ "$MVNW_VERBOSE" = true ]; then echo "Couldn't find .mvn/wrapper/maven-wrapper.jar, downloading it ..." 
fi if [ -n "$MVNW_REPOURL" ]; then jarUrl="$MVNW_REPOURL/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar" else jarUrl="https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar" fi while IFS="=" read key value; do case "$key" in (wrapperUrl) jarUrl="$value"; break ;; esac done < "$BASE_DIR/.mvn/wrapper/maven-wrapper.properties" if [ "$MVNW_VERBOSE" = true ]; then echo "Downloading from: $jarUrl" fi wrapperJarPath="$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" if $cygwin; then wrapperJarPath=`cygpath --path --windows "$wrapperJarPath"` fi if command -v wget > /dev/null; then if [ "$MVNW_VERBOSE" = true ]; then echo "Found wget ... using wget" fi if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then wget "$jarUrl" -O "$wrapperJarPath" || rm -f "$wrapperJarPath" else wget --http-user=$MVNW_USERNAME --http-password=$MVNW_PASSWORD "$jarUrl" -O "$wrapperJarPath" || rm -f "$wrapperJarPath" fi elif command -v curl > /dev/null; then if [ "$MVNW_VERBOSE" = true ]; then echo "Found curl ... using curl" fi if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then curl -o "$wrapperJarPath" "$jarUrl" -f else curl --user $MVNW_USERNAME:$MVNW_PASSWORD -o "$wrapperJarPath" "$jarUrl" -f fi else if [ "$MVNW_VERBOSE" = true ]; then echo "Falling back to using Java to download" fi javaClass="$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.java" # For Cygwin, switch paths to Windows format before running javac if $cygwin; then javaClass=`cygpath --path --windows "$javaClass"` fi if [ -e "$javaClass" ]; then if [ ! -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then if [ "$MVNW_VERBOSE" = true ]; then echo " - Compiling MavenWrapperDownloader.java ..." fi # Compiling the Java class ("$JAVA_HOME/bin/javac" "$javaClass") fi if [ -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then # Running the downloader if [ "$MVNW_VERBOSE" = true ]; then echo " - Running MavenWrapperDownloader.java ..." 
fi ("$JAVA_HOME/bin/java" -cp .mvn/wrapper MavenWrapperDownloader "$MAVEN_PROJECTBASEDIR") fi fi fi fi ########################################################################################## # End of extension ########################################################################################## export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"} if [ "$MVNW_VERBOSE" = true ]; then echo $MAVEN_PROJECTBASEDIR fi MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS" # For Cygwin, switch paths to Windows format before running java if $cygwin; then [ -n "$M2_HOME" ] && M2_HOME=`cygpath --path --windows "$M2_HOME"` [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"` [ -n "$MAVEN_PROJECTBASEDIR" ] && MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"` fi # Provide a "standardized" way to retrieve the CLI args that will # work with both Windows and non-Windows executions. MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $@" export MAVEN_CMD_LINE_ARGS WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain exec "$JAVACMD" \ $MAVEN_OPTS \ $MAVEN_DEBUG_OPTS \ -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \ "-Dmaven.home=${M2_HOME}" \ "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \ ${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@" ================================================ FILE: mvnw.cmd ================================================ @REM ---------------------------------------------------------------------------- @REM Licensed to the Apache Software Foundation (ASF) under one @REM or more contributor license agreements. See the NOTICE file @REM distributed with this work for additional information @REM regarding copyright ownership. The ASF licenses this file @REM to you under the Apache License, Version 2.0 (the @REM "License"); you may not use this file except in compliance @REM with the License. 
You may obtain a copy of the License at @REM @REM http://www.apache.org/licenses/LICENSE-2.0 @REM @REM Unless required by applicable law or agreed to in writing, @REM software distributed under the License is distributed on an @REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @REM KIND, either express or implied. See the License for the @REM specific language governing permissions and limitations @REM under the License. @REM ---------------------------------------------------------------------------- @REM ---------------------------------------------------------------------------- @REM Maven Start Up Batch script @REM @REM Required ENV vars: @REM JAVA_HOME - location of a JDK home dir @REM @REM Optional ENV vars @REM M2_HOME - location of maven2's installed home dir @REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands @REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a keystroke before ending @REM MAVEN_OPTS - parameters passed to the Java VM when running Maven @REM e.g. 
to debug Maven itself, use @REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 @REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files @REM ---------------------------------------------------------------------------- @REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on' @echo off @REM set title of command window title %0 @REM enable echoing by setting MAVEN_BATCH_ECHO to 'on' @if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO% @REM set %HOME% to equivalent of $HOME if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%") @REM Execute a user defined script before this one if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre @REM check for pre script, once with legacy .bat ending and once with .cmd ending if exist "%USERPROFILE%\mavenrc_pre.bat" call "%USERPROFILE%\mavenrc_pre.bat" %* if exist "%USERPROFILE%\mavenrc_pre.cmd" call "%USERPROFILE%\mavenrc_pre.cmd" %* :skipRcPre @setlocal set ERROR_CODE=0 @REM To isolate internal variables from possible post scripts, we use another setlocal @setlocal @REM ==== START VALIDATION ==== if not "%JAVA_HOME%" == "" goto OkJHome echo. echo Error: JAVA_HOME not found in your environment. >&2 echo Please set the JAVA_HOME variable in your environment to match the >&2 echo location of your Java installation. >&2 echo. goto error :OkJHome if exist "%JAVA_HOME%\bin\java.exe" goto init echo. echo Error: JAVA_HOME is set to an invalid directory. >&2 echo JAVA_HOME = "%JAVA_HOME%" >&2 echo Please set the JAVA_HOME variable in your environment to match the >&2 echo location of your Java installation. >&2 echo. goto error @REM ==== END VALIDATION ==== :init @REM Find the project base dir, i.e. the directory that contains the folder ".mvn". @REM Fallback to current working directory if not found. set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR% IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir set EXEC_DIR=%CD% set WDIR=%EXEC_DIR% :findBaseDir IF EXIST "%WDIR%"\.mvn goto baseDirFound cd .. 
IF "%WDIR%"=="%CD%" goto baseDirNotFound set WDIR=%CD% goto findBaseDir :baseDirFound set MAVEN_PROJECTBASEDIR=%WDIR% cd "%EXEC_DIR%" goto endDetectBaseDir :baseDirNotFound set MAVEN_PROJECTBASEDIR=%EXEC_DIR% cd "%EXEC_DIR%" :endDetectBaseDir IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig @setlocal EnableExtensions EnableDelayedExpansion for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a @endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS% :endReadAdditionalConfig SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe" set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar" set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain set DOWNLOAD_URL="https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar" FOR /F "usebackq tokens=1,2 delims==" %%A IN ("%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties") DO ( IF "%%A"=="wrapperUrl" SET DOWNLOAD_URL=%%B ) @REM Extension to allow automatically downloading the maven-wrapper.jar from Maven-central @REM This allows using the maven wrapper in projects that prohibit checking in binary data. if exist %WRAPPER_JAR% ( if "%MVNW_VERBOSE%" == "true" ( echo Found %WRAPPER_JAR% ) ) else ( if not "%MVNW_REPOURL%" == "" ( SET DOWNLOAD_URL="%MVNW_REPOURL%/org/apache/maven/wrapper/maven-wrapper/3.1.0/maven-wrapper-3.1.0.jar" ) if "%MVNW_VERBOSE%" == "true" ( echo Couldn't find %WRAPPER_JAR%, downloading it ... 
echo Downloading from: %DOWNLOAD_URL% ) powershell -Command "&{"^ "$webclient = new-object System.Net.WebClient;"^ "if (-not ([string]::IsNullOrEmpty('%MVNW_USERNAME%') -and [string]::IsNullOrEmpty('%MVNW_PASSWORD%'))) {"^ "$webclient.Credentials = new-object System.Net.NetworkCredential('%MVNW_USERNAME%', '%MVNW_PASSWORD%');"^ "}"^ "[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12; $webclient.DownloadFile('%DOWNLOAD_URL%', '%WRAPPER_JAR%')"^ "}" if "%MVNW_VERBOSE%" == "true" ( echo Finished downloading %WRAPPER_JAR% ) ) @REM End of extension @REM Provide a "standardized" way to retrieve the CLI args that will @REM work with both Windows and non-Windows executions. set MAVEN_CMD_LINE_ARGS=%* %MAVEN_JAVA_EXE% ^ %JVM_CONFIG_MAVEN_PROPS% ^ %MAVEN_OPTS% ^ %MAVEN_DEBUG_OPTS% ^ -classpath %WRAPPER_JAR% ^ "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" ^ %WRAPPER_LAUNCHER% %MAVEN_CONFIG% %* if ERRORLEVEL 1 goto error goto end :error set ERROR_CODE=1 :end @endlocal & set ERROR_CODE=%ERROR_CODE% if not "%MAVEN_SKIP_RC%"=="" goto skipRcPost @REM check for post script, once with legacy .bat ending and once with .cmd ending if exist "%USERPROFILE%\mavenrc_post.bat" call "%USERPROFILE%\mavenrc_post.bat" if exist "%USERPROFILE%\mavenrc_post.cmd" call "%USERPROFILE%\mavenrc_post.cmd" :skipRcPost @REM pause the script if MAVEN_BATCH_PAUSE is set to 'on' if "%MAVEN_BATCH_PAUSE%"=="on" pause if "%MAVEN_TERMINATE_CMD%"=="on" exit %ERROR_CODE% cmd /C exit /B %ERROR_CODE% ================================================ FILE: plugin-mapping.properties ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. 
# The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # This mapping is used to resolve the Jar package name without version (also called the artifactId) # corresponding to the module in the user Config, helping SeaTunnel to load the correct Jar package. ## *** WARNING *** : in `seatunnel.source.XXX`, `XXX` must be the string returned by SeaTunnelSource::getPluginName and TableSinkFactory::factoryIdentifier ## # SeaTunnel Connector-V2 seatunnel.source.FakeSource = connector-fake seatunnel.sink.Console = connector-console seatunnel.sink.Assert = connector-assert seatunnel.source.Kafka = connector-kafka seatunnel.sink.Kafka = connector-kafka seatunnel.source.Http = connector-http-base seatunnel.sink.Http = connector-http-base seatunnel.sink.Feishu = connector-http-feishu seatunnel.source.Socket = connector-socket seatunnel.sink.Hive = connector-hive seatunnel.source.Hive = connector-hive seatunnel.source.Clickhouse = connector-clickhouse seatunnel.sink.Clickhouse = connector-clickhouse seatunnel.sink.ClickhouseFile = connector-clickhouse seatunnel.source.Jdbc = connector-jdbc seatunnel.sink.Jdbc = connector-jdbc seatunnel.source.Kudu = connector-kudu seatunnel.sink.Kudu = connector-kudu seatunnel.sink.EmailSink = connector-email seatunnel.source.HdfsFile = connector-file-hadoop seatunnel.sink.HdfsFile = connector-file-hadoop seatunnel.source.LocalFile = connector-file-local seatunnel.sink.LocalFile = connector-file-local seatunnel.source.OssFile =
connector-file-oss seatunnel.sink.OssFile = connector-file-oss seatunnel.source.OssJindoFile = connector-file-jindo-oss seatunnel.sink.OssJindoFile = connector-file-jindo-oss seatunnel.source.CosFile = connector-file-cos seatunnel.sink.CosFile = connector-file-cos seatunnel.source.Pulsar = connector-pulsar seatunnel.sink.DingTalk = connector-dingtalk seatunnel.source.Elasticsearch = connector-elasticsearch seatunnel.sink.Elasticsearch = connector-elasticsearch seatunnel.source.IoTDB = connector-iotdb seatunnel.sink.IoTDB = connector-iotdb seatunnel.source.IoTDBv2 = connector-iotdb-v2 seatunnel.sink.IoTDBv2 = connector-iotdb-v2 seatunnel.source.Neo4j = connector-neo4j seatunnel.sink.Neo4j = connector-neo4j seatunnel.source.FtpFile = connector-file-ftp seatunnel.sink.FtpFile = connector-file-ftp seatunnel.source.SftpFile = connector-file-sftp seatunnel.sink.SftpFile = connector-file-sftp seatunnel.sink.Socket = connector-socket seatunnel.source.Redis = connector-redis seatunnel.sink.Redis = connector-redis seatunnel.sink.Databend = connector-databend seatunnel.source.Databend = connector-databend seatunnel.sink.DataHub = connector-datahub seatunnel.sink.Sentry = connector-sentry seatunnel.source.MongoDB = connector-mongodb seatunnel.sink.MongoDB = connector-mongodb seatunnel.source.Iceberg = connector-iceberg seatunnel.sink.Iceberg = connector-iceberg seatunnel.source.InfluxDB = connector-influxdb seatunnel.source.S3File = connector-file-s3 seatunnel.sink.S3File = connector-file-s3 seatunnel.source.AmazonDynamodb = connector-amazondynamodb seatunnel.sink.AmazonDynamodb = connector-amazondynamodb seatunnel.source.Cassandra = connector-cassandra seatunnel.sink.Cassandra = connector-cassandra seatunnel.sink.StarRocks = connector-starrocks seatunnel.source.MyHours = connector-http-myhours seatunnel.sink.InfluxDB = connector-influxdb seatunnel.source.GoogleSheets = connector-google-sheets seatunnel.sink.GoogleFirestore = connector-google-firestore 
seatunnel.sink.Tablestore = connector-tablestore seatunnel.source.Tablestore = connector-tablestore seatunnel.source.Lemlist = connector-http-lemlist seatunnel.source.Klaviyo = connector-http-klaviyo seatunnel.sink.Slack = connector-slack seatunnel.source.OneSignal = connector-http-onesignal seatunnel.source.Jira = connector-http-jira seatunnel.source.Gitlab = connector-http-gitlab seatunnel.source.Github = connector-http-github seatunnel.source.Notion = connector-http-notion seatunnel.source.Airtable = connector-http-airtable seatunnel.sink.Airtable = connector-http-airtable seatunnel.sink.RabbitMQ = connector-rabbitmq seatunnel.source.RabbitMQ = connector-rabbitmq seatunnel.source.OpenMldb = connector-openmldb seatunnel.source.SqlServer-CDC = connector-cdc-sqlserver seatunnel.source.Doris = connector-doris seatunnel.sink.Doris = connector-doris seatunnel.source.Maxcompute = connector-maxcompute seatunnel.sink.Maxcompute = connector-maxcompute seatunnel.source.MySQL-CDC = connector-cdc-mysql seatunnel.source.MongoDB-CDC = connector-cdc-mongodb seatunnel.source.TiDB-CDC = connector-cdc-tidb seatunnel.sink.S3Redshift = connector-s3-redshift seatunnel.source.Web3j = connector-web3j seatunnel.source.TDengine = connector-tdengine seatunnel.sink.TDengine = connector-tdengine seatunnel.source.Persistiq = connector-http-persistiq seatunnel.sink.SelectDBCloud = connector-selectdb-cloud seatunnel.source.Hbase = connector-hbase seatunnel.sink.Hbase = connector-hbase seatunnel.source.StarRocks = connector-starrocks seatunnel.source.Rocketmq = connector-rocketmq seatunnel.sink.Rocketmq = connector-rocketmq seatunnel.source.AmazonSqs = connector-amazonsqs seatunnel.sink.AmazonSqs = connector-amazonsqs seatunnel.source.Paimon = connector-paimon seatunnel.sink.Paimon = connector-paimon seatunnel.sink.hudi = connector-hudi seatunnel.sink.Druid = connector-druid seatunnel.source.Easysearch = connector-easysearch seatunnel.sink.Easysearch = connector-easysearch 
seatunnel.source.Postgres-CDC = connector-cdc-postgres seatunnel.source.Oracle-CDC = connector-cdc-oracle seatunnel.sink.Pulsar = connector-pulsar seatunnel.source.ObsFile = connector-file-obs seatunnel.sink.ObsFile = connector-file-obs seatunnel.source.Milvus = connector-milvus seatunnel.sink.Milvus = connector-milvus seatunnel.sink.ActiveMQ = connector-activemq seatunnel.source.Prometheus = connector-prometheus seatunnel.sink.Prometheus = connector-prometheus seatunnel.source.Qdrant = connector-qdrant seatunnel.sink.Qdrant = connector-qdrant seatunnel.source.Sls = connector-sls seatunnel.sink.Sls = connector-sls seatunnel.source.Typesense = connector-typesense seatunnel.sink.Typesense = connector-typesense seatunnel.source.Opengauss-CDC = connector-cdc-opengauss seatunnel.source.GraphQL = connector-graphql seatunnel.sink.GraphQL = connector-graphql seatunnel.sink.Aerospike = connector-aerospike seatunnel.sink.SensorsData = connector-sensorsdata seatunnel.sink.HugeGraph = connector-hugegraph seatunnel.sink.Fluss = connector-fluss seatunnel.sink.Lance = connector-lance # For custom transforms, make sure to use the seatunnel.transform.[PluginIdentifier]=[JarPrefix] naming convention. For example: # seatunnel.transform.Sql = seatunnel-transforms-v2 ================================================ FILE: plugins/README.md ================================================ # Connector Isolated Dependency Loading Mechanism SeaTunnel provides an isolated dependency loading mechanism for each connector, making it easier for users to manage individual dependencies for different connectors, while avoiding dependency conflicts and improving system extensibility. When loading a connector, SeaTunnel will search for and load the connector's own dependency jars from the `${SEATUNNEL_HOME}/plugins/connector-xxx` directory.
This ensures that the dependencies required by different connectors do not interfere with each other, which is helpful for managing a large number of connectors in complex environments. ## Principle Each connector needs to place its own dependency jars in a dedicated subdirectory under `${SEATUNNEL_HOME}/plugins/connector-xxx` (manual creation required). The subdirectory name is specified by the value in the `plugin-mapping` file. When SeaTunnel starts and loads connectors, it will only load jars from the corresponding directory, thus achieving dependency isolation. Currently, the Zeta engine ensures that jars for different connectors in the same job are loaded separately. The other two engines still load all connector dependency jars together, so placing different versions of jars for the same job in Spark/Flink environments may cause dependency conflicts. ## Directory Structure Example - Use `${SEATUNNEL_HOME}/connectors/plugin-mapping.properties` to get the folder name for each connector. For example, for AmazonDynamodb, suppose the following configuration exists in the `plugin-mapping` file: ``` seatunnel.source.AmazonDynamodb = connector-amazondynamodb ``` The corresponding connector dependency directory is the value `connector-amazondynamodb`. The final directory structure is as follows: ``` SEATUNNEL_HOME/ plugins/ connector-amazondynamodb/ dependency1.jar dependency2.jar connector-xxx/ dependencyA.jar dependencyB.jar ``` ## Limitations - For the Zeta engine, please ensure that the `${SEATUNNEL_HOME}/plugins/connector-xxx` directory structure is consistent across all nodes. Each node must contain the same subdirectories and dependency jars. - Any directory or jar that does not start with `connector-` will be treated as a common dependency directory, and all engines and connectors will load such jars. - In the Zeta engine, you can achieve shared dependencies for all connectors by placing common jars in the `${SEATUNNEL_HOME}/lib/` directory. 
## Verification - By checking the job logs, you can confirm that each connector only loads its own dependency jars. ```log 2025-08-13T17:55:48.7732601Z [] 2025-08-13 17:55:47,270 INFO org.apache.seatunnel.plugin.discovery.AbstractPluginDiscovery - find connector jar and dependency for PluginIdentifier{engineType='seatunnel', pluginType='source', pluginName='Jdbc'}: [file:/tmp/seatunnel/plugins/Jdbc/lib/vertica-jdbc-12.0.3-0.jar, file:/tmp/seatunnel/connectors/connector-jdbc-3.0.0-SNAPSHOT-2.12.15.jar] ``` ================================================ FILE: pom.xml ================================================ 4.0.0 org.apache apache 31 org.apache.seatunnel seatunnel ${revision} pom SeaTunnel : Production ready big data processing product based on Apache Spark and Apache Flink. seatunnel-config seatunnel-common seatunnel-core seatunnel-transforms-v2 seatunnel-connectors-v2 seatunnel-api seatunnel-translation seatunnel-plugin-discovery seatunnel-formats seatunnel-engine seatunnel-examples seatunnel-e2e seatunnel-shade seatunnel-ci-tools 3.0.0-SNAPSHOT 2.1.1 UTF-8 1.8 2.12.15 2.12 ${java.version} ${java.version} 1.2.1 2.0.9 1.7.36 2.17.1 3.4.4 1.2.17 1.2.3 1.2 1.13.6 1.15.3 1.20.1 2.4.0 3.3.0 2.4 1.9.4 1.4 1.7 1.8.1 1.9.13 1.19 2.1 2.7 2.13.3 1.18.24 1.20 1.11.1 false false false 2.22.2 2.22.2 1.6.8 3.0.1 2.9.1 2.8.2 3.10.1 3.8 6.3.1 7.5.1 2.7.5-7.0 3.18.0 2.11.0 4.4 1.10.0 3.3.0 1.8.0 provided provided 1.13 3.0.0 apache seatunnel ${project.version} true true true 1.81 4.13.2 5.9.0 5.4.0 4.11.0 1.3.3 3.4.1 3.2.0 4.0.4 1.3.0 1.20 2.17.1 0.38.0 3.1.1 1.3.0 2.0.0 1.17.6 2.29.0 4.9 2.7.0 4.0.16 2.12.15 9.4.56.v20240826 4.0.4 1.5.0 false true 7 27.0-jre 1.0.1 2.6.5 org.apache.seatunnel.shade 1.1.8.3 3.10.0 4.2.0 true false 0.16.0 true 3.1.4 2.31.30 15.0.1 4.12.0 org.slf4j slf4j-api ${slf4j.version} org.apache.logging.log4j log4j-slf4j-impl ${log4j2.version} org.apache.logging.log4j log4j-api ${log4j2.version} org.apache.logging.log4j log4j-core ${log4j2.version} 
com.lmax disruptor ${log4j2-disruptor.version} org.xerial.snappy snappy-java ${snappy-java.version} org.slf4j jcl-over-slf4j ${slf4j.version} org.apache.logging.log4j log4j-1.2-api ${log4j2.version} org.slf4j log4j-over-slf4j ${slf4j.version} provided org.slf4j slf4j-log4j12 ${slf4j.version} provided org.apache.logging.log4j log4j-to-slf4j ${log4j2.version} provided org.slf4j slf4j-jdk14 ${slf4j.version} provided org.slf4j slf4j-jcl ${slf4j.version} provided org.slf4j slf4j-nop ${slf4j.version} provided org.slf4j slf4j-simple ${slf4j.version} provided org.slf4j slf4j-reload4j ${slf4j.version} provided commons-logging commons-logging ${commons-logging.version} provided log4j log4j ${log4j.version} provided ch.qos.logback logback-classic ${logback.version} provided ch.qos.logback logback-core ${logback.version} provided org.apache.seatunnel seatunnel-config-shade ${project.version} commons-codec commons-codec ${codec.version} com.squareup.okhttp3 okhttp ${okhttp.version} com.squareup.okhttp3 mockwebserver ${okhttp.version} test org.apache.flink flink-shaded-hadoop-2 ${flink-shaded-hadoop-2.version} xml-apis xml-apis org.slf4j slf4j-log4j12 org.projectlombok lombok ${lombok.version} provided org.apache.commons commons-lang3 ${commons-lang3.version} org.apache.commons commons-collections4 ${commons-collections4.version} org.apache.commons commons-csv ${commons-csv.version} com.beust jcommander ${jcommander.version} org.junit junit-bom ${junit5.version} pom import junit junit ${junit4.version} org.mockito mockito-junit-jupiter ${mockito.version} test com.fasterxml.jackson.core jackson-annotations ${jackson.version} com.fasterxml.jackson.datatype jackson-datatype-jsr310 ${jackson.version} com.fasterxml.jackson.core jackson-core ${jackson.version} com.fasterxml.jackson.core jackson-databind ${jackson.version} org.apache.commons commons-compress ${commons-compress.version} org.testcontainers testcontainers ${testcontainer.version} test org.slf4j slf4j-api junit junit 
com.typesafe config ${config.version} org.scala-lang scala-library ${scala.version} com.google.guava guava ${guava.version} org.checkerframework checker-qual ${checker.qual.version} org.awaitility awaitility ${awaitility.version} test commons-io commons-io ${commons-io.version} io.protostuff protostuff-core ${protostuff.version} io.protostuff protostuff-runtime ${protostuff.version} com.google.auto.service auto-service ${auto-service.version} provided org.apache.seatunnel seatunnel-hadoop3-3.1.4-uber ${project.version} optional provided org.apache.arrow arrow-vector ${arrow.version} org.apache.arrow arrow-memory-netty ${arrow.version} org.apache.hugegraph hugegraph-client ${hugegraph.client.version} com.google.auto.service auto-service ${auto-service.version} provided org.projectlombok lombok org.slf4j slf4j-api org.apache.logging.log4j log4j-slf4j-impl org.apache.logging.log4j log4j-api org.apache.logging.log4j log4j-core org.slf4j jcl-over-slf4j org.apache.logging.log4j log4j-1.2-api org.junit.jupiter junit-jupiter-engine test org.junit.jupiter junit-jupiter-params test org.mockito mockito-junit-jupiter test org.mockito mockito-inline ${mockito.version} test com.github.stefanbirkner system-lambda ${system-rules.version} test org.powermock powermock-module-junit4 ${powermock.version} test org.powermock powermock-api-mockito2 ${powermock.version} test io.prometheus simpleclient ${prometheus.simpleclient.version} io.prometheus simpleclient_hotspot ${prometheus.simpleclient.version} io.prometheus simpleclient_httpserver ${prometheus.simpleclient.version} ${project.artifactId}-${project.version}-${scala.version} org.apache.maven.plugins maven-compiler-plugin ${maven-compiler-plugin.version} ${maven.compiler.source} ${maven.compiler.target} true org.apache.maven.plugins maven-surefire-plugin ${maven-surefire-plugin.version} ${skipUT} ${project.build.directory}/jacoco.exec **/*IT.java org.slf4j:slf4j-jdk14 org.slf4j:slf4j-jcl org.slf4j:slf4j-nop org.slf4j:slf4j-simple 
org.slf4j:slf4j-reload4j org.slf4j:slf4j-log4j12 org.slf4j:log4j-over-slf4j commons-logging:commons-logging log4j:log4j ch.qos.logback:logback-classic ch.qos.logback:logback-core org.apache.logging.log4j:log4j-to-slf4j org.apache.maven.plugins maven-failsafe-plugin ${maven-failsafe-plugin.version} ${skipIT} integration-test verify io.fabric8 docker-maven-plugin ${docker-maven-plugin.version} org.apache.maven.plugins maven-shade-plugin ${maven-shade-plugin.version} false true true org.slf4j:* ch.qos.logback:* log4j:* org.apache.logging.log4j:* commons-logging:* *:* META-INF/*.SF META-INF/*.DSA META-INF/*.RSA shade package org.apache.maven.plugins maven-assembly-plugin ${maven-assembly-plugin.version} org.apache.maven.plugins maven-source-plugin ${maven-source-plugin.version} attach-sources jar-no-fork org.apache.maven.plugins maven-javadoc-plugin ${maven-javadoc-plugin.version} ${maven.compiler.source} false true ${maven.javadoc.skip} -Xdoclint:none attach-javadocs jar org.codehaus.mojo build-helper-maven-plugin ${maven-helper-plugin.version} pl.project13.maven git-commit-id-plugin ${maven-git-commit-id-plugin.version} org.codehaus.mojo license-maven-plugin ${maven-license-maven-plugin} ${project.basedir}/seatunnel-dist/target/ THIRD-PARTY.txt false false true 30000 30000 30000 test,provided org.codehaus.mojo flatten-maven-plugin ${flatten-maven-plugin.version} true resolveCiFriendliesOnly flatten flatten process-resources flatten.clean clean clean org.apache.maven.plugins maven-dependency-plugin ${maven-dependency-plugin.version} true org.apache.maven.plugins maven-compiler-plugin UTF-8 org.apache.maven.plugins maven-release-plugin true @{project.version} ${project.version} org.apache.maven.scm maven-scm-provider-jgit ${maven-scm-provider-jgit.version} org.apache.maven.plugins maven-surefire-plugin org.apache.maven.plugins maven-failsafe-plugin org.codehaus.mojo license-maven-plugin org.codehaus.mojo flatten-maven-plugin com.diffplug.spotless spotless-maven-plugin 
${spotless.version} ${skip.spotless} src/main/java/org/apache/seatunnel/antlr4/generated/*.* 1.7 org.apache.seatunnel.shade,org.apache.seatunnel,org.apache,org,,javax,java,\# Remove wildcard imports import\s+(static)*\s*[^\*\s]+\*;(\r\n|\r|\n) $1 Block powermock import\s+org\.powermock\.[^\*\s]*(|\*);(\r\n|\r|\n) $1 Block jUnit4 imports import\s+org\.junit\.[^jupiter][^\*\s]*(|\*);(\r\n|\r|\n) $1 Convert Google Guava imports to shade import\s+(static\s+)?com\.google\.common\.([^;]+);(\r\n|\r|\n) import $1org.apache.seatunnel.shade.com.google.common.$2;$3 Convert Jetty imports to shade import\s+(static\s+)?org\.eclipse\.jetty\.([^;]+);(\r\n|\r|\n) import $1org.apache.seatunnel.shade.org.eclipse.jetty.$2;$3 Convert Hikari imports to shade import\s+(static\s+)?com\.zaxxer\.hikari\.([^;]+);(\r\n|\r|\n) import $1org.apache.seatunnel.shade.com.zaxxer.hikari.$2;$3 Convert Janino imports to shade import\s+(static\s+)?org\.codehaus\.(janino|commons)\.([^;]+);(\r\n|\r|\n) import $1org.apache.seatunnel.shade.org.codehaus.$2.$3;$4 Convert Apache Commons Lang3 imports to shade import\s+(static\s+)?org\.apache\.commons\.lang3\.([^;]+);(\r\n|\r|\n) import $1org.apache.seatunnel.shade.org.apache.commons.lang3.$2;$3 UTF-8 4 true false true true false false custom_1 false false Leading blank line project project true spotless-check check validate https://github.com/apache/seatunnel The Apache License, Version 2.0 https://www.apache.org/licenses/LICENSE-2.0.txt SeaTunnel Developer List dev-subscribe@seatunnel.apache.org dev-unsubscribe@seatunnel.apache.org dev@seatunnel.apache.org SeaTunnel Commits List commits-subscribe@seatunnel.apache.org commits-unsubscribe@seatunnel.apache.org commits@seatunnel.apache.org scm:git:https://github.com/apache/seatunnel.git scm:git:https://github.com/apache/seatunnel.git https://github.com/apache/seatunnel HEAD GitHub https://github.com/apache/seatunnel/issues release true seatunnel-dist ci false ================================================ FILE: 
seatunnel-api/pom.xml ================================================ 4.0.0 org.apache.seatunnel seatunnel ${revision} seatunnel-api SeaTunnel : Api 4.5.13 4.4.16 org.apache.seatunnel seatunnel-common ${project.version} org.apache.seatunnel seatunnel-jackson ${project.version} optional org.apache.httpcomponents httpclient ${httpclient.version} org.apache.httpcomponents httpcore ${httpcore.version} ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/annotation/Experimental.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.annotation; import java.lang.annotation.Documented; import java.lang.annotation.ElementType; import java.lang.annotation.Target; /** Annotation to mark classes, methods, fields, constructors as experimental. 
*/
// No @Retention is declared on this annotation, so the Java default retention
// (RetentionPolicy.CLASS) applies.
@Documented
@Target({ElementType.TYPE, ElementType.METHOD, ElementType.FIELD, ElementType.CONSTRUCTOR})
public @interface Experimental {}
================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/JobContext.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.common;

import org.apache.seatunnel.common.constants.JobMode;

import lombok.Getter;

import java.io.Serializable;
import java.util.UUID;

/** This class is used to store the context of the job. e.g. the job id, job mode ...etc.
*/ @Getter public final class JobContext implements Serializable { private static final long serialVersionUID = -1L; private JobMode jobMode; private boolean enableCheckpoint; private final String jobId; public JobContext() { this.jobId = UUID.randomUUID().toString().replace("-", ""); } public JobContext(Long jobId) { this.jobId = jobId + ""; } public JobContext setJobMode(JobMode jobMode) { this.jobMode = jobMode; return this; } public JobContext setEnableCheckpoint(boolean enableCheckpoint) { this.enableCheckpoint = enableCheckpoint; return this; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/PluginIdentifier.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.common; import org.apache.seatunnel.shade.org.apache.commons.lang3.StringUtils; /** Used to identify a plugin. 
*/
public class PluginIdentifier {
    private final String engineType;
    private final String pluginType;
    private final String pluginName;

    private PluginIdentifier(String engineType, String pluginType, String pluginName) {
        this.engineType = engineType;
        this.pluginType = pluginType;
        this.pluginName = pluginName;
    }

    /**
     * Creates an identifier from its three components.
     *
     * @param engineType engine type component
     * @param pluginType plugin type component
     * @param pluginName plugin name component
     * @return a new identifier holding the given values
     */
    public static PluginIdentifier of(String engineType, String pluginType, String pluginName) {
        return new PluginIdentifier(engineType, pluginType, pluginName);
    }

    public String getEngineType() {
        return engineType;
    }

    public String getPluginType() {
        return pluginType;
    }

    public String getPluginName() {
        return pluginName;
    }

    /** Two identifiers are equal when all three components match, ignoring case. */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }

        PluginIdentifier that = (PluginIdentifier) o;

        if (!StringUtils.equalsIgnoreCase(engineType, that.engineType)) {
            return false;
        }
        if (!StringUtils.equalsIgnoreCase(pluginType, that.pluginType)) {
            return false;
        }
        return StringUtils.equalsIgnoreCase(pluginName, that.pluginName);
    }

    /** Case-insensitive hash over the three components, matching {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        int result = caseInsensitiveHash(engineType);
        result = 31 * result + caseInsensitiveHash(pluginType);
        result = 31 * result + caseInsensitiveHash(pluginName);
        return result;
    }

    /**
     * Hashes the lower-cased form of {@code value}, or returns 0 for {@code null}.
     *
     * <p>The lower-casing is pinned to {@link java.util.Locale#ROOT}. With the default locale
     * (for example Turkish, where "I" lower-cases to a dotless i) two identifiers that are equal
     * ignoring case could end up with different hash codes.
     */
    private static int caseInsensitiveHash(String value) {
        return value == null ? 0 : value.toLowerCase(java.util.Locale.ROOT).hashCode();
    }

    @Override
    public String toString() {
        return "PluginIdentifier{"
                + "engineType='"
                + engineType
                + '\''
                + ", pluginType='"
                + pluginType
                + '\''
                + ", pluginName='"
                + pluginName
                + '\''
                + '}';
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/PluginIdentifierInterface.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package org.apache.seatunnel.api.common;

/** todo: unified with Plugin */
public interface PluginIdentifierInterface {
    /**
     * Returns a unique identifier among same factory interfaces.
     *
     * <p>For consistency, an identifier should be declared as one lower case word (e.g. {@code
     * kafka}). If multiple factories exist for different versions, a version should be appended
     * using "-" (e.g. {@code elasticsearch-7}).
     *
     * @return the identifier of this plugin
     */
    String getPluginName();
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/PrepareFailException.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.common; import org.apache.seatunnel.shade.com.typesafe.config.Config; import org.apache.seatunnel.common.constants.PluginType; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; /** This exception will throw when {@link SeaTunnelPluginLifeCycle#prepare(Config)} failed.
*/
public class PrepareFailException extends SeaTunnelRuntimeException {

    /**
     * Creates the exception without an underlying cause.
     *
     * @param pluginName name of the plugin that failed
     * @param type type of the plugin that failed
     * @param message detail message
     */
    public PrepareFailException(String pluginName, PluginType type, String message) {
        super(
                SeaTunnelAPIErrorCode.CONFIG_VALIDATION_FAILED,
                describe(pluginName, type, message));
    }

    /**
     * Creates the exception with an underlying cause.
     *
     * @param pluginName name of the plugin that failed
     * @param type type of the plugin that failed
     * @param message detail message
     * @param cause the throwable that triggered this failure
     */
    public PrepareFailException(
            String pluginName, PluginType type, String message, Throwable cause) {
        super(
                SeaTunnelAPIErrorCode.CONFIG_VALIDATION_FAILED,
                describe(pluginName, type, message),
                cause);
    }

    /** Renders the shared "PluginName / PluginType / Message" text used by both constructors. */
    private static String describe(String pluginName, PluginType type, String message) {
        String typeName = type.getType();
        return String.format(
                "PluginName: %s, PluginType: %s, Message: %s", pluginName, typeName, message);
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/SeaTunnelAPIErrorCode.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/
package org.apache.seatunnel.api.common;

import org.apache.seatunnel.common.exception.SeaTunnelErrorCode;

/** Error codes raised by the SeaTunnel API layer; each constant pairs a code with a description. */
public enum SeaTunnelAPIErrorCode implements SeaTunnelErrorCode {
    CONFIG_VALIDATION_FAILED("API-01", "Configuration item validate failed"),
    OPTION_VALIDATION_FAILED("API-02", "Option item validate failed"),
    CATALOG_INITIALIZE_FAILED("API-03", "Catalog initialize failed"),
    DATABASE_NOT_EXISTED("API-04", "Database not existed"),
    TABLE_NOT_EXISTED("API-05", "Table not existed"),
    FACTORY_INITIALIZE_FAILED("API-06", "Factory initialize failed"),
    DATABASE_ALREADY_EXISTED("API-07", "Database already existed"),
    TABLE_ALREADY_EXISTED("API-08", "Table already existed"),
    HANDLE_SAVE_MODE_FAILED("API-09", "Handle save mode failed"),
    SOURCE_ALREADY_HAS_DATA("API-10", "The target data source already has data"),
    SINK_TABLE_NOT_EXIST("API-11", "The sink table not exist"),
    LIST_DATABASES_FAILED("API-12", "List databases failed"),
    LIST_TABLES_FAILED("API-13", "List tables failed"),
    GET_PRIMARY_KEY_FAILED("API-14", "Get primary key failed");

    /** Short code such as {@code API-01}. */
    private final String code;

    /** Human-readable text that accompanies the code. */
    private final String description;

    SeaTunnelAPIErrorCode(String errorCode, String errorDescription) {
        code = errorCode;
        description = errorDescription;
    }

    /** @return the short code of this error */
    @Override
    public String getCode() {
        return this.code;
    }

    /** @return the description of this error */
    @Override
    public String getDescription() {
        return this.description;
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/SeaTunnelPluginLifeCycle.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package org.apache.seatunnel.api.common;

import org.apache.seatunnel.shade.com.typesafe.config.Config;

/**
 * Life cycle of a plugin: after a plugin is created, the prepare method is executed to perform
 * initialization.
 *
 * @deprecated SeaTunnel no longer invokes prepare when initializing a plugin; use {@link
 *     org.apache.seatunnel.api.table.factory.Factory} instead.
 */
@Deprecated
public interface SeaTunnelPluginLifeCycle {
    /**
     * Uses the plugin config to perform initialization. The default implementation always throws
     * {@link UnsupportedOperationException}.
     *
     * @param pluginConfig plugin config.
     * @throws PrepareFailException if the plugin prepare step failed.
     * @throws UnsupportedOperationException always, unless an implementation overrides this
     *     method.
     * @deprecated SeaTunnel no longer invokes prepare when initializing a plugin; use {@link
     *     org.apache.seatunnel.api.table.factory.Factory} instead.
     */
    @Deprecated
    default void prepare(Config pluginConfig) throws PrepareFailException {
        throw new UnsupportedOperationException("prepare method is not supported");
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/AbstractMetricsContext.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package org.apache.seatunnel.api.common.metrics;

import lombok.extern.slf4j.Slf4j;

import java.io.Serializable;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Base {@link MetricsContext} that keeps every registered {@link Metric} in a concurrent map keyed
 * by metric name. The first metric registered under a name wins; later registrations under the
 * same name are rejected with a warning.
 */
@Slf4j
public abstract class AbstractMetricsContext implements MetricsContext, Serializable {

    private static final long serialVersionUID = 1L;

    /** Registered metrics, keyed by metric name. */
    protected final Map<String, Metric> metrics = new ConcurrentHashMap<>();

    /**
     * Returns the counter registered under {@code name}, creating and registering a {@link
     * ThreadSafeCounter} when none exists yet.
     *
     * @param name name of the counter
     * @return the counter that is registered under {@code name}
     * @throws ClassCastException if a non-counter metric is already registered under {@code name}
     */
    @Override
    public Counter counter(String name) {
        Metric existing = metrics.get(name);
        if (existing != null) {
            return (Counter) existing;
        }
        Counter created = this.counter(name, new ThreadSafeCounter(name));
        // Another thread may have registered the same name first. Hand out the instance that is
        // actually registered so that updates are not made on an orphaned counter.
        Metric registered = metrics.get(name);
        return registered instanceof Counter ? (Counter) registered : created;
    }

    /**
     * Registers the given counter under {@code name}.
     *
     * @param name name of the counter
     * @param counter counter to register
     * @param <C> counter type
     * @return the given counter, whether or not the registration was accepted
     */
    @Override
    public <C extends Counter> C counter(String name, C counter) {
        this.addMetric(name, counter);
        return counter;
    }

    /**
     * Returns the meter registered under {@code name}, creating and registering a {@link
     * ThreadSafeQPSMeter} when none exists yet.
     *
     * @param name name of the meter
     * @return the meter that is registered under {@code name}
     * @throws ClassCastException if a non-meter metric is already registered under {@code name}
     */
    @Override
    public Meter meter(String name) {
        Metric existing = metrics.get(name);
        if (existing != null) {
            return (Meter) existing;
        }
        Meter created = this.meter(name, new ThreadSafeQPSMeter(name));
        // Same reasoning as in counter(String): prefer the instance that won the registration.
        Metric registered = metrics.get(name);
        return registered instanceof Meter ? (Meter) registered : created;
    }

    /**
     * Registers the given meter under {@code name}.
     *
     * @param name name of the meter
     * @param meter meter to register
     * @param <M> meter type
     * @return the given meter, whether or not the registration was accepted
     */
    @Override
    public <M extends Meter> M meter(String name, M meter) {
        this.addMetric(name, meter);
        return meter;
    }

    /**
     * Adds {@code metric} under {@code name} unless a metric with that name already exists. A
     * {@code null} metric is ignored with a warning; a name collision keeps the earlier metric and
     * logs a warning.
     *
     * @param name name to register the metric under
     * @param metric metric to register, may be {@code null}
     */
    protected void addMetric(String name, Metric metric) {
        if (metric == null) {
            log.warn("Ignoring attempted add of a metric due to being null for name {}.", name);
            return;
        }
        // putIfAbsent is atomic on the concurrent map, so readers never observe a metric that is
        // about to be rolled back (which a put-then-restore sequence would briefly expose).
        Metric prior = this.metrics.putIfAbsent(name, metric);
        if (prior != null) {
            log.warn(
                    "Name collision: MetricsContext already contains a Metric with the name '{}'. Metric will not be reported.",
                    name);
        }
    }

    @Override
    public String toString() {
        return "AbstractMetricsContext{" + "metrics=" + metrics + '}';
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/Counter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package org.apache.seatunnel.api.common.metrics;

/** A Counter is a {@link Metric} that measures a count. */
public interface Counter extends Metric {

    /** Increment the current count by 1. */
    void inc();

    /**
     * Increment the current count by the given value.
     *
     * @param n value to increment the current count by
     */
    void inc(long n);

    /** Decrement the current count by 1. */
    void dec();

    /**
     * Decrement the current count by the given value.
     *
     * @param n value to decrement the current count by
     */
    void dec(long n);

    /**
     * Sets the current value.
     *
     * @param n value to set the count to
     */
    void set(long n);

    /**
     * Returns the current count.
     *
     * @return current count
     */
    long getCount();
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/JobMetrics.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/ package org.apache.seatunnel.api.common.metrics; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.JsonProcessingException; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.ObjectMapper; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.SerializationFeature; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.node.ObjectNode; import lombok.Getter; import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Objects; import java.util.Set; import java.util.function.Predicate; import java.util.stream.Collector; import java.util.stream.Collectors; import static java.util.stream.Collectors.groupingBy; public final class JobMetrics implements Serializable { private static final JobMetrics EMPTY = new JobMetrics(Collections.emptyMap()); private static final Collector>> COLLECTOR = Collectors.groupingBy(Measurement::metric); @Getter private Map> metrics; // metric name -> set of measurements JobMetrics() { // needed for deserialization } private JobMetrics(Map> metrics) { this.metrics = new HashMap<>(metrics); } /** Returns an empty {@link JobMetrics} object. */ public static JobMetrics empty() { return EMPTY; } /** Builds a {@link JobMetrics} object based on a map of {@link Measurement}s. */ public static JobMetrics of(Map> metrics) { return new JobMetrics(metrics); } public JobMetrics merge(JobMetrics jobMetrics) { if (jobMetrics == null) { return this; } Map> metricsMap = new HashMap<>(); metrics.forEach((key, value) -> metricsMap.put(key, new ArrayList<>(value))); //// Because if a job is restarted, the running node might change, so we need to remove the // node information. 
Set keysToExclude = new HashSet<>(Arrays.asList(MetricTags.MEMBER, MetricTags.ADDRESS)); jobMetrics.metrics.forEach( (key, value) -> metricsMap.merge( key, value, (v1, v2) -> { List ms = new ArrayList<>(v2); for (Measurement m1 : v1) { if (v2.stream() .noneMatch( m2 -> areMapsEqualExcludingKeys( m2.getTags(), m1.getTags(), keysToExclude))) { ms.add(m1); } } return ms; })); return new JobMetrics(metricsMap); } /** * Compares two Map objects excluding certain keys. * * @param map1 the first map * @param map2 the second map * @param keysToExclude the keys to be excluded during comparison * @return true if the maps are equal excluding the specific keys, false otherwise */ public static boolean areMapsEqualExcludingKeys( Map map1, Map map2, Set keysToExclude) { // Return false if either of the maps is null if (map1 == null || map2 == null) { return false; } // Return false if the sizes of the maps are different if (map1.size() != map2.size()) { return false; } // Create copies of the maps to avoid modifying the original maps Map map1Copy = new HashMap<>(map1); Map map2Copy = new HashMap<>(map2); // Remove specific keys from the copies for (String key : keysToExclude) { map1Copy.remove(key); map2Copy.remove(key); } // Return whether the copies are equal return map1Copy.equals(map2Copy); } /** Returns all metrics present. */ public Set metrics() { return Collections.unmodifiableSet(metrics.keySet()); } /** * Returns all {@link Measurement}s associated with a given metric name. * *

    For a list of job-specific metric names please see {@link MetricNames}. */ public List get(String metricName) { Objects.requireNonNull(metricName); List measurements = metrics.get(metricName); return measurements == null ? Collections.emptyList() : measurements; } public JobMetrics filter(String tagName, String tagValue) { return filter(MeasurementPredicates.tagValueEquals(tagName, tagValue)); } public JobMetrics filter(Predicate predicate) { Objects.requireNonNull(predicate, "predicate"); Map> filteredMetrics = metrics.values().stream() .flatMap(List::stream) .filter(predicate) .collect(COLLECTOR); return new JobMetrics(filteredMetrics); } @Override public int hashCode() { return metrics.hashCode(); } @Override public boolean equals(Object obj) { if (obj == null || getClass() != obj.getClass()) { return false; } if (obj == this) { return true; } return Objects.equals(metrics, ((JobMetrics) obj).metrics); } @Override public String toString() { StringBuilder sb = new StringBuilder(); metrics.entrySet().stream() .sorted(Comparator.comparing(Entry::getKey)) .forEach( mainEntry -> { sb.append(mainEntry.getKey()).append(":\n"); mainEntry.getValue().stream() .collect( groupingBy( m -> { String vertex = m.tag(MetricTags.TASK_NAME); return vertex == null ? 
"" : vertex; })) .entrySet() .stream() .sorted(Comparator.comparing(Entry::getKey)) .forEach( e -> { String vertexName = e.getKey(); sb.append(" ").append(vertexName).append(":\n"); e.getValue() .forEach( m -> sb.append(" ") .append(m) .append("\n")); }); }); return sb.toString(); } public String toJsonString() { ObjectMapper objectMapper = new ObjectMapper(); objectMapper.configure(SerializationFeature.FAIL_ON_EMPTY_BEANS, false); try { return objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(this.metrics); } catch (JsonProcessingException e) { ObjectNode objectNode = objectMapper.createObjectNode(); objectNode.put("err", "serialize JobMetrics err"); return objectNode.toString(); } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/Measurement.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.common.metrics; import lombok.Data; import java.io.Serializable; import java.util.Comparator; import java.util.HashMap; import java.util.Map; import java.util.Objects; import java.util.stream.Collectors; /** * Immutable data class containing information about one metric measurement, consisting of: * *

      *
    • metric value *
    • metric timestamp, generated when the metric was gathered *
    • metric descriptor (set of tag name-value pairs) *
    * *

    A metrics descriptor can be thought of as a set of attributes associated with a particular * metric, metric which in turn is defined by its name (for a full list of metric names provided see * {@link MetricNames}). The attributes are specified as tags that have names and values (for a full * list of tag names see {@link MetricTags}). An example descriptor would have a collection of * tags/attributes like this: {@code job=jobId, pipeline=pipelineId, unit=count, * metric=SourceReceivedCount, ...} */ @Data public final class Measurement implements Serializable { private Map tags; // tag name -> tag value private String metric; private Object value; private long timestamp; Measurement() {} private Measurement(String metric, Object value, long timestamp, Map tags) { this.metric = metric; this.value = value; this.timestamp = timestamp; this.tags = new HashMap<>(tags); } /** * Builds a {@link Measurement} instance based on timestamp, value and the metric descriptor in * map form. */ public static Measurement of( String metric, Object value, long timestamp, Map tags) { Objects.requireNonNull(tags, "metric"); Objects.requireNonNull(tags, "tags"); return new Measurement(metric, value, timestamp, tags); } /** Returns the value associated with this {@link Measurement}. */ public Object value() { return value; } /** * Returns the timestamps associated with this {@link Measurement}, the moment when the value * was gathered. */ public long timestamp() { return timestamp; } /** Returns the name of the metric. For a list of different metrics see {@link MetricNames}. */ public String metric() { return metric; } /** * Returns the value associated with a specific tag, based on the metric description of this * particular {@link Measurement}. For a list of possible tag names see {@link MetricTags}. 
*/ public String tag(String name) { return tags.get(name); } public Map getTags() { return tags; } @Override public int hashCode() { return 31 * (int) (timestamp * 31 + value.hashCode()) + Objects.hashCode(tags); } @Override public boolean equals(Object obj) { final Measurement that; return this == obj || obj instanceof Measurement && this.timestamp == (that = (Measurement) obj).timestamp && this.value == that.value && Objects.equals(this.tags, that.tags); } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append(String.format("%s %s", metric, value)).append(" ").append(timestamp).append(" ["); String tags = this.tags.entrySet().stream() .sorted(Comparator.comparing(Map.Entry::getKey)) .map(e -> e.getKey() + "=" + e.getValue()) .collect(Collectors.joining(", ")); sb.append(tags).append(']'); return sb.toString(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/MeasurementPredicates.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.common.metrics; import java.util.function.Predicate; import java.util.regex.Pattern; /** Static utility class for creating various {@link Measurement} filtering predicates. */ public final class MeasurementPredicates { private MeasurementPredicates() {} /** * Matches a {@link Measurement} which contain the specified tag. * * @param tag the tag of interest * @return a filtering predicate */ public static Predicate containsTag(String tag) { return measurement -> measurement.tag(tag) != null; } /** * Matches a {@link Measurement} which contains the specified tag and the tag has the specified * value. * * @param tag the tag to match * @param value the value the tag has to have * @return a filtering predicate */ public static Predicate tagValueEquals(String tag, String value) { return measurement -> value.equals(measurement.tag(tag)); } /** * Matches a {@link Measurement} which has this exact tag with a value matching the provided * regular expression. * * @param tag the tag to match * @param valueRegexp regular expression to match the value against * @return a filtering predicate */ public static Predicate tagValueMatches(String tag, String valueRegexp) { return measurement -> { String value = measurement.tag(tag); return value != null && Pattern.compile(valueRegexp).matcher(value).matches(); }; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/Meter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package org.apache.seatunnel.api.common.metrics;

/** Metric for measuring throughput. */
public interface Meter extends Metric {

    /** Marks the occurrence of a single event. */
    void markEvent();

    /**
     * Marks the occurrence of multiple events.
     *
     * @param n number of events that occurred
     */
    void markEvent(long n);

    /**
     * Returns the current rate of events per second.
     *
     * @return current rate of events per second
     */
    double getRate();

    /**
     * Returns the number of events marked on the meter.
     *
     * @return number of events marked on the meter
     */
    long getCount();
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/Metric.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/
package org.apache.seatunnel.api.common.metrics;

import java.io.Serializable;

/** Common contract of all metrics: a name plus a measurement unit. */
public interface Metric extends Serializable {

    /**
     * Returns the name of the associated metric.
     *
     * @return the metric name
     */
    String name();

    /**
     * Returns the measurement unit for the associated metric. Meant to provide further information
     * on the type of value measured by the user-defined metric. Doesn't affect the functionality of
     * the metric, it still remains a simple numeric value, but is used to populate the {@link
     * MetricTags#UNIT} tag in the metric's description.
     *
     * @return the measurement unit
     */
    Unit unit();
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/MetricNames.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/
package org.apache.seatunnel.api.common.metrics;

/** Holder of the metric name constants; not instantiable. */
public final class MetricNames {

    // Generic receive metrics
    public static final String RECEIVED_COUNT = "receivedCount";
    public static final String RECEIVED_BATCHES = "receivedBatches";

    // Source side
    public static final String SOURCE_RECEIVED_COUNT = "SourceReceivedCount";
    public static final String SOURCE_RECEIVED_BYTES = "SourceReceivedBytes";
    public static final String SOURCE_RECEIVED_QPS = "SourceReceivedQPS";
    public static final String SOURCE_RECEIVED_BYTES_PER_SECONDS = "SourceReceivedBytesPerSeconds";

    // Sink side: written
    public static final String SINK_WRITE_COUNT = "SinkWriteCount";
    public static final String SINK_WRITE_BYTES = "SinkWriteBytes";
    public static final String SINK_WRITE_QPS = "SinkWriteQPS";
    public static final String SINK_WRITE_BYTES_PER_SECONDS = "SinkWriteBytesPerSeconds";

    // Sink side: committed
    public static final String SINK_COMMITTED_COUNT = "SinkCommittedCount";
    public static final String SINK_COMMITTED_BYTES = "SinkCommittedBytes";
    public static final String SINK_COMMITTED_QPS = "SinkCommittedQPS";
    public static final String SINK_COMMITTED_BYTES_PER_SECONDS = "SinkCommittedBytesPerSeconds";

    // Intermediate queue
    public static final String INTERMEDIATE_QUEUE_SIZE = "IntermediateQueueSize";

    /** Constants only; never instantiated. */
    private MetricNames() {}
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/MetricTags.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.common.metrics; public final class MetricTags { private MetricTags() {} public static final String MEMBER = "member"; public static final String ADDRESS = "address"; public static final String JOB_ID = "jobId"; public static final String PIPELINE_ID = "pipelineId"; public static final String TASK_GROUP_ID = "taskGroupId"; public static final String TASK_ID = "taskID"; public static final String UNIT = "unit"; public static final String TASK_NAME = "taskName"; public static final String SERVICE = "service"; public static final String TASK_GROUP_LOCATION = "taskGroupLocation"; } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/MetricsContext.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/

package org.apache.seatunnel.api.common.metrics;

/** Entry point for registering counters and meters with SeaTunnel. */
public interface MetricsContext {

    /**
     * Registers a {@link ThreadSafeCounter} with SeaTunnel.
     *
     * @param name name of the counter
     * @return the created counter
     */
    Counter counter(String name);

    /**
     * Registers a {@link Counter} with SeaTunnel.
     *
     * @param name name of the counter
     * @param counter counter to register
     * @param <C> counter type
     * @return the given counter
     */
    <C extends Counter> C counter(String name, C counter);

    /**
     * Registers a {@link ThreadSafeQPSMeter} with SeaTunnel.
     *
     * @param name name of the meter
     * @return the registered meter
     */
    Meter meter(String name);

    /**
     * Registers a new {@link Meter} with SeaTunnel.
     *
     * @param name name of the meter
     * @param meter meter to register
     * @param <M> meter type
     * @return the registered meter
     */
    <M extends Meter> M meter(String name, M meter);
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/RawJobMetrics.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.common.metrics; import java.io.Serializable; import java.util.Arrays; public final class RawJobMetrics implements Serializable { private long timestamp; private byte[] blob; RawJobMetrics() {} private RawJobMetrics(long timestamp, byte[] blob) { this.timestamp = timestamp; this.blob = blob; } public static RawJobMetrics empty() { return of(null); } public static RawJobMetrics of(byte[] blob) { return new RawJobMetrics(System.currentTimeMillis(), blob); } public long getTimestamp() { return timestamp; } public byte[] getBlob() { return blob; } @Override public int hashCode() { return (int) timestamp * 31 + Arrays.hashCode(blob); } @Override public boolean equals(Object obj) { if (obj == null || getClass() != obj.getClass()) { return false; } if (obj == this) { return true; } RawJobMetrics that; return Arrays.equals(blob, (that = (RawJobMetrics) obj).blob) && this.timestamp == that.timestamp; } @Override public String toString() { return Arrays.toString(blob) + " @ " + timestamp; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/ThreadSafeCounter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.common.metrics; import java.io.Serializable; import java.util.concurrent.atomic.AtomicLongFieldUpdater; public class ThreadSafeCounter implements Counter, Serializable { private static final long serialVersionUID = 1L; private final String name; private static final AtomicLongFieldUpdater VOLATILE_VALUE_UPDATER = AtomicLongFieldUpdater.newUpdater(ThreadSafeCounter.class, "value"); private volatile long value; public ThreadSafeCounter(String name) { this.name = name; } @Override public void inc() { VOLATILE_VALUE_UPDATER.incrementAndGet(this); } @Override public void inc(long n) { VOLATILE_VALUE_UPDATER.addAndGet(this, n); } @Override public void dec() { VOLATILE_VALUE_UPDATER.decrementAndGet(this); } @Override public void dec(long n) { VOLATILE_VALUE_UPDATER.addAndGet(this, -n); } @Override public void set(long n) { VOLATILE_VALUE_UPDATER.set(this, n); } @Override public long getCount() { return VOLATILE_VALUE_UPDATER.get(this); } @Override public String name() { return name; } @Override public Unit unit() { return Unit.COUNT; } @Override public String toString() { return "ThreadSafeCounter{" + "name='" + name + '\'' + ", value=" + value + '}'; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/ThreadSafeQPSMeter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.common.metrics; import java.io.Serializable; import java.util.concurrent.atomic.AtomicLongFieldUpdater; public class ThreadSafeQPSMeter implements Meter, Serializable { private static final long serialVersionUID = 1L; private static final AtomicLongFieldUpdater VOLATILE_VALUE_UPDATER = AtomicLongFieldUpdater.newUpdater(ThreadSafeQPSMeter.class, "value"); private final String name; private volatile long value; private final long timestamp; public ThreadSafeQPSMeter(String name) { this.name = name; timestamp = System.currentTimeMillis(); } @Override public void markEvent() { VOLATILE_VALUE_UPDATER.incrementAndGet(this); } @Override public void markEvent(long n) { VOLATILE_VALUE_UPDATER.addAndGet(this, n); } @Override public double getRate() { long cost = System.currentTimeMillis() - timestamp; return (double) value * 1000 / cost; } @Override public long getCount() { return VOLATILE_VALUE_UPDATER.get(this); } @Override public String name() { return name; } @Override public Unit unit() { return Unit.COUNT; } @Override public String toString() { return "ThreadSafeQPSMeter{" + "name='" + name + '\'' + ", value=" + value + ", timestamp=" + timestamp + '}'; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/Unit.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.common.metrics;

/**
 * Unit in which a metric value is expressed.
 *
 * <p>NOTE(review): the declaration order fixes each constant's ordinal; confirm that nothing
 * persists or transmits ordinals before reordering or inserting constants.
 */
public enum Unit {
    /** Size or counter, expressed in bytes. */
    BYTES,
    /** Timestamp or duration, expressed in milliseconds. */
    MS,
    /** An integer in the range 0..100. */
    PERCENT,
    /** Number of items: size, counter, and so on. */
    COUNT,
    /** Either 0 or 1. */
    BOOLEAN,
    /** 0..n, the ordinal of an enum. */
    ENUM,
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/ConfigAdapter.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.seatunnel.api.configuration;

import java.nio.file.Path;
import java.util.Map;

/**
 * Adapter that converts a configuration file written in another format into HOCON-style
 * key/value data.
 */
public interface ConfigAdapter {

    /**
     * Provides the config file extension identifiers supported by this adapter.
     *
     * @return the supported extension identifiers
     */
    String[] extensionIdentifiers();

    /**
     * Converts the given config file into a map of HOCON path keys to values.
     *
     * <p>NOTE(review): the return type is declared as a raw {@code Map} here, so the key and
     * value types are not visible - confirm against the implementations.
     *
     * @param configFilePath path of the config file to load
     * @return the converted configuration
     */
    Map loadConfig(Path configFilePath);
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/ConfigShade.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.configuration; import java.util.Map; /** * The interface that provides the ability to encrypt and decrypt {@link * org.apache.seatunnel.shade.com.typesafe.config.Config} */ public interface ConfigShade { /** * The unique identifier of the current interface, used it to select the correct {@link * ConfigShade} */ String getIdentifier(); /** * Encrypt the content * * @param content The content to encrypt */ String encrypt(String content); /** * Decrypt the content * * @param content The content to decrypt */ String decrypt(String content); /** To expand the options that user want to encrypt */ default String[] sensitiveOptions() { return new String[0]; } /** * this method will be called before the encrypt/decrpyt method. Users can use the props to * control the behavior of the encrypt/decrypt * * @param props the additional properties defined with the key `shade.props` in the * configuration */ default void open(Map props) { // default do nothing } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/Option.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.configuration; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference; import lombok.Getter; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Objects; public class Option { /** The current key for that config option. */ private final String key; /** Type of the value that this Option describes. */ private final TypeReference typeReference; /** The default value for this option. */ private final T defaultValue; /** The description for this option. */ String description = ""; @Getter private final List fallbackKeys; public Option(String key, TypeReference typeReference, T defaultValue) { this.key = key; this.typeReference = typeReference; this.defaultValue = defaultValue; this.fallbackKeys = new ArrayList<>(); } public String key() { return key; } public TypeReference typeReference() { return typeReference; } public T defaultValue() { return defaultValue; } public String getDescription() { return description; } public Option withDescription(String description) { this.description = description; return this; } public Option withFallbackKeys(String... 
fallbackKeys) { this.fallbackKeys.addAll(Arrays.asList(fallbackKeys)); return this; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (!(obj instanceof Option)) { return false; } Option that = (Option) obj; return Objects.equals(this.key, that.key) && Objects.equals(this.defaultValue, that.defaultValue) && Objects.equals(this.fallbackKeys, that.fallbackKeys); } @Override public int hashCode() { return Objects.hash(this.key, this.defaultValue, this.fallbackKeys); } @Override public String toString() { return String.format( "Key: '%s', default: %s (fallback keys: %s)", key, defaultValue, fallbackKeys); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/Options.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.configuration; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference; import org.apache.seatunnel.shade.org.apache.commons.lang3.StringUtils; import lombok.NonNull; import java.lang.reflect.ParameterizedType; import java.lang.reflect.Type; import java.math.BigDecimal; import java.time.Duration; import java.util.List; import java.util.Map; import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; public class Options { /** * Starts building a new {@link Option}. * * @param key The key for the config option. * @return The builder for the config option with the given key. */ public static OptionBuilder key(String key) { checkArgument(StringUtils.isNotBlank(key), "Option's key not be null."); return new OptionBuilder(key); } /** * The option builder is used to create a {@link Option}. It is instantiated via {@link * Options#key(String)}. */ public static final class OptionBuilder { private final String key; /** * Creates a new OptionBuilder. * * @param key The key for the config option */ OptionBuilder(String key) { this.key = key; } /** Defines that the value of the option should be of {@link Boolean} type. */ public TypedOptionBuilder booleanType() { return new TypedOptionBuilder<>(key, new TypeReference() {}); } /** Defines that the value of the option should be of {@link Integer} type. */ public TypedOptionBuilder intType() { return new TypedOptionBuilder<>(key, new TypeReference() {}); } /** Defines that the value of the option should be of {@link Long} type. */ public TypedOptionBuilder longType() { return new TypedOptionBuilder<>(key, new TypeReference() {}); } /** Defines that the value of the option should be of {@link BigDecimal} type. */ public TypedOptionBuilder bigDecimalType() { return new TypedOptionBuilder<>(key, new TypeReference() {}); } /** Defines that the value of the option should be of {@link Float} type. 
*/ public TypedOptionBuilder floatType() { return new TypedOptionBuilder<>(key, new TypeReference() {}); } /** Defines that the value of the option should be of {@link Double} type. */ public TypedOptionBuilder doubleType() { return new TypedOptionBuilder<>(key, new TypeReference() {}); } /** Defines that the value of the option should be of {@link String} type. */ public TypedOptionBuilder stringType() { return new TypedOptionBuilder<>(key, new TypeReference() {}); } /** Defines that the value of the option should be of {@link Duration} type. */ public TypedOptionBuilder durationType() { return new TypedOptionBuilder<>(key, new TypeReference() {}); } /** * Defines that the value of the option should be of {@link Enum} type. * * @param enumClass Concrete type of the expected enum. */ public > TypedOptionBuilder enumType(Class enumClass) { return new TypedOptionBuilder<>( key, new TypeReference() { @Override public Type getType() { return enumClass; } }); } /** * Defines that the value of the option should be a set of properties, which can be * represented as {@code Map}. */ public TypedOptionBuilder> mapType() { return new TypedOptionBuilder<>(key, new TypeReference>() {}); } /** * Defines that the value of the option should be a set of properties, which can be * represented as {@code Map}. */ public TypedOptionBuilder> mapObjectType() { return new TypedOptionBuilder<>(key, new TypeReference>() {}); } /** * Defines that the value of the option should be a list of properties, which can be * represented as {@code List}. */ public TypedOptionBuilder> listType() { return new TypedOptionBuilder<>(key, new TypeReference>() {}); } /** * Defines that the value of the option should be a list of properties, which can be * represented as {@code List}. 
*/ public TypedOptionBuilder> listType(Class subClass) { return new TypedOptionBuilder<>( key, new TypeReference>() { @Override public Type getType() { return new ParameterizedType() { @Override public Type[] getActualTypeArguments() { return new Type[] {subClass}; } @Override public Type getRawType() { return List.class; } @Override public Type getOwnerType() { return null; } }; } }); } public TypedOptionBuilder objectType(Class option) { return new TypedOptionBuilder<>( key, new TypeReference() { @Override public Type getType() { return option; } }); } /** Construct an option with multiple options and only one of them can be selected */ public SingleChoiceOptionBuilder singleChoice( @NonNull Class optionType, @NonNull List optionValues) { return new SingleChoiceOptionBuilder( key, new TypeReference() { @Override public Type getType() { return optionType; } }, optionValues); } /** * The value of the definition option should be represented as T. * * @param typeReference complex type reference */ public TypedOptionBuilder type(TypeReference typeReference) { return new TypedOptionBuilder<>(key, typeReference); } } /** * Builder for {@link Option} with a defined atomic type. * * @param atomic type of the option */ public static class TypedOptionBuilder { private final String key; private final TypeReference typeReference; TypedOptionBuilder(String key, TypeReference typeReference) { this.key = key; this.typeReference = typeReference; } /** * Creates a Option with the given default value. * * @param value The default value for the config option * @return The config option with the default value. */ public Option defaultValue(T value) { return new Option<>(key, typeReference, value); } /** * Creates a Option without a default value. * * @return The config option without a default value. 
*/ public Option noDefaultValue() { return new Option<>(key, typeReference, null); } } public static class SingleChoiceOptionBuilder { private final List optionValues; private final String key; private final TypeReference typeReference; SingleChoiceOptionBuilder(String key, TypeReference typeReference, List optionValues) { this.optionValues = optionValues; this.key = key; this.typeReference = typeReference; } /** * Creates a Option with the given default value. * * @param value The default value for the config option * @return The config option with the default value. */ public SingleChoiceOption defaultValue(T value) { return new SingleChoiceOption(key, typeReference, optionValues, value); } /** * Creates a Option without a default value. * * @return The config option without a default value. */ public SingleChoiceOption noDefaultValue() { return new SingleChoiceOption(key, typeReference, optionValues, null); } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/ReadonlyConfig.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.configuration; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.JsonProcessingException; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.ObjectMapper; import org.apache.seatunnel.shade.com.typesafe.config.Config; import org.apache.seatunnel.shade.com.typesafe.config.ConfigFactory; import org.apache.seatunnel.shade.com.typesafe.config.ConfigRenderOptions; import lombok.extern.slf4j.Slf4j; import java.io.Serializable; import java.util.Collections; import java.util.LinkedHashMap; import java.util.Map; import java.util.Optional; import static org.apache.seatunnel.api.configuration.util.ConfigUtil.convertToJsonString; import static org.apache.seatunnel.api.configuration.util.ConfigUtil.convertValue; @Slf4j public class ReadonlyConfig implements Serializable { private static final long serialVersionUID = 1L; private static final ObjectMapper JACKSON_MAPPER = new ObjectMapper(); /** Stores the concrete key/value pairs of this configuration object. 
*/ protected final Map confData; private ReadonlyConfig(Map confData) { this.confData = confData; } public static ReadonlyConfig fromMap(Map map) { return new ReadonlyConfig(map); } public static ReadonlyConfig fromConfig(Config config) { try { return fromMap( JACKSON_MAPPER.readValue( config.root().render(ConfigRenderOptions.concise()), new TypeReference>() {})); } catch (JsonProcessingException e) { throw new IllegalArgumentException("Json parsing exception.", e); } } public T get(Option option) { return getOptional(option).orElseGet(option::defaultValue); } /** * Transform to Config todo: This method should be removed after we remove Config * * @return Config * @deprecated Please use ReadonlyConfig directly */ @Deprecated public Config toConfig() { return ConfigFactory.parseMap(confData); } public Map toMap() { if (confData.isEmpty()) { return Collections.emptyMap(); } Map result = new LinkedHashMap<>(); toMap(result); return result; } public void toMap(Map result) { if (confData.isEmpty()) { return; } for (Map.Entry entry : confData.entrySet()) { result.put(entry.getKey(), convertToJsonString(entry.getValue())); } } public Map getSourceMap() { return confData; } public Optional getOptional(Option option) { if (option == null) { throw new NullPointerException("Option not be null."); } Object value = getValue(option.key()); if (value == null) { for (String fallbackKey : option.getFallbackKeys()) { value = getValue(fallbackKey); if (value != null) { log.warn( "Please use the new key '{}' instead of the deprecated key '{}'.", option.key(), fallbackKey); break; } } } if (value == null) { return Optional.empty(); } return Optional.of(convertValue(value, option)); } private Object getValue(String key) { if (this.confData.containsKey(key)) { return this.confData.get(key); } else { String[] keys = key.split("\\."); Map data = this.confData; Object value = null; for (int i = 0; i < keys.length; i++) { value = data.get(keys[i]); if (i < keys.length - 1) { if (!(value 
instanceof Map)) { return null; } else { data = (Map) value; } } } return value; } } @Override public int hashCode() { int hash = 0; for (String s : this.confData.keySet()) { hash ^= s.hashCode(); } return hash; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (!(obj instanceof ReadonlyConfig)) { return false; } Map otherConf = ((ReadonlyConfig) obj).confData; return this.confData.equals(otherConf); } @Override public String toString() { return convertToJsonString(this.confData); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/SingleChoiceOption.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.configuration; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference; import lombok.Getter; import java.util.List; public class SingleChoiceOption extends Option { @Getter private final List optionValues; public SingleChoiceOption( String key, TypeReference typeReference, List optionValues, T defaultValue) { super(key, typeReference, defaultValue); this.optionValues = optionValues; } @Override public SingleChoiceOption withDescription(String description) { this.description = description; return this; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/util/Condition.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.configuration.util; import org.apache.seatunnel.api.configuration.Option; import java.util.Objects; public class Condition { private final Option option; private final T expectValue; private Boolean and = null; private Condition next = null; Condition(Option option, T expectValue) { this.option = option; this.expectValue = expectValue; } public static Condition of(Option option, T expectValue) { return new Condition<>(option, expectValue); } public Condition and(Option option, E expectValue) { return and(of(option, expectValue)); } public Condition or(Option option, E expectValue) { return or(of(option, expectValue)); } public Condition and(Condition next) { addCondition(true, next); return this; } public Condition or(Condition next) { addCondition(false, next); return this; } private void addCondition(boolean and, Condition next) { Condition tail = getTailCondition(); tail.and = and; tail.next = next; } protected int getCount() { int i = 1; Condition cur = this; while (cur.hasNext()) { i++; cur = cur.next; } return i; } Condition getTailCondition() { return hasNext() ? 
this.next.getTailCondition() : this; } public boolean hasNext() { return this.next != null; } public Condition getNext() { return this.next; } public Option getOption() { return option; } public T getExpectValue() { return expectValue; } public Boolean and() { return this.and; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (!(obj instanceof Condition)) { return false; } Condition that = (Condition) obj; return Objects.equals(this.option, that.option) && Objects.equals(this.expectValue, that.expectValue) && Objects.equals(this.and, that.and) && Objects.equals(this.next, that.next); } @Override public int hashCode() { return Objects.hash(this.option, this.expectValue, this.and, this.next); } @Override public String toString() { Condition cur = this; StringBuilder builder = new StringBuilder(); boolean bracket = false; do { builder.append("'") .append(cur.option.key()) // TODO: support another condition .append("' == ") .append(cur.expectValue); if (bracket) { builder = new StringBuilder(String.format("(%s)", builder)); bracket = false; } if (cur.hasNext()) { if (cur.next.hasNext() && !cur.and.equals(cur.next.and)) { bracket = true; } builder.append(cur.and ? " && " : " || "); } cur = cur.next; } while (cur != null); return builder.toString(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/util/ConfigUtil.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.configuration.util; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.JsonProcessingException; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.ObjectMapper; import org.apache.seatunnel.shade.com.typesafe.config.Config; import org.apache.seatunnel.shade.com.typesafe.config.ConfigFactory; import org.apache.seatunnel.api.configuration.Option; import lombok.extern.slf4j.Slf4j; import java.lang.reflect.ParameterizedType; import java.util.Arrays; import java.util.List; import java.util.Locale; import java.util.stream.Collectors; @Slf4j public class ConfigUtil { private static final ObjectMapper JACKSON_MAPPER = new ObjectMapper(); @SuppressWarnings("unchecked") public static T convertValue(Object rawValue, Option option) { TypeReference typeReference = option.typeReference(); if (typeReference.getType() instanceof Class) { // simple type Class clazz = (Class) typeReference.getType(); if (clazz.equals(rawValue.getClass())) { return (T) rawValue; } try { return convertValue(rawValue, clazz); } catch (IllegalArgumentException e) { // Continue with Jackson parsing } } try { // complex type && untreated type return JACKSON_MAPPER.readValue(convertToJsonString(rawValue), typeReference); } catch (JsonProcessingException e) { if (typeReference.getType() instanceof ParameterizedType && List.class.equals( ((ParameterizedType) typeReference.getType()).getRawType())) { try { log.warn( "Option '{}' is a List, and it is recommended to configure 
it as [\"string1\",\"string2\"]; we will only use ',' to split the String into a list.", option.key()); return (T) convertToList( rawValue, (Class) ((ParameterizedType) typeReference.getType()) .getActualTypeArguments()[0]); } catch (Exception ignore) { // nothing } } throw new IllegalArgumentException( String.format( "Json parsing exception, value '%s', and expected type '%s'", rawValue, typeReference.getType().getTypeName()), e); } } static List convertToList(Object rawValue, Class clazz) { if (rawValue instanceof List) { return ((List) rawValue) .stream() .map(value -> convertValue(convertToJsonString(value), clazz)) .collect(Collectors.toList()); } return Arrays.stream(rawValue.toString().split(",")) .map(String::trim) .map(value -> convertValue(value, clazz)) .collect(Collectors.toList()); } @SuppressWarnings("unchecked") static T convertValue(Object rawValue, Class clazz) { if (Boolean.class.equals(clazz)) { return (T) convertToBoolean(rawValue); } else if (clazz.isEnum()) { return (T) convertToEnum(rawValue, (Class>) clazz); } else if (String.class.equals(clazz)) { return (T) convertToJsonString(rawValue); } else if (Integer.class.equals(clazz)) { return (T) convertToInt(rawValue); } else if (Long.class.equals(clazz)) { return (T) convertToLong(rawValue); } else if (Float.class.equals(clazz)) { return (T) convertToFloat(rawValue); } else if (Double.class.equals(clazz)) { return (T) convertToDouble(rawValue); } else if (Object.class.equals(clazz)) { return (T) rawValue; } throw new IllegalArgumentException("Unsupported type: " + clazz); } static Integer convertToInt(Object o) { if (o.getClass() == Integer.class) { return (Integer) o; } else if (o.getClass() == Long.class) { long value = (Long) o; if (value <= Integer.MAX_VALUE && value >= Integer.MIN_VALUE) { return (int) value; } else { throw new IllegalArgumentException( String.format( "Configuration value %s overflows/underflows the integer type.", value)); } } return Integer.parseInt(o.toString()); } 
static Long convertToLong(Object o) { if (o.getClass() == Long.class) { return (Long) o; } else if (o.getClass() == Integer.class) { return ((Integer) o).longValue(); } return Long.parseLong(o.toString()); } static Float convertToFloat(Object o) { if (o.getClass() == Float.class) { return (Float) o; } else if (o.getClass() == Double.class) { double value = ((Double) o); if (value == 0.0 || (value >= Float.MIN_VALUE && value <= Float.MAX_VALUE) || (value >= -Float.MAX_VALUE && value <= -Float.MIN_VALUE)) { return (float) value; } else { throw new IllegalArgumentException( String.format( "Configuration value %s overflows/underflows the float type.", value)); } } return Float.parseFloat(o.toString()); } static Double convertToDouble(Object o) { if (o.getClass() == Double.class) { return (Double) o; } else if (o.getClass() == Float.class) { return ((Float) o).doubleValue(); } return Double.parseDouble(o.toString()); } static Boolean convertToBoolean(Object o) { switch (o.toString().toUpperCase()) { case "TRUE": return true; case "FALSE": return false; default: throw new IllegalArgumentException( String.format( "Unrecognized option for boolean: %s. Expected either true or false(case insensitive)", o)); } } static > E convertToEnum(Object o, Class clazz) { return Arrays.stream(clazz.getEnumConstants()) .filter( e -> e.toString() .toUpperCase(Locale.ROOT) .equals(o.toString().toUpperCase(Locale.ROOT))) .findAny() .orElseThrow( () -> new IllegalArgumentException( String.format( "Could not parse value for enum %s. 
Expected one of: [%s]", clazz, Arrays.toString(clazz.getEnumConstants())))); } public static String convertToJsonString(Object o) { if (o == null) { return null; } if (o instanceof String) { return (String) o; } try { return JACKSON_MAPPER.writeValueAsString(o); } catch (JsonProcessingException e) { throw new IllegalArgumentException(String.format("Could not parse json, value: %s", o)); } } public static String convertToJsonString(Config config) { return convertToJsonString(config.root().unwrapped()); } public static Config convertToConfig(String configJson) { return ConfigFactory.parseString(configJson); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/util/ConfigValidator.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.configuration.util; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.api.configuration.SingleChoiceOption; import org.apache.commons.collections4.CollectionUtils; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Objects; import static org.apache.seatunnel.api.configuration.util.OptionUtil.getOptionKeys; public class ConfigValidator { private final ReadonlyConfig config; private ConfigValidator(ReadonlyConfig config) { this.config = config; } public static ConfigValidator of(ReadonlyConfig config) { return new ConfigValidator(config); } public void validate(OptionRule rule) { List requiredOptions = rule.getRequiredOptions(); for (RequiredOption requiredOption : requiredOptions) { validate(requiredOption); for (Option option : requiredOption.getOptions()) { if (SingleChoiceOption.class.isAssignableFrom(option.getClass())) { // is required option and not match condition, skip validate if (isConditionOption(requiredOption) && !matchCondition( (RequiredOption.ConditionalRequiredOptions) requiredOption)) { continue; } validateSingleChoice(option); } } } for (Option option : rule.getOptionalOptions()) { if (SingleChoiceOption.class.isAssignableFrom(option.getClass())) { validateSingleChoice(option); } } } void validateSingleChoice(Option option) { SingleChoiceOption singleChoiceOption = (SingleChoiceOption) option; List optionValues = singleChoiceOption.getOptionValues(); if (CollectionUtils.isEmpty(optionValues)) { throw new OptionValidationException( "These options(%s) are SingleChoiceOption, the optionValues must not be null.", getOptionKeys(Arrays.asList(singleChoiceOption))); } Object o = singleChoiceOption.defaultValue(); if (o != null && !optionValues.contains(o)) { throw new OptionValidationException( "These options(%s) are SingleChoiceOption, the defaultValue(%s) must be one of the 
optionValues(%s).", getOptionKeys(Arrays.asList(singleChoiceOption)), o, optionValues); } Object value = config.get(option); if (value != null && !optionValues.contains(value)) { throw new OptionValidationException( "These options(%s) are SingleChoiceOption, the value(%s) must be one of the optionValues(%s).", getOptionKeys(Arrays.asList(singleChoiceOption)), value, optionValues); } } void validate(RequiredOption requiredOption) { if (requiredOption instanceof RequiredOption.AbsolutelyRequiredOptions) { validate((RequiredOption.AbsolutelyRequiredOptions) requiredOption); return; } if (requiredOption instanceof RequiredOption.BundledRequiredOptions) { validate((RequiredOption.BundledRequiredOptions) requiredOption); return; } if (requiredOption instanceof RequiredOption.ExclusiveRequiredOptions) { validate((RequiredOption.ExclusiveRequiredOptions) requiredOption); return; } if (isConditionOption(requiredOption)) { validate((RequiredOption.ConditionalRequiredOptions) requiredOption); return; } throw new UnsupportedOperationException( String.format( "This type option(%s) of validation is not supported", requiredOption.getClass())); } private List> getAbsentOptions(List> requiredOption) { List> absent = new ArrayList<>(); for (Option option : requiredOption) { // If the required option have default values, we will take the default values if (!hasOption(option) && option.defaultValue() == null) { absent.add(option); } } return absent; } void validate(RequiredOption.AbsolutelyRequiredOptions requiredOption) { List> absentOptions = getAbsentOptions(requiredOption.getRequiredOption()); if (absentOptions.size() == 0) { return; } throw new OptionValidationException( "There are unconfigured options, the options(%s) are required.", getOptionKeys(absentOptions)); } boolean hasOption(Option option) { return config.getOptional(option).isPresent(); } boolean validate(RequiredOption.BundledRequiredOptions bundledRequiredOptions) { List> bundledOptions = 
bundledRequiredOptions.getRequiredOption(); List> present = new ArrayList<>(); List> absent = new ArrayList<>(); for (Option option : bundledOptions) { if (hasOption(option)) { present.add(option); } else { absent.add(option); } } if (present.size() == bundledOptions.size()) { return true; } if (absent.size() == bundledOptions.size()) { return false; } throw new OptionValidationException( "These options(%s) are bundled, must be present or absent together. The options present are: %s. The options absent are %s.", getOptionKeys(bundledOptions), getOptionKeys(present), getOptionKeys(absent)); } void validate(RequiredOption.ExclusiveRequiredOptions exclusiveRequiredOptions) { List> presentOptions = new ArrayList<>(); for (Option option : exclusiveRequiredOptions.getExclusiveOptions()) { if (hasOption(option)) { presentOptions.add(option); } } int count = presentOptions.size(); if (count == 1) { return; } if (count == 0) { throw new OptionValidationException( "There are unconfigured options, these options(%s) are mutually exclusive, allowing only one set(\"[] for a set\") of options to be configured.", getOptionKeys(exclusiveRequiredOptions.getExclusiveOptions())); } if (count > 1) { throw new OptionValidationException( "These options(%s) are mutually exclusive, allowing only one set(\"[] for a set\") of options to be configured.", getOptionKeys(presentOptions)); } } void validate(RequiredOption.ConditionalRequiredOptions conditionalRequiredOptions) { boolean match = matchCondition(conditionalRequiredOptions); if (!match) { return; } List> absentOptions = getAbsentOptions(conditionalRequiredOptions.getRequiredOption()); if (absentOptions.size() == 0) { return; } throw new OptionValidationException( "There are unconfigured options, the options(%s) are required because [%s] is true.", getOptionKeys(absentOptions), conditionalRequiredOptions.getExpression().toString()); } private boolean validate(Expression expression) { Condition condition = expression.getCondition(); 
boolean match = validate(condition); if (!expression.hasNext()) { return match; } if (expression.and()) { return match && validate(expression.getNext()); } else { return match || validate(expression.getNext()); } } private boolean validate(Condition condition) { Option option = condition.getOption(); boolean match = Objects.equals(condition.getExpectValue(), config.get(option)); if (!condition.hasNext()) { return match; } if (condition.and()) { return match && validate(condition.getNext()); } else { return match || validate(condition.getNext()); } } private boolean isConditionOption(RequiredOption requiredOption) { return requiredOption instanceof RequiredOption.ConditionalRequiredOptions; } private boolean matchCondition( RequiredOption.ConditionalRequiredOptions conditionalRequiredOptions) { Expression expression = conditionalRequiredOptions.getExpression(); return validate(expression); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/util/Expression.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.configuration.util; import org.apache.seatunnel.api.configuration.Option; import java.util.Objects; public class Expression { private final Condition condition; private Boolean and = null; private Expression next = null; Expression(Condition condition) { this.condition = condition; } public static Expression of(Option option, T expectValue) { return new Expression(Condition.of(option, expectValue)); } public static Expression of(Condition condition) { return new Expression(condition); } public Expression and(Expression next) { addExpression(true, next); return this; } public Expression or(Expression next) { addExpression(false, next); return this; } private void addExpression(boolean and, Expression next) { Expression tail = getTailExpression(); tail.and = and; tail.next = next; } private Expression getTailExpression() { return hasNext() ? this.next.getTailExpression() : this; } public Condition getCondition() { return condition; } public boolean hasNext() { return this.next != null; } public Expression getNext() { return this.next; } public Boolean and() { return this.and; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (!(obj instanceof Expression)) { return false; } Expression that = (Expression) obj; return Objects.equals(this.condition, that.condition) && Objects.equals(this.and, that.and) && Objects.equals(this.next, that.next); } @Override public int hashCode() { return Objects.hash(this.condition, this.and, this.next); } @Override public String toString() { Expression cur = this; StringBuilder builder = new StringBuilder(); boolean bracket = false; do { if (cur.condition.getCount() > 1) { builder.append("(").append(cur.condition).append(")"); } else { builder.append(cur.condition); } if (bracket) { builder = new StringBuilder(String.format("(%s)", builder)); bracket = false; } if (cur.hasNext()) { if (cur.next.hasNext() && !cur.and.equals(cur.next.and)) { bracket = true; } 
builder.append(cur.and ? " && " : " || "); } cur = cur.next; } while (cur != null); return builder.toString(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/util/OptionMark.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.configuration.util; import java.lang.annotation.Documented; import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; @Retention(RetentionPolicy.RUNTIME) @Documented @Target(ElementType.FIELD) public @interface OptionMark { /** * The key of the option, if not configured, we will default convert `lowerCamelCase` to * `under_score_case` and provide it to users */ String name() default ""; /** The description of the option */ String description() default ""; } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/util/OptionRule.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.configuration.util; import org.apache.seatunnel.api.configuration.Option; import lombok.NonNull; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Objects; import java.util.concurrent.atomic.AtomicBoolean; /** * Validation rule for {@link Option}. * *

    The option rule is typically built in one of the following pattern: * *

    {@code
     * // simple rule
     * OptionRule simpleRule = OptionRule.builder()
     *     .optional(POLL_TIMEOUT, POLL_INTERVAL)
     *     .required(CLIENT_SERVICE_URL)
     *     .build();
     *
     * // basic full rule
     * OptionRule fullRule = OptionRule.builder()
     *     .optional(POLL_TIMEOUT, POLL_INTERVAL, CURSOR_STARTUP_MODE)
     *     .required(CLIENT_SERVICE_URL, ADMIN_SERVICE_URL)
     *     .exclusive(TOPIC_PATTERN, TOPIC)
     *     .conditional(CURSOR_STARTUP_MODE, StartMode.TIMESTAMP, CURSOR_STARTUP_TIMESTAMP)
     *     .build();
     *
     * // complex conditional rule
     * // moot expression
     * Expression expression = Expression.of(TOPIC_DISCOVERY_INTERVAL, 200)
     *     .and(Expression.of(Condition.of(CURSOR_STARTUP_MODE, StartMode.EARLIEST)
     *         .or(CURSOR_STARTUP_MODE, StartMode.LATEST)))
     *     .or(Expression.of(Condition.of(TOPIC_DISCOVERY_INTERVAL, 100)))
     *
     * OptionRule complexRule = OptionRule.builder()
     *     .optional(POLL_TIMEOUT, POLL_INTERVAL, CURSOR_STARTUP_MODE)
     *     .required(CLIENT_SERVICE_URL, ADMIN_SERVICE_URL)
     *     .exclusive(TOPIC_PATTERN, TOPIC)
     *     .conditional(expression, CURSOR_RESET_MODE)
     *     .build();
     * }
    */ public class OptionRule { /** * Optional options with default value. * *

    This options will not be validated. * *

    This is used by the web-UI to show what options are available. */ private final List> optionalOptions; /** * Required options with no default value. * *

    Verify that the option is valid through the defined rules. */ private final List requiredOptions; OptionRule(List> optionalOptions, List requiredOptions) { this.optionalOptions = optionalOptions; this.requiredOptions = requiredOptions; } public List> getOptionalOptions() { return optionalOptions; } public List getRequiredOptions() { return requiredOptions; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (!(o instanceof OptionRule)) { return false; } OptionRule that = (OptionRule) o; return Objects.equals(optionalOptions, that.optionalOptions) && Objects.equals(requiredOptions, that.requiredOptions); } @Override public int hashCode() { return Objects.hash(optionalOptions, requiredOptions); } public static OptionRule.Builder builder() { return new OptionRule.Builder(); } /** Builder for {@link OptionRule}. */ public static class Builder { private final List> optionalOptions = new ArrayList<>(); private final List requiredOptions = new ArrayList<>(); private Builder() {} /** * Optional options * *

    This options will not be validated. * *

    This is used by the web-UI to show what options are available. */ public Builder optional(@NonNull Option... options) { for (Option option : options) { verifyOptionOptionsDuplicate(option, "OptionsOption"); } this.optionalOptions.addAll(Arrays.asList(options)); return this; } /** Absolutely required options without any constraints. */ public Builder required(@NonNull Option... options) { RequiredOption.AbsolutelyRequiredOptions requiredOption = RequiredOption.AbsolutelyRequiredOptions.of(options); verifyRequiredOptionDuplicate(requiredOption); this.requiredOptions.add(requiredOption); return this; } /** Exclusive options, only one of the options needs to be configured. */ public Builder exclusive(@NonNull Option... options) { if (options.length <= 1) { throw new OptionValidationException( "The number of exclusive options must be greater than 1."); } RequiredOption.ExclusiveRequiredOptions exclusiveRequiredOption = RequiredOption.ExclusiveRequiredOptions.of(options); verifyRequiredOptionDuplicate(exclusiveRequiredOption); this.requiredOptions.add(exclusiveRequiredOption); return this; } public Builder conditional( @NonNull Option conditionalOption, @NonNull List expectValues, @NonNull Option... 
requiredOptions) { verifyConditionalExists(conditionalOption); if (expectValues.isEmpty()) { throw new OptionValidationException( String.format( "conditional option '%s' must have expect values .", conditionalOption.key())); } /** Each parameter can only be controlled by one other parameter */ Expression expression = Expression.of(Condition.of(conditionalOption, expectValues.get(0))); for (int i = 0; i < expectValues.size(); i++) { if (i != 0) { expression = expression.or( Expression.of( Condition.of(conditionalOption, expectValues.get(i)))); } } RequiredOption.ConditionalRequiredOptions option = RequiredOption.ConditionalRequiredOptions.of( expression, new ArrayList<>(Arrays.asList(requiredOptions))); verifyRequiredOptionDuplicate(option, true); this.requiredOptions.add(option); return this; } public Builder conditional( @NonNull Option conditionalOption, @NonNull T expectValue, @NonNull Option... requiredOptions) { verifyConditionalExists(conditionalOption); /** Each parameter can only be controlled by one other parameter */ Expression expression = Expression.of(Condition.of(conditionalOption, expectValue)); RequiredOption.ConditionalRequiredOptions conditionalRequiredOption = RequiredOption.ConditionalRequiredOptions.of( expression, new ArrayList<>(Arrays.asList(requiredOptions))); verifyRequiredOptionDuplicate(conditionalRequiredOption, true); this.requiredOptions.add(conditionalRequiredOption); return this; } /** Bundled options, must be present or absent together. */ public Builder bundled(@NonNull Option... 
requiredOptions) { RequiredOption.BundledRequiredOptions bundledRequiredOption = RequiredOption.BundledRequiredOptions.of(requiredOptions); verifyRequiredOptionDuplicate(bundledRequiredOption); this.requiredOptions.add(bundledRequiredOption); return this; } public OptionRule build() { return new OptionRule(optionalOptions, requiredOptions); } private void verifyRequiredOptionDefaultValue(@NonNull Option option) { if (option.defaultValue() != null) { throw new OptionValidationException( String.format( "Required option '%s' should have no default value.", option.key())); } } private void verifyDuplicateWithOptionOptions( @NonNull Option option, @NonNull String currentOptionType) { if (optionalOptions.contains(option)) { throw new OptionValidationException( String.format( "%s '%s' duplicate in option options.", currentOptionType, option.key())); } } private void verifyRequiredOptionDuplicate(@NonNull RequiredOption requiredOption) { verifyRequiredOptionDuplicate(requiredOption, false); } /** * Verifies if there are duplicate options within the required options. 
* * @param requiredOption The required option to be verified * @param ignoreVerifyDuplicateOptions Whether to ignore duplicate option verification If * the value is true, the existing items in OptionOptions are ignored Currently, it * applies only to conditional * @throws OptionValidationException If duplicate options are found */ private void verifyRequiredOptionDuplicate( @NonNull RequiredOption requiredOption, @NonNull Boolean ignoreVerifyDuplicateOptions) { requiredOption .getOptions() .forEach( option -> { if (!ignoreVerifyDuplicateOptions) { // Check if required option that duplicate with option options verifyDuplicateWithOptionOptions( option, requiredOption.getClass().getSimpleName()); } requiredOptions.forEach( ro -> { if (ro instanceof RequiredOption .ConditionalRequiredOptions && requiredOption instanceof RequiredOption .ConditionalRequiredOptions) { Option requiredOptionCondition = ((RequiredOption.ConditionalRequiredOptions) requiredOption) .getExpression() .getCondition() .getOption(); Option roOptionCondition = ((RequiredOption.ConditionalRequiredOptions) ro) .getExpression() .getCondition() .getOption(); if (ro.getOptions().contains(option) && !requiredOptionCondition.equals( roOptionCondition)) { throw new OptionValidationException( String.format( "%s '%s' duplicate in %s options.", requiredOption .getClass() .getSimpleName(), option.key(), ro.getClass().getSimpleName())); } } else { if (ro.getOptions().contains(option)) { throw new OptionValidationException( String.format( "%s '%s' duplicate in %s options.", requiredOption .getClass() .getSimpleName(), option.key(), ro.getClass().getSimpleName())); } } }); }); } private void verifyOptionOptionsDuplicate( @NonNull Option option, @NonNull String currentOptionType) { verifyDuplicateWithOptionOptions(option, currentOptionType); requiredOptions.forEach( requiredOption -> { if (requiredOption.getOptions().contains(option)) { throw new OptionValidationException( String.format( "%s '%s' duplicate in 
'%s'.", currentOptionType, option.key(), requiredOption.getClass().getSimpleName())); } }); } private void verifyConditionalExists(@NonNull Option option) { boolean inOptions = optionalOptions.contains(option); AtomicBoolean inRequired = new AtomicBoolean(false); requiredOptions.forEach( requiredOption -> { if (requiredOption.getOptions().contains(option)) { inRequired.set(true); } }); if (!inOptions && !inRequired.get()) { throw new OptionValidationException( String.format("Conditional '%s' not found in options.", option.key())); } } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/util/OptionUtil.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/
package org.apache.seatunnel.api.configuration.util;

import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference;
import org.apache.seatunnel.shade.org.apache.commons.lang3.StringUtils;

import org.apache.seatunnel.api.configuration.Option;

import java.lang.reflect.Field;
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/** Static helpers for rendering option keys and for deriving options from annotated fields. */
public class OptionUtil {

    private OptionUtil() {}

    /** Renders the keys as {@code 'key1', 'key2', ...}; an empty list yields an empty string. */
    public static String getOptionKeys(List<Option<?>> options) {
        StringBuilder builder = new StringBuilder();
        boolean first = true;
        for (Option<?> option : options) {
            if (!first) {
                builder.append(", ");
            }
            builder.append("'").append(option.key()).append("'");
            first = false;
        }
        return builder.toString();
    }

    /**
     * Renders option sets as {@code ['a'], ['b', 'c'], ...}: every plain option becomes a set of
     * its own, followed by one set per bundled group.
     */
    public static String getOptionKeys(
            List<Option<?>> options, List<RequiredOption.BundledRequiredOptions> bundledOptions) {
        List<List<Option<?>>> optionList = new ArrayList<>();
        for (Option<?> option : options) {
            optionList.add(Collections.singletonList(option));
        }
        for (RequiredOption.BundledRequiredOptions bundledOption : bundledOptions) {
            optionList.add(bundledOption.getRequiredOption());
        }
        boolean first = true;
        StringBuilder builder = new StringBuilder();
        for (List<Option<?>> optionSet : optionList) {
            if (!first) {
                builder.append(", ");
            }
            builder.append("[").append(getOptionKeys(optionSet)).append("]");
            first = false;
        }
        return builder.toString();
    }

    /**
     * Builds one {@link Option} per field of {@code clazz} annotated with {@link OptionMark}.
     *
     * <p>The key is the annotation's {@code name}, or the field name converted to under_score_case
     * when the name is blank. The default value is the field's value on an instance created
     * through the no-arg constructor.
     *
     * @throws InstantiationException if {@code clazz} cannot be instantiated
     * @throws IllegalAccessException if the constructor or a field is not accessible
     */
    public static List<Option<?>> getOptions(Class<?> clazz)
            throws InstantiationException, IllegalAccessException {
        Field[] fields = clazz.getDeclaredFields();
        List<Option<?>> options = new ArrayList<>();
        Object object = clazz.newInstance();
        for (Field field : fields) {
            field.setAccessible(true);
            OptionMark option = field.getAnnotation(OptionMark.class);
            if (option == null) {
                continue;
            }
            String key =
                    StringUtils.isBlank(option.name())
                            ? formatUnderScoreCase(field.getName())
                            : option.name();
            options.add(
                    new Option<>(
                                    key,
                                    new TypeReference<Object>() {
                                        @Override
                                        public Type getType() {
                                            return field.getType();
                                        }
                                    },
                                    field.get(object))
                            .withDescription(option.description()));
        }
        return options;
    }

    /**
     * Lower-cases the first character and replaces every later character that is not a lower-case
     * letter with {@code '_'} followed by its lower-case form.
     */
    private static String formatUnderScoreCase(String camel) {
        StringBuilder underScore =
                new StringBuilder(String.valueOf(Character.toLowerCase(camel.charAt(0))));
        for (int i = 1; i < camel.length(); i++) {
            char c = camel.charAt(i);
            if (Character.isLowerCase(c)) {
                underScore.append(c);
            } else {
                underScore.append('_').append(Character.toLowerCase(c));
            }
        }
        return underScore.toString();
    }
}
 ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/util/OptionValidationException.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.configuration.util; import org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; /** Exception for all errors occurring during option validation phase.
*/
public class OptionValidationException extends SeaTunnelRuntimeException {

    /**
     * Creates an exception with the given message and underlying cause.
     *
     * @param message description of the validation failure
     * @param cause the error that triggered this failure
     */
    public OptionValidationException(String message, Throwable cause) {
        super(SeaTunnelAPIErrorCode.OPTION_VALIDATION_FAILED, message, cause);
    }

    /**
     * Creates an exception with the given message.
     *
     * @param message description of the validation failure
     */
    public OptionValidationException(String message) {
        super(SeaTunnelAPIErrorCode.OPTION_VALIDATION_FAILED, message);
    }

    /**
     * Creates an exception whose message is {@code String.format(formatMessage, args)}.
     *
     * @param formatMessage a {@link String#format(String, Object...)} pattern
     * @param args arguments substituted into the pattern
     */
    public OptionValidationException(String formatMessage, Object... args) {
        super(SeaTunnelAPIErrorCode.OPTION_VALIDATION_FAILED, String.format(formatMessage, args));
    }

    /**
     * Creates an exception reporting that the given option is incorrectly configured; the message
     * contains the option's key and its description.
     *
     * @param option the option that failed validation
     */
    public OptionValidationException(Option<?> option) {
        super(
                SeaTunnelAPIErrorCode.OPTION_VALIDATION_FAILED,
                String.format(
                        "The option(\"%s\") is incorrectly configured, please refer to the doc: %s",
                        option.key(), option.getDescription()));
    }
}

================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/util/RequiredOption.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.configuration.util; import org.apache.seatunnel.api.configuration.Option; import lombok.Getter; import lombok.NonNull; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Objects; import static org.apache.seatunnel.api.configuration.util.OptionUtil.getOptionKeys; public interface RequiredOption { List> getOptions(); /** These options are mutually exclusive, allowing only one set of options to be configured. */ @Getter class ExclusiveRequiredOptions implements RequiredOption { private final List> exclusiveOptions; public ExclusiveRequiredOptions(@NonNull List> exclusiveOptions) { this.exclusiveOptions = exclusiveOptions; } public static ExclusiveRequiredOptions of(Option... options) { return new ExclusiveRequiredOptions(new ArrayList<>(Arrays.asList(options))); } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (!(obj instanceof ExclusiveRequiredOptions)) { return false; } ExclusiveRequiredOptions that = (ExclusiveRequiredOptions) obj; return Objects.equals(this.exclusiveOptions, that.exclusiveOptions); } @Override public int hashCode() { return Objects.hash(exclusiveOptions); } @Override public String toString() { return String.format( "Exclusive required set options: %s", getOptionKeys(exclusiveOptions)); } @Override public List> getOptions() { return exclusiveOptions; } } /** The option is required. */ class AbsolutelyRequiredOptions implements RequiredOption { @Getter private final List> requiredOption; AbsolutelyRequiredOptions(List> requiredOption) { this.requiredOption = requiredOption; } public static AbsolutelyRequiredOptions of(Option... 
requiredOption) { return new AbsolutelyRequiredOptions(new ArrayList<>(Arrays.asList(requiredOption))); } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (!(obj instanceof AbsolutelyRequiredOptions)) { return false; } AbsolutelyRequiredOptions that = (AbsolutelyRequiredOptions) obj; return Objects.equals(this.requiredOption, that.requiredOption); } @Override public int hashCode() { return this.requiredOption.hashCode(); } @Override public String toString() { return String.format( "Absolutely required options: '%s'", getOptionKeys(requiredOption)); } @Override public List> getOptions() { return requiredOption; } } class ConditionalRequiredOptions implements RequiredOption { private final Expression expression; private final List> requiredOption; ConditionalRequiredOptions(Expression expression, List> requiredOption) { this.expression = expression; this.requiredOption = requiredOption; } public static ConditionalRequiredOptions of( Expression expression, List> requiredOption) { return new ConditionalRequiredOptions(expression, requiredOption); } public static ConditionalRequiredOptions of( Condition condition, List> requiredOption) { return new ConditionalRequiredOptions(Expression.of(condition), requiredOption); } public Expression getExpression() { return expression; } public List> getRequiredOption() { return requiredOption; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (!(obj instanceof ConditionalRequiredOptions)) { return false; } ConditionalRequiredOptions that = (ConditionalRequiredOptions) obj; return Objects.equals(this.expression, that.expression) && Objects.equals(this.requiredOption, that.requiredOption); } @Override public int hashCode() { return this.requiredOption.hashCode(); } @Override public String toString() { return String.format( "Condition expression: %s, Required options: %s", expression, getOptionKeys(requiredOption)); } @Override public List> getOptions() { return 
requiredOption; } } /** These options are bundled, must be present or absent together. */ class BundledRequiredOptions implements RequiredOption { private final List> requiredOption; BundledRequiredOptions(List> requiredOption) { this.requiredOption = requiredOption; } public static BundledRequiredOptions of(Option... requiredOption) { return new BundledRequiredOptions(new ArrayList<>(Arrays.asList(requiredOption))); } public static BundledRequiredOptions of(List> requiredOption) { return new BundledRequiredOptions(requiredOption); } public List> getRequiredOption() { return requiredOption; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (!(obj instanceof BundledRequiredOptions)) { return false; } BundledRequiredOptions that = (BundledRequiredOptions) obj; return Objects.equals(this.requiredOption, that.requiredOption); } @Override public int hashCode() { return this.requiredOption.hashCode(); } @Override public String toString() { return String.format("Bundled Required options: %s", getOptionKeys(requiredOption)); } @Override public List> getOptions() { return requiredOption; } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/env/ParsingMode.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.env;

import org.apache.seatunnel.api.table.catalog.Catalog;
import org.apache.seatunnel.api.table.catalog.CatalogTable;

/**
 * Multiple parsing modes for converting multi-{@link CatalogTable} retrieved through the {@link
 * Catalog} into DAG.
 */
public enum ParsingMode {
    /**
     * Each table is processed using a separate Source and Sink.
     *
     * <pre>
     * customer -> source(customer) -> sink(customer)
     * product  -> source(product)  -> sink(product)
     * stock    -> source(stock)    -> sink(stock)
     * </pre>
     */
    SINGLENESS,
    /**
     * Use a Source and Sink to process sharding-table.
     *
     * <pre>
     * customer1
     * customer2 --> customer\\d+ --> source(customer\\d+) -> sink(customer)
     * customer3
     * </pre>
     */
    SHARDING,
    /**
     * Multiple tables are processed using a single source, each table using a separate sink.
     *
     * <pre>
     * customer                   -> sink(customer)
     * product   --> source(.*)   -> sink(product)
     * stock                      -> sink(stock)
     * </pre>
     */
    @Deprecated
    MULTIPLEX
}

================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/event/DefaultEventProcessor.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.event;

import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;

import java.util.List;

/**
 * Event processor that forwards every event to a fixed list of {@link EventHandler}s. When used as
 * an {@link EventListener} it first stamps the event with this processor's job id, if one was
 * supplied.
 */
@Slf4j
@AllArgsConstructor
public class DefaultEventProcessor implements EventListener, EventProcessor {
    // May be null; in that case onEvent leaves the event's job id untouched.
    private final String jobId;
    private final List<EventHandler> handlers;

    /** Creates a processor without a job id, loading handlers with this class's class loader. */
    public DefaultEventProcessor() {
        this(DefaultEventProcessor.class.getClassLoader());
    }

    /** Creates a processor for the given job, loading handlers with this class's class loader. */
    public DefaultEventProcessor(String jobId) {
        this(jobId, EventProcessor.loadEventHandlers(DefaultEventProcessor.class.getClassLoader()));
    }

    /** Creates a processor without a job id, loading handlers with the given class loader. */
    public DefaultEventProcessor(ClassLoader classLoader) {
        this(null, EventProcessor.loadEventHandlers(classLoader));
    }

    /** Passes the event to every handler, in list order. */
    @Override
    public void process(Event event) {
        handlers.forEach(listener -> listener.handle(event));
    }

    /** Sets this processor's job id on the event (when one is configured), then processes it. */
    @Override
    public void onEvent(Event event) {
        if (jobId != null) {
            event.setJobId(jobId);
        }
        process(event);
    }

    /** Closes all handlers via {@link EventProcessor#close(List)}. */
    @Override
    public void close() throws Exception {
        log.info("Closing event handlers.");
        EventProcessor.close(handlers);
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/event/Event.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.event;

import java.io.Serializable;

/** A serializable event that carries a creation time, a job id and an {@link EventType}. */
public interface Event extends Serializable {

    /** Returns the creation time of this event (unit not visible here; presumably epoch millis). */
    long getCreatedTime();

    /** Associates this event with the given job id. */
    void setJobId(String jobId);

    /** Returns the job id associated with this event. */
    String getJobId();

    /** Returns the type of this event. */
    EventType getEventType();
}

================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/event/EventHandler.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.event;

import java.io.Serializable;

/** A serializable, closeable receiver of {@link Event}s. */
public interface EventHandler extends Serializable, AutoCloseable {
    /**
     * Receive and handle the event data.
     *
     * @param event the event to handle
     */
    void handle(Event event);

    /** Releases any resources held by this handler; the default implementation does nothing. */
    @Override
    default void close() throws Exception {}
}

================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/event/EventListener.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.seatunnel.api.event;

import java.io.Serializable;

/** A serializable callback that is notified of {@link Event}s. */
public interface EventListener extends Serializable {

    /**
     * Called when an event is emitted.
     *
     * @param event the emitted event
     */
    void onEvent(Event event);
}

================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/event/EventProcessor.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.event; import java.util.LinkedList; import java.util.List; import java.util.ServiceConfigurationError; import java.util.ServiceLoader; public interface EventProcessor extends AutoCloseable { void process(Event event); static List loadEventHandlers(ClassLoader classLoader) { try { List result = new LinkedList<>(); ServiceLoader.load(EventHandler.class, classLoader) .iterator() .forEachRemaining(result::add); return result; } catch (ServiceConfigurationError e) { throw new RuntimeException("Could not load service provider for event handlers.", e); } } static void close(List handlers) throws Exception { if (handlers != null) { for (EventHandler handler : handlers) { handler.close(); } } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/event/EventType.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/

package org.apache.seatunnel.api.event;

/** The kinds of {@link Event} that can be emitted. */
public enum EventType {
    // Schema change events.
    SCHEMA_CHANGE_ADD_COLUMN,
    SCHEMA_CHANGE_DROP_COLUMN,
    SCHEMA_CHANGE_MODIFY_COLUMN,
    SCHEMA_CHANGE_CHANGE_COLUMN,
    SCHEMA_CHANGE_UPDATE_COLUMNS,
    SCHEMA_CHANGE_RENAME_TABLE,
    // Lifecycle events of enumerators, readers and writers.
    LIFECYCLE_ENUMERATOR_OPEN,
    LIFECYCLE_ENUMERATOR_CLOSE,
    LIFECYCLE_READER_OPEN,
    LIFECYCLE_READER_CLOSE,
    LIFECYCLE_WRITER_CLOSE,
    // Reader message delay notification.
    READER_MESSAGE_DELAYED,
    // Job status notification.
    JOB_STATUS
}

================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/event/LifecycleEvent.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.event;

/** Marker interface for lifecycle events; it declares no members beyond {@link Event}. */
public interface LifecycleEvent extends Event {}

================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/event/LoggingEventHandler.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.event;

import com.google.auto.service.AutoService;
import lombok.extern.slf4j.Slf4j;

/** {@link EventHandler} that writes every received event to the log at INFO level. */
@Slf4j
@AutoService(EventHandler.class)
public class LoggingEventHandler implements EventHandler {

    /** Logs the event using its {@code toString()} representation. */
    @Override
    public void handle(Event event) {
        log.info("log event: {}", event);
    }
}

================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/metalake/MetaLakeFactory.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.metalake; import org.apache.seatunnel.api.metalake.gravitino.GravitinoClient; import org.apache.seatunnel.api.metalake.gravitino.GravitinoTableSchemaConvertor; import org.apache.seatunnel.common.constants.MetaLakeType; import java.util.HashMap; import java.util.Map; import java.util.function.Supplier; public class MetaLakeFactory { private static final Map> CLIENT_REGISTRY = new HashMap<>(); private static final Map> MAPPER_REGISTRY = new HashMap<>(); static { register(MetaLakeType.GRAVITINO.getType()); } private MetaLakeFactory() {} public static void register(String type) { CLIENT_REGISTRY.put(type.toLowerCase(), GravitinoClient::new); MAPPER_REGISTRY.put(type.toLowerCase(), GravitinoTableSchemaConvertor::new); } public static MetalakeClient createClient(MetaLakeType metaLakeType) { String type = metaLakeType.name().toLowerCase(); Supplier constructor = CLIENT_REGISTRY.get(type.toLowerCase()); if (constructor == null) { throw new IllegalArgumentException("Unknown MetalakeClient type: " + type); } return constructor.get(); } public static MetaLakeTableSchemaConvertor createTypeMapper(MetaLakeType metaLakeType) { String type = metaLakeType.name().toLowerCase(); Supplier constructor = MAPPER_REGISTRY.get(type.toLowerCase()); if (constructor == null) { throw new IllegalArgumentException("Unknown MetaLakeTypeMapper type: " + type); } return constructor.get(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/metalake/MetaLakeTableSchemaConvertor.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.metalake;

import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode;

import org.apache.seatunnel.api.table.catalog.CatalogTable;
import org.apache.seatunnel.api.table.catalog.TablePath;
import org.apache.seatunnel.api.table.catalog.TableSchema;

/** Converts metalake JSON metadata into SeaTunnel table schema objects. */
public interface MetaLakeTableSchemaConvertor {

    /**
     * Converts the given JSON metadata into a {@link TableSchema}.
     *
     * @param metaInfo JSON metadata (expected structure is implementation specific)
     * @return the converted table schema
     */
    TableSchema convertor(JsonNode metaInfo);

    /**
     * Builds a {@link CatalogTable} from a catalog name, a table path and a table schema.
     *
     * @param catalogName name of the catalog
     * @param tablePath path of the table
     * @param tableSchema schema of the table
     * @return the resulting catalog table
     */
    CatalogTable buildCatalogTable(
            String catalogName, TablePath tablePath, TableSchema tableSchema);
}

================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/metalake/MetalakeClient.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.seatunnel.api.metalake;

import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode;

import org.apache.seatunnel.api.table.catalog.TablePath;

import java.io.IOException;

/** Client for fetching metadata and table schemas from a metalake service. */
public interface MetalakeClient extends AutoCloseable {

    /**
     * Fetches the metadata for the given source id.
     *
     * @param sourceId identifier of the source whose metadata is requested
     * @param metalakeUrl URL of the metalake service (exact format is implementation specific)
     * @return the metadata as JSON
     * @throws IOException if the metadata cannot be retrieved
     */
    JsonNode getMetaInfo(String sourceId, String metalakeUrl) throws IOException;

    /**
     * Fetches the table schema located at the given URL.
     *
     * @param schemaHttpUrl URL of the table schema
     * @return the table schema as JSON
     * @throws IOException if the schema cannot be retrieved
     */
    JsonNode getTableSchema(String schemaHttpUrl) throws IOException;

    /**
     * Returns the table path that corresponds to the given schema URL.
     *
     * @param schemaHttpUrl URL of the table schema
     * @return the table path for that schema
     */
    TablePath getTableSchemaPath(String schemaHttpUrl);

    /** Closes this client; declared without checked exceptions. */
    @Override
    void close();
}

================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/metalake/MetalakeConfigUtils.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.metalake; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode; import org.apache.seatunnel.shade.com.typesafe.config.Config; import org.apache.seatunnel.shade.com.typesafe.config.ConfigList; import org.apache.seatunnel.shade.com.typesafe.config.ConfigObject; import org.apache.seatunnel.shade.com.typesafe.config.ConfigValue; import org.apache.seatunnel.shade.com.typesafe.config.ConfigValueFactory; import org.apache.seatunnel.shade.com.typesafe.config.ConfigValueType; import org.apache.seatunnel.api.options.EnvCommonOptions; import org.apache.seatunnel.common.Constants; import org.apache.seatunnel.common.constants.MetaLakeType; import org.apache.seatunnel.common.constants.PluginType; import org.apache.seatunnel.common.utils.PlaceholderUtils; import lombok.extern.slf4j.Slf4j; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; @Slf4j public class MetalakeConfigUtils { private static final String SOURCE_ID = "sourceId"; public static Config getMetalakeConfig(Config jobConfigTmp) { Config envConfig = jobConfigTmp.getConfig(Constants.ENV); boolean metalakeEnabled = envConfig.hasPath(EnvCommonOptions.METALAKE_ENABLED.key()) ? envConfig.getBoolean(EnvCommonOptions.METALAKE_ENABLED.key()) : Boolean.parseBoolean( System.getenv() .getOrDefault( EnvCommonOptions.METALAKE_ENABLED .key() .toUpperCase(), Boolean.toString(false))); if (!metalakeEnabled) return jobConfigTmp; Config update = jobConfigTmp; String metalakeType = envConfig.hasPath(EnvCommonOptions.METALAKE_TYPE.key()) ? envConfig.getString(EnvCommonOptions.METALAKE_TYPE.key()) : System.getenv(EnvCommonOptions.METALAKE_TYPE.key().toUpperCase()); String metalakeUrl = envConfig.hasPath(EnvCommonOptions.METALAKE_URL.key()) ? 
envConfig.getString(EnvCommonOptions.METALAKE_URL.key()) : System.getenv(EnvCommonOptions.METALAKE_URL.key().toUpperCase()); MetalakeClient metalakeClient = MetaLakeFactory.createClient(MetaLakeType.valueOf(metalakeType.toUpperCase())); update = replaceConfigList(update, PluginType.SOURCE.getType(), metalakeClient, metalakeUrl); update = replaceConfigList(update, PluginType.SINK.getType(), metalakeClient, metalakeUrl); update = replaceConfigList( update, PluginType.TRANSFORM.getType(), metalakeClient, metalakeUrl); return update; } private static Config replaceConfigList( Config updateConfig, String key, MetalakeClient metalakeClient, String metalakeUrl) { ConfigList list = updateConfig.getList(key); List newConfigList = new ArrayList<>(list); try { for (int i = 0; i < list.size(); i++) { ConfigObject Obj = (ConfigObject) list.get(i); if (Obj.containsKey(SOURCE_ID)) { ConfigObject tmp = Obj; String sourceId = Obj.toConfig().getString(SOURCE_ID); JsonNode metalakeJson = metalakeClient.getMetaInfo(sourceId, metalakeUrl); for (Map.Entry entry : Obj.entrySet()) { String subKey = entry.getKey(); ConfigValue value = entry.getValue(); if (value.valueType() == ConfigValueType.STRING) { String strValue = (String) value.unwrapped(); String newValue = PlaceholderUtils.replacePlaceholders(strValue, metalakeJson); tmp = tmp.withValue(subKey, ConfigValueFactory.fromAnyRef(newValue)); } } newConfigList.set(i, tmp); } } } catch (IOException e) { log.error("Fail to get MetaInfo", e); } return updateConfig.withValue(key, ConfigValueFactory.fromIterable(newConfigList)); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/metalake/TableSchemaDiscoverer.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.metalake; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode; import org.apache.seatunnel.shade.com.google.common.annotations.VisibleForTesting; import org.apache.seatunnel.shade.org.apache.commons.lang3.StringUtils; import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.api.options.ConnectorCommonOptions; import org.apache.seatunnel.api.options.EnvCommonOptions; import org.apache.seatunnel.api.options.table.CatalogOptions; import org.apache.seatunnel.api.options.table.ColumnOptions; import org.apache.seatunnel.api.options.table.FieldOptions; import org.apache.seatunnel.api.options.table.TableIdentifierOptions; import org.apache.seatunnel.api.options.table.TableSchemaOptions; import org.apache.seatunnel.api.table.catalog.CatalogTable; import org.apache.seatunnel.api.table.catalog.CatalogTableUtil; import org.apache.seatunnel.api.table.catalog.TableIdentifier; import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.api.table.catalog.TableSchema; import org.apache.seatunnel.api.table.factory.TableSourceFactoryContext; import org.apache.seatunnel.common.constants.MetaLakeType; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; import lombok.extern.slf4j.Slf4j; import java.io.IOException; import java.util.Collections; import java.util.List; import java.util.Map; import 
java.util.function.Predicate; import java.util.stream.Collectors; import static org.apache.seatunnel.api.table.schema.exception.SchemaEvolutionErrorCode.GET_META_LAKE_TABLE_SCHEMA_FAILED; @Slf4j public class TableSchemaDiscoverer implements AutoCloseable { private final ReadonlyConfig envOptions; private final ReadonlyConfig sourceOptions; private final String catalogName; private MetalakeClient metalakeClient; private final MetaLakeTableSchemaConvertor metaLakeTableSchemaConvertor; public TableSchemaDiscoverer(TableSourceFactoryContext context, String catalogName) { this.envOptions = context.getEnvOptions(); this.sourceOptions = context.getOptions(); this.catalogName = catalogName; if (enableMetaLakeClient(context.getOptions())) { this.metalakeClient = MetaLakeFactory.createClient(getMetaLakeType()); } this.metaLakeTableSchemaConvertor = MetaLakeFactory.createTypeMapper(getMetaLakeType()); } @VisibleForTesting protected TableSchemaDiscoverer( ReadonlyConfig envOptions, ReadonlyConfig sourceOptions, String catalogName, MetalakeClient metalakeClient, MetaLakeTableSchemaConvertor convertor) { this.envOptions = envOptions; this.sourceOptions = sourceOptions; this.catalogName = catalogName; this.metalakeClient = metalakeClient; this.metaLakeTableSchemaConvertor = convertor; } public List discoverTableSchemas() { // schema if (sourceOptions.getOptional(ConnectorCommonOptions.SCHEMA).isPresent()) { return Collections.singletonList(discoverTableSchema(sourceOptions)); } // table_config if (sourceOptions.getOptional(TableSchemaOptions.TABLE_CONFIGS).isPresent()) { return sourceOptions.get(TableSchemaOptions.TABLE_CONFIGS).stream() .map(ReadonlyConfig::fromMap) .map(this::discoverTableSchema) .collect(Collectors.toList()); } // table_list if (sourceOptions.getOptional(CatalogOptions.TABLE_LIST).isPresent()) { return sourceOptions.get(CatalogOptions.TABLE_LIST).stream() .map(ReadonlyConfig::fromMap) .map(this::discoverTableSchema) .collect(Collectors.toList()); } return 
Collections.singletonList(CatalogTableUtil.buildSimpleTextTable()); } private CatalogTable discoverTableSchema(ReadonlyConfig sourceOptions) { final Map schemaMap = sourceOptions.get(ConnectorCommonOptions.SCHEMA); ReadonlyConfig schemaConfig = ReadonlyConfig.fromMap(schemaMap); // fields or columns if (schemaConfig.getOptional(ColumnOptions.COLUMNS).isPresent() || sourceOptions.getOptional(FieldOptions.FIELDS).isPresent()) { return discoverTableSchemaFromConfig(sourceOptions); } // schema_url if (schemaConfig.getOptional(ColumnOptions.SCHEMA_URL).isPresent()) { return discoverTableSchemaFromMetaLake( schemaConfig.get(ColumnOptions.SCHEMA_URL), schemaConfig.get(TableIdentifierOptions.TABLE)); } return buildSimpleTextTable(schemaConfig); } private CatalogTable discoverTableSchemaFromConfig(ReadonlyConfig readonlyConfig) { return CatalogTableUtil.buildWithConfig(catalogName, readonlyConfig); } private CatalogTable discoverTableSchemaFromMetaLake(String schemaUrl, String configTablePath) { try { JsonNode schemaNode = metalakeClient.getTableSchema(schemaUrl); final TablePath tableSchemaPath; if (StringUtils.isNotEmpty(configTablePath)) { tableSchemaPath = TablePath.of(configTablePath); } else { tableSchemaPath = metalakeClient.getTableSchemaPath(schemaUrl); } final TableSchema tableSchema = metaLakeTableSchemaConvertor.convertor(schemaNode); return metaLakeTableSchemaConvertor.buildCatalogTable( catalogName, tableSchemaPath, tableSchema); } catch (IOException e) { String errorMsg = String.format( "Failed to get table schema from MetaLake. " + "Schema URL: %s, " + "Configured table path: %s, " + "Catalog name: %s, " + "Error: %s", schemaUrl, configTablePath != null ? 
configTablePath : "not configured", catalogName, e.getMessage()); throw new SeaTunnelRuntimeException( GET_META_LAKE_TABLE_SCHEMA_FAILED, new IOException(errorMsg, e)); } } private CatalogTable buildSimpleTextTable(ReadonlyConfig schemaConfig) { CatalogTable catalogTable = CatalogTableUtil.buildSimpleTextTable(); if (schemaConfig.getOptional(TableIdentifierOptions.TABLE).isPresent()) { String table = schemaConfig.get(TableIdentifierOptions.TABLE); return CatalogTable.of( TableIdentifier.of(catalogName, TablePath.of(table)), catalogTable); } return catalogTable; } @VisibleForTesting protected MetaLakeType getMetaLakeType() { // first source if (sourceOptions.getOptional(TableSchemaOptions.METALAKE_TYPE).isPresent()) { return sourceOptions.get(TableSchemaOptions.METALAKE_TYPE); } // second env if (envOptions != null) { if (envOptions.getOptional(EnvCommonOptions.METALAKE_TYPE).isPresent()) { return envOptions.get(EnvCommonOptions.METALAKE_TYPE); } } // third system if (StringUtils.isNotEmpty( System.getenv(EnvCommonOptions.METALAKE_TYPE.key().toUpperCase()))) { try { return MetaLakeType.valueOf( System.getenv(EnvCommonOptions.METALAKE_TYPE.key().toUpperCase())); } catch (Exception e) { log.warn( "The environment variable configuration is incorrect and automatically downgraded to GRAVITINO.", e); return MetaLakeType.GRAVITINO; } } // default return MetaLakeType.GRAVITINO; } @VisibleForTesting protected boolean enableMetaLakeClient(ReadonlyConfig sourceOptions) { // schema if (sourceOptions.getOptional(ConnectorCommonOptions.SCHEMA).isPresent()) { final Map schemaMap = sourceOptions.get(ConnectorCommonOptions.SCHEMA); ReadonlyConfig schemaConfig = ReadonlyConfig.fromMap(schemaMap); if (schemaConfig.getOptional(ColumnOptions.SCHEMA_URL).isPresent()) { return true; } } // table_config if (sourceOptions.getOptional(TableSchemaOptions.TABLE_CONFIGS).isPresent()) { return sourceOptions.get(TableSchemaOptions.TABLE_CONFIGS).stream() .map(ReadonlyConfig::fromMap) 
.anyMatch(this.getEnableMetaLakeClientPredicate()); } // table_list if (sourceOptions.getOptional(CatalogOptions.TABLE_LIST).isPresent()) { return sourceOptions.get(CatalogOptions.TABLE_LIST).stream() .map(ReadonlyConfig::fromMap) .anyMatch(this.getEnableMetaLakeClientPredicate()); } return false; } private Predicate getEnableMetaLakeClientPredicate() { return config -> { if (config.getOptional(ConnectorCommonOptions.SCHEMA).isPresent()) { final Map schemaMap = config.get(ConnectorCommonOptions.SCHEMA); ReadonlyConfig schemaConfig = ReadonlyConfig.fromMap(schemaMap); return schemaConfig.getOptional(ColumnOptions.SCHEMA_URL).isPresent(); } return false; }; } /** Close the metalake client and release resources. */ @Override public void close() { if (metalakeClient != null) { metalakeClient.close(); } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/metalake/gravitino/GravitinoClient.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.metalake.gravitino; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode; import org.apache.seatunnel.shade.com.google.common.annotations.VisibleForTesting; import org.apache.seatunnel.api.metalake.MetalakeClient; import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; import org.apache.seatunnel.common.utils.JsonUtils; import org.apache.seatunnel.common.utils.SeaTunnelException; import org.apache.http.HttpEntity; import org.apache.http.HttpStatus; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import lombok.extern.slf4j.Slf4j; import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; import static org.apache.seatunnel.api.table.schema.exception.SchemaEvolutionErrorCode.ERROR_INVALID_TABLE_URL; @Slf4j public class GravitinoClient implements MetalakeClient { private static final String HEADER_ACCEPT = "Accept"; private static final String MEDIA_TYPE_GRAVITINO_V1 = "application/vnd.gravitino.v1+json"; private static final String JSON_FIELD_CATALOG = "catalog"; private static final String JSON_FIELD_TABLE = "table"; private static final String JSON_FIELD_PROPERTIES = "properties"; private static final String ERROR_NO_RESPONSE_ENTITY = "No response entity"; private static final String ERROR_MISSING_FIELD_TEMPLATE = "Response JSON has no '%s' field"; private static final int MAX_RETRY_ATTEMPTS = 3; private static final long RETRY_DELAY_MS = 2000; private static final Pattern TABLE_URL_PATTERN = Pattern.compile("/catalogs/([^/]+)/schemas/([^/]+)/tables/([^/]+)"); private final CloseableHttpClient httpClient; public GravitinoClient() { this.httpClient = HttpClients.createDefault(); } @VisibleForTesting 
protected GravitinoClient(CloseableHttpClient httpClient) { this.httpClient = httpClient; } @Override public JsonNode getMetaInfo(String sourceId, String metalakeUrl) throws IOException { JsonNode rootNode = executeGetRequest(metalakeUrl + sourceId); JsonNode catalogNode = getRequiredNode(rootNode, JSON_FIELD_CATALOG); return getRequiredNode(catalogNode, JSON_FIELD_PROPERTIES); } @Override public JsonNode getTableSchema(String schemaHttpUrl) throws IOException { JsonNode rootNode = executeGetRequest(schemaHttpUrl); return getRequiredNode(rootNode, JSON_FIELD_TABLE); } @Override public TablePath getTableSchemaPath(String schemaHttpUrl) { if (schemaHttpUrl == null || schemaHttpUrl.isEmpty()) { throw new SeaTunnelRuntimeException( ERROR_INVALID_TABLE_URL, "Table URL cannot be null or empty"); } final Matcher matcher = getMatcher(schemaHttpUrl); String catalogName = matcher.group(1); String schemaName = matcher.group(2); String tableName = matcher.group(3); return TablePath.of(catalogName, schemaName, tableName); } private Matcher getMatcher(String schemaHttpUrl) { Matcher matcher = TABLE_URL_PATTERN.matcher(schemaHttpUrl); if (!matcher.find()) { throw new SeaTunnelRuntimeException( ERROR_INVALID_TABLE_URL, String.format( "Invalid table URL format: '%s'. " + "Expected format: http://host/api/metalakes/{metalake}/catalogs/{catalog}/schemas/{schema}/tables/{table}", schemaHttpUrl)); } return matcher; } /** * Execute HTTP GET request and return parsed JSON response. Implements retry with exponential * backoff for transient failures. 
* * @param url the request URL * @return parsed JSON root node */ private JsonNode executeGetRequest(String url) { for (int attempt = 1; attempt <= MAX_RETRY_ATTEMPTS; attempt++) { HttpGet request = new HttpGet(url); request.addHeader(HEADER_ACCEPT, MEDIA_TYPE_GRAVITINO_V1); try (CloseableHttpResponse response = httpClient.execute(request)) { final int statusCode = response.getStatusLine().getStatusCode(); if (statusCode != HttpStatus.SC_OK) { if (!isRetryableHttpStatus(statusCode)) { throw new SeaTunnelException( String.format( "Failed to execute HTTP request to %s , http status code is %s", url, statusCode)); } else { sleepQuietly(RETRY_DELAY_MS); } } else { HttpEntity entity = response.getEntity(); if (entity == null) { throw new RuntimeException(ERROR_NO_RESPONSE_ENTITY); } try { return JsonUtils.readTree(entity.getContent()); } finally { EntityUtils.consume(entity); } } } catch (IOException e) { if (attempt >= MAX_RETRY_ATTEMPTS) { break; } // Exponential backoff delay before retry long delayMs = RETRY_DELAY_MS; log.debug( "HTTP request to {} failed on attempt {}/{}, retrying in {}ms: {}", url, attempt, MAX_RETRY_ATTEMPTS, delayMs, e.getMessage()); sleepQuietly(delayMs); } } throw new SeaTunnelException( String.format( "Failed to execute HTTP request to %s after %d attempts", url, MAX_RETRY_ATTEMPTS)); } /** 5xx and 408 and 429 will be retried */ private boolean isRetryableHttpStatus(int httpStatus) { return httpStatus == HttpStatus.SC_INTERNAL_SERVER_ERROR || httpStatus == HttpStatus.SC_NOT_IMPLEMENTED || httpStatus == HttpStatus.SC_BAD_GATEWAY || httpStatus == HttpStatus.SC_SERVICE_UNAVAILABLE || httpStatus == HttpStatus.SC_GATEWAY_TIMEOUT || httpStatus == HttpStatus.SC_HTTP_VERSION_NOT_SUPPORTED || httpStatus == HttpStatus.SC_INSUFFICIENT_STORAGE || httpStatus == HttpStatus.SC_REQUEST_TIMEOUT || httpStatus == HttpStatus.SC_TOO_MANY_REQUESTS; } /** * Sleep without throwing InterruptedException. 
If interrupted, the thread's interrupt status * will be restored. * * @param millis sleep duration in milliseconds */ private void sleepQuietly(long millis) { try { Thread.sleep(millis); } catch (InterruptedException e) { Thread.currentThread().interrupt(); log.debug("Sleep interrupted during retry backoff", e); } } /** * Get a required child node from parent node, throw exception if not found. * * @param parentNode the parent JSON node * @param fieldName the field name to retrieve * @return the child node * @throws RuntimeException if the field is not present */ private JsonNode getRequiredNode(JsonNode parentNode, String fieldName) { JsonNode node = parentNode.get(fieldName); if (node == null) { throw new RuntimeException(String.format(ERROR_MISSING_FIELD_TEMPLATE, fieldName)); } return node; } /** Close the HTTP client and release resources. Safe to call multiple times. */ @Override public void close() { if (httpClient != null) { try { httpClient.close(); } catch (IOException e) { // Ignore close exception as HttpClient is being shut down anyway log.debug("Failed to close HTTP client, ignoring", e); } } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/metalake/gravitino/GravitinoTableSchemaConvertor.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.metalake.gravitino; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode; import org.apache.seatunnel.api.metalake.MetaLakeTableSchemaConvertor; import org.apache.seatunnel.api.table.catalog.CatalogTable; import org.apache.seatunnel.api.table.catalog.Column; import org.apache.seatunnel.api.table.catalog.ConstraintKey; import org.apache.seatunnel.api.table.catalog.PhysicalColumn; import org.apache.seatunnel.api.table.catalog.PrimaryKey; import org.apache.seatunnel.api.table.catalog.TableIdentifier; import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.api.table.catalog.TableSchema; import org.apache.seatunnel.api.table.type.ArrayType; import org.apache.seatunnel.api.table.type.BasicType; import org.apache.seatunnel.api.table.type.DecimalType; import org.apache.seatunnel.api.table.type.LocalTimeType; import org.apache.seatunnel.api.table.type.MapType; import org.apache.seatunnel.api.table.type.PrimitiveByteArrayType; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; import org.apache.seatunnel.common.constants.MetaLakeType; import org.apache.seatunnel.common.exception.CommonError; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Converter for transforming Gravitino table metadata into SeaTunnel CatalogTable format. * *

    Reference documentation: * *

    */ public class GravitinoTableSchemaConvertor implements MetaLakeTableSchemaConvertor { private static final Pattern DECIMAL_PATTERN = Pattern.compile( "decimal\\s*\\(\\s*(\\d+)\\s*,\\s*(\\d+)\\s*\\)", Pattern.CASE_INSENSITIVE); private static final Pattern VARCHAR_PATTERN = Pattern.compile("varchar\\s*\\(\\s*(\\d+)\\s*\\)", Pattern.CASE_INSENSITIVE); private static final Pattern CHAR_PATTERN = Pattern.compile("char\\s*\\(\\s*(\\d+)\\s*\\)", Pattern.CASE_INSENSITIVE); private static final Pattern FIXED_PATTERN = Pattern.compile("fixed\\s*\\(\\s*(\\d+)\\s*\\)", Pattern.CASE_INSENSITIVE); private static final Pattern TIMESTAMP_PATTERN = Pattern.compile("timestamp(_tz)?\\s*\\(\\s*(\\d+)\\s*\\)", Pattern.CASE_INSENSITIVE); // JSON field names private static final String COLUMNS = "columns"; private static final String INDEXES = "indexes"; private static final String NAME = "name"; private static final String TYPE = "type"; private static final String NULLABLE = "nullable"; private static final String INDEX_TYPE = "indexType"; private static final String FIELD_NAMES = "fieldNames"; // Complex type field names private static final String ELEMENT_TYPE = "elementType"; private static final String KEY_TYPE = "keyType"; private static final String VALUE_TYPE = "valueType"; private static final String FIELDS = "fields"; // index type private static final String PRIMARY_KEY = "PRIMARY_KEY"; private static final String UNIQUE_KEY = "UNIQUE_KEY"; @Override public TableSchema convertor(JsonNode metaInfo) { List columns = new ArrayList<>(); PrimaryKey primaryKey = null; List constraintKeys = new ArrayList<>(); // Parse columns JsonNode columnsNode = metaInfo.get(COLUMNS); if (columnsNode != null && columnsNode.isArray()) { if (columnsNode.isEmpty()) { throw CommonError.illegalArgument( "columns", "GravitinoTableSchemaConvertor.convertor"); } for (JsonNode columnNode : columnsNode) { columns.add(parseColumn(columnNode)); } } // Parse indexes JsonNode indexesNode = 
metaInfo.get(INDEXES); if (indexesNode != null && indexesNode.isArray()) { for (JsonNode indexNode : indexesNode) { String indexType = getTextValue(indexNode, INDEX_TYPE); if (PRIMARY_KEY.equalsIgnoreCase(indexType)) { primaryKey = parsePrimaryKey(indexNode); } else if (UNIQUE_KEY.equalsIgnoreCase(indexType)) { constraintKeys.add(parseUniqueKey(indexNode)); } } } // Build table schema TableSchema.Builder schemaBuilder = TableSchema.builder().columns(columns); if (primaryKey != null) { schemaBuilder.primaryKey(primaryKey); } if (!constraintKeys.isEmpty()) { schemaBuilder.constraintKey(constraintKeys); } return schemaBuilder.build(); } @Override public CatalogTable buildCatalogTable( String catalogName, TablePath tablePath, TableSchema tableSchema) { TableIdentifier tableIdentifier = TableIdentifier.of(catalogName, tablePath); // Build catalog table return CatalogTable.of( tableIdentifier, tableSchema, new HashMap<>(), new ArrayList<>(), null, catalogName); } /** Parse a column node from Gravitino JSON. */ private Column parseColumn(JsonNode columnNode) { String name = getTextValue(columnNode, NAME); boolean nullable = columnNode.has(NULLABLE) && columnNode.get(NULLABLE).asBoolean(); JsonNode typeNode = columnNode.get(TYPE); if (typeNode == null) { throw CommonError.convertToSeaTunnelTypeError( MetaLakeType.GRAVITINO.getType(), "null", name); } SeaTunnelDataType dataType = convertGravitinoType(name, typeNode); // Extract column length and scale from type string // Returns null if the type doesn't support length/scale specification Long columnLength = null; Integer scale = null; if (typeNode.isTextual()) { Pair result = extractLengthAndScale(typeNode.asText()); if (result != null) { columnLength = result.getLeft(); scale = result.getRight(); } } return PhysicalColumn.builder() .name(name) .dataType(dataType) .columnLength(columnLength) .scale(scale) .nullable(nullable) .build(); } /** * Convert Gravitino type to SeaTunnel DataType. 
* * @param fieldName the field name for error reporting * @param typeNode the JSON node representing the type (string or object) * @return the corresponding SeaTunnel data type */ private SeaTunnelDataType convertGravitinoType(String fieldName, JsonNode typeNode) { if (typeNode.isObject()) { // Handle complex type (JSON object): list, map, struct, external, etc. return convertComplexType(fieldName, typeNode); } else if (typeNode.isTextual()) { // Handle simple type (string): boolean, int, string, etc. return convertSimpleType(fieldName, typeNode); } else { // Invalid type: neither Object nor Textual throw CommonError.convertToSeaTunnelTypeError( MetaLakeType.GRAVITINO.getType(), typeNode.toString(), fieldName); } } /** Convert complex type (JSON object with type field). */ private SeaTunnelDataType convertComplexType(String fieldName, JsonNode typeNode) { JsonNode typeField = typeNode.get(TYPE); if (typeField == null || !typeField.isTextual()) { throw CommonError.convertToSeaTunnelTypeError( MetaLakeType.GRAVITINO.getType(), typeNode.toString(), fieldName); } String type = typeField.asText().toLowerCase(); switch (type) { case "list": JsonNode elementType = typeNode.get(ELEMENT_TYPE); if (elementType == null) { throw CommonError.convertToSeaTunnelTypeError( MetaLakeType.GRAVITINO.getType(), "list without elementType", fieldName); } return ArrayType.of(convertGravitinoType(fieldName, elementType)); case "map": JsonNode keyType = typeNode.get(KEY_TYPE); JsonNode valueType = typeNode.get(VALUE_TYPE); if (keyType == null || valueType == null) { throw CommonError.convertToSeaTunnelTypeError( MetaLakeType.GRAVITINO.getType(), "map without keyType or valueType", fieldName); } return new MapType<>( convertGravitinoType(fieldName, keyType), convertGravitinoType(fieldName, valueType)); case "struct": JsonNode fields = typeNode.get(FIELDS); if (fields == null || !fields.isArray()) { throw CommonError.convertToSeaTunnelTypeError( MetaLakeType.GRAVITINO.getType(), "struct 
without fields array", fieldName); } List fieldNames = new ArrayList<>(); List> fieldTypes = new ArrayList<>(); for (JsonNode field : fields) { String fName = getTextValue(field, NAME); if (fName == null) { throw CommonError.convertToSeaTunnelTypeError( MetaLakeType.GRAVITINO.getType(), "struct field without name", fieldName); } JsonNode fType = field.get(TYPE); if (fType == null) { throw CommonError.convertToSeaTunnelTypeError( MetaLakeType.GRAVITINO.getType(), "struct field '" + fName + "' without type", fieldName); } fieldNames.add(fName); fieldTypes.add(convertGravitinoType(fieldName + "." + fName, fType)); } return new SeaTunnelRowType( fieldNames.toArray(new String[0]), fieldTypes.toArray(new SeaTunnelDataType[0])); case "external": // External types like PostgreSQL jsonb are treated as string return BasicType.STRING_TYPE; case "union": default: throw CommonError.convertToSeaTunnelTypeError( MetaLakeType.GRAVITINO.getType(), type, fieldName); } } /** Convert simple type (string like "boolean", "integer", "decimal(10,2)", etc.). 
*/ private SeaTunnelDataType convertSimpleType(String fieldName, JsonNode typeNode) { String gravitinoType = typeNode.asText(); String normalizedType = gravitinoType.trim().toLowerCase(); // Remove parameters for simple type matching String baseType = normalizedType.split("\\(")[0].trim(); // Handle decimal type: decimal(precision, scale) - only match regex for decimal type if ("decimal".equals(baseType)) { Matcher decimalMatcher = DECIMAL_PATTERN.matcher(gravitinoType); if (decimalMatcher.find()) { int precision = Integer.parseInt(decimalMatcher.group(1)); int scale = Integer.parseInt(decimalMatcher.group(2)); return new DecimalType(precision, scale); } // decimal without parameters or invalid format, throw error throw CommonError.convertToSeaTunnelTypeError( MetaLakeType.GRAVITINO.getType(), gravitinoType, fieldName); } // Remove 'unsigned' suffix to simplify type matching String cleanType = baseType.replaceAll("unsigned", "").trim(); switch (cleanType) { case "boolean": return BasicType.BOOLEAN_TYPE; case "byte": return BasicType.BYTE_TYPE; case "short": return BasicType.SHORT_TYPE; case "integer": return BasicType.INT_TYPE; case "long": return BasicType.LONG_TYPE; case "float": return BasicType.FLOAT_TYPE; case "double": return BasicType.DOUBLE_TYPE; case "string": case "varchar": case "char": case "uuid": case "interval_year": case "interval_day": return BasicType.STRING_TYPE; case "date": return LocalTimeType.LOCAL_DATE_TYPE; case "time": return LocalTimeType.LOCAL_TIME_TYPE; case "timestamp": return LocalTimeType.LOCAL_DATE_TIME_TYPE; case "timestamp_tz": return LocalTimeType.OFFSET_DATE_TIME_TYPE; case "binary": case "fixed": return PrimitiveByteArrayType.INSTANCE; default: throw CommonError.convertToSeaTunnelTypeError( MetaLakeType.GRAVITINO.getType(), baseType, fieldName); } } /** * Extract column length and scale from type string. * *

    Supports extracting: * *

      *
    • Length: varchar(n), char(n), fixed(n), timestamp(n), timestamp_tz(n), time(n) *
    • Scale: decimal(precision,scale) - returns scale, precision can be obtained via * DecimalType *
    * * @param type the type string (e.g., "varchar(255)", "decimal(10,2)", "timestamp(6)") * @return a Pair where left is length (Long) and right is scale (Integer), or null if neither * exists */ private Pair extractLengthAndScale(String type) { // Extract base type before the parenthesis String baseType = type.split("\\(")[0].trim().toLowerCase(); // Remove 'unsigned' suffix for type matching String cleanType = baseType.replaceAll("unsigned", "").trim(); switch (cleanType) { case "decimal": Matcher decimalMatcher = DECIMAL_PATTERN.matcher(type); if (decimalMatcher.find()) { return Pair.of( Long.parseLong(decimalMatcher.group(1)), Integer.parseInt(decimalMatcher.group(2))); } break; case "varchar": Matcher varcharMatcher = VARCHAR_PATTERN.matcher(type); if (varcharMatcher.find()) { return Pair.of(Long.parseLong(varcharMatcher.group(1)), null); } break; case "char": Matcher charMatcher = CHAR_PATTERN.matcher(type); if (charMatcher.find()) { return Pair.of(Long.parseLong(charMatcher.group(1)), null); } break; case "fixed": Matcher fixedMatcher = FIXED_PATTERN.matcher(type); if (fixedMatcher.find()) { return Pair.of(Long.parseLong(fixedMatcher.group(1)), null); } break; case "timestamp": case "timestamp_tz": Matcher timestampMatcher = TIMESTAMP_PATTERN.matcher(type); if (timestampMatcher.find()) { return Pair.of(Long.parseLong(timestampMatcher.group(2)), null); } break; default: // Types not supporting length/scale parameters break; } return null; } /** Parse primary key from index node. 
*/ private PrimaryKey parsePrimaryKey(JsonNode indexNode) { String indexName = getTextValue(indexNode, NAME); List columnNames = new ArrayList<>(); JsonNode fieldNamesNode = indexNode.get(FIELD_NAMES); if (fieldNamesNode != null && fieldNamesNode.isArray()) { for (JsonNode fieldNameArray : fieldNamesNode) { if (fieldNameArray.isArray() && !fieldNameArray.isEmpty()) { columnNames.add(fieldNameArray.get(0).asText()); } } } return PrimaryKey.of(indexName, columnNames); } /** Parse unique key from index node. */ private ConstraintKey parseUniqueKey(JsonNode indexNode) { String indexName = getTextValue(indexNode, NAME); List columns = new ArrayList<>(); JsonNode fieldNamesNode = indexNode.get(FIELD_NAMES); if (fieldNamesNode != null && fieldNamesNode.isArray()) { for (JsonNode fieldNameArray : fieldNamesNode) { if (fieldNameArray.isArray() && !fieldNameArray.isEmpty()) { String columnName = fieldNameArray.get(0).asText(); columns.add( ConstraintKey.ConstraintKeyColumn.of( columnName, ConstraintKey.ColumnSortType.ASC)); } } } return ConstraintKey.of(ConstraintKey.ConstraintType.UNIQUE_KEY, indexName, columns); } /** Get text value from JSON node field. */ private String getTextValue(JsonNode node, String fieldName) { JsonNode fieldNode = node.get(fieldName); return fieldNode != null ? fieldNode.asText() : null; } /** Simple immutable pair class to avoid coupling with scala.Tuple2 or Apache Commons Pair. 
*/ private static class Pair { private final L left; private final R right; private Pair(L left, R right) { this.left = left; this.right = right; } public static Pair of(L left, R right) { return new Pair<>(left, right); } public L getLeft() { return left; } public R getRight() { return right; } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/options/ConnectorCommonOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.options; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; import org.apache.seatunnel.api.options.table.CatalogOptions; import org.apache.seatunnel.api.options.table.ColumnOptions; import org.apache.seatunnel.api.options.table.ConstraintKeyOptions; import org.apache.seatunnel.api.options.table.FieldOptions; import org.apache.seatunnel.api.options.table.FormatOptions; import org.apache.seatunnel.api.options.table.PrimaryKeyOptions; import org.apache.seatunnel.api.options.table.TableIdentifierOptions; import org.apache.seatunnel.api.options.table.TableSchemaOptions; import java.io.Serializable; import java.util.List; public class ConnectorCommonOptions implements CatalogOptions, TableSchemaOptions, TableIdentifierOptions, FieldOptions, ColumnOptions, PrimaryKeyOptions, ConstraintKeyOptions, FormatOptions, Serializable { public static Option PLUGIN_NAME = Options.key("plugin_name") .stringType() .noDefaultValue() .withDescription("Name of the SPI plugin class."); public static Option PLUGIN_OUTPUT = Options.key("plugin_output") .stringType() .noDefaultValue() .withFallbackKeys("result_table_name") .withDescription( "When plugin_output is not specified, " + "the data processed by this plugin will not be registered as a data set (dataStream/dataset) " + "that can be directly accessed by other plugins, or called a temporary table (table)" + "When plugin_output is specified, " + "the data processed by this plugin will be registered as a data set (dataStream/dataset) " + "that can be directly accessed by other plugins, or called a temporary table (table) . 
" + "The data set (dataStream/dataset) registered here can be directly accessed by other plugins " + "by specifying plugin_input ."); public static Option> PLUGIN_INPUT = Options.key("plugin_input") .listType() .noDefaultValue() .withFallbackKeys("source_table_name") .withDescription( "When plugin_input is not specified, " + "the current plug-in processes the data set dataset output by the previous plugin in the configuration file. " + "When plugin_input is specified, the current plug-in is processing the data set corresponding to this parameter."); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/options/EnvCommonOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.options; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; import org.apache.seatunnel.api.sink.SaveModeExecuteLocation; import org.apache.seatunnel.common.constants.JobMode; import org.apache.seatunnel.common.constants.MetaLakeType; import java.util.Map; public class EnvCommonOptions { public static Option PARALLELISM = Options.key("parallelism") .intType() .defaultValue(1) .withDescription( "When parallelism is not specified in connector, the parallelism in env is used by default. " + "When parallelism is specified, it will override the parallelism in env."); public static Option JOB_NAME = Options.key("job.name") .stringType() .defaultValue("SeaTunnel_Job") .withDescription("The job name of this job"); public static Option JOB_MODE = Options.key("job.mode") .enumType(JobMode.class) .defaultValue(JobMode.BATCH) .withDescription("The job mode of this job, support Batch and Stream"); public static Option JOB_RETRY_TIMES = Options.key("job.retry.times") .intType() .defaultValue(3) .withDescription("The retry times of this job"); public static Option JOB_RETRY_INTERVAL_SECONDS = Options.key("job.retry.interval.seconds") .intType() .defaultValue(3) .withDescription("The retry interval seconds of this job"); public static Option CHECKPOINT_INTERVAL = Options.key("checkpoint.interval") .longType() .noDefaultValue() .withDescription( "The interval (in milliseconds) between two consecutive checkpoints."); public static Option READ_LIMIT_ROW_PER_SECOND = Options.key("read_limit.rows_per_second") .intType() .noDefaultValue() .withDescription( "The each parallelism row limit per second for read data from source."); public static Option READ_LIMIT_BYTES_PER_SECOND = Options.key("read_limit.bytes_per_second") .intType() .noDefaultValue() .withDescription( "The each parallelism bytes limit per second for read data from source."); public static Option CHECKPOINT_TIMEOUT = 
Options.key("checkpoint.timeout") .longType() .noDefaultValue() .withDescription("The timeout (in milliseconds) for a checkpoint."); public static Option CHECKPOINT_MIN_PAUSE = Options.key("min-pause") .intType() .defaultValue(-1) .withDescription( "The minimum pause (in milliseconds) between consecutive checkpoints. " + "This ensures that checkpoints are not triggered too frequently and provides."); public static Option SAVEMODE_EXECUTE_LOCATION = Options.key("savemode.execute.location") .enumType(SaveModeExecuteLocation.class) .defaultValue(SaveModeExecuteLocation.CLUSTER) .withDescription("The location of save mode execute."); public static Option JARS = Options.key("jars") .stringType() .noDefaultValue() .withDescription("third-party packages can be loaded via `jars`"); public static Option> CUSTOM_PARAMETERS = Options.key("custom_parameters") .mapType() .noDefaultValue() .withDescription("custom parameters for run engine"); public static Option> NODE_TAG_FILTER = Options.key("tag_filter") .mapType() .noDefaultValue() .withDescription("Define the worker where the job runs by tag"); public static Option METALAKE_ENABLED = Options.key("metalake_enabled") .booleanType() .defaultValue(false) .withDescription("Turn on metadata lake"); public static Option METALAKE_TYPE = Options.key("metalake_type") .enumType(MetaLakeType.class) .defaultValue(MetaLakeType.GRAVITINO) .withDescription("Metadata lake type, for example: gravitino"); public static Option METALAKE_URL = Options.key("metalake_url") .stringType() .noDefaultValue() .withDescription( "The http path of the metadata lake, for example: http://localhost:8090/api/metalakes/laowang_test/catalogs/"); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/options/EnvOptionRule.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.options; import org.apache.seatunnel.api.configuration.util.OptionRule; import org.apache.seatunnel.api.table.factory.Factory; import com.google.auto.service.AutoService; @AutoService(Factory.class) public class EnvOptionRule implements Factory { @Override public String factoryIdentifier() { return "EnvOptionRule"; } @Override public OptionRule optionRule() { return OptionRule.builder() .required(EnvCommonOptions.JOB_MODE) .optional( EnvCommonOptions.JOB_NAME, EnvCommonOptions.PARALLELISM, EnvCommonOptions.JOB_RETRY_TIMES, EnvCommonOptions.JOB_RETRY_INTERVAL_SECONDS, EnvCommonOptions.JARS, EnvCommonOptions.CHECKPOINT_INTERVAL, EnvCommonOptions.CHECKPOINT_TIMEOUT, EnvCommonOptions.CHECKPOINT_MIN_PAUSE, EnvCommonOptions.READ_LIMIT_ROW_PER_SECOND, EnvCommonOptions.READ_LIMIT_BYTES_PER_SECOND, EnvCommonOptions.SAVEMODE_EXECUTE_LOCATION, EnvCommonOptions.CUSTOM_PARAMETERS, EnvCommonOptions.NODE_TAG_FILTER) .build(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/options/SinkConnectorCommonOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.options; import org.apache.seatunnel.api.annotation.Experimental; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; public class SinkConnectorCommonOptions extends ConnectorCommonOptions { @Experimental public static Option MULTI_TABLE_SINK_REPLICA = Options.key("multi_table_sink_replica") .intType() .defaultValue(1) .withDescription("The replica number of multi table sink writer"); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/options/SourceConnectorCommonOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.options; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; import org.apache.seatunnel.api.env.ParsingMode; public class SourceConnectorCommonOptions extends ConnectorCommonOptions { public static Option DAG_PARSING_MODE = Options.key("dag-parsing.mode") .enumType(ParsingMode.class) .defaultValue(ParsingMode.SINGLENESS) .withDescription("Whether to enable parsing support for multi-table source"); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/options/table/CatalogOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.options.table; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; import java.util.List; import java.util.Map; public interface CatalogOptions { @Deprecated Option> CATALOG_OPTIONS = Options.key("catalog") .mapType() .noDefaultValue() .withDescription("configuration options for the catalog."); Option CATALOG_NAME = Options.key("name").stringType().noDefaultValue().withDescription("catalog name"); Option> TABLE_NAMES = Options.key("table-names") .listType() .noDefaultValue() .withDescription( "List of table names of databases to capture." + "The table name needs to include the database name, for example: database_name.table_name"); Option DATABASE_PATTERN = Options.key("database-pattern") .stringType() .defaultValue(".*") .withDescription("The database names RegEx of the database to capture."); Option TABLE_PATTERN = Options.key("table-pattern") .stringType() .noDefaultValue() .withDescription( "The table names RegEx of the database to capture." + "The table name needs to include the database name, for example: database_.*\\.table_.*"); /** * This parameter is deprecated, please use parameter: TableSchemaOptions.TABLE_CONFIGS. {@link * org.apache.seatunnel.api.options.table.TableSchemaOptions} */ @Deprecated Option>> TABLE_LIST = Options.key("table_list") .type(new TypeReference>>() {}) .noDefaultValue() .withDescription( "This parameter is deprecated, please use parameter: TableSchemaOptions.TABLE_CONFIGS. SeaTunnel Multi Table Schema, acts on structured and unstructured data sources. 
" + "such as jdbc, paimon, doris, etc"); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/options/table/ColumnOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.options.table; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; import java.util.List; import java.util.Map; public interface ColumnOptions { // todo: how to define List> Option>> COLUMNS = Options.key("columns") .type(new TypeReference>>() {}) .noDefaultValue() .withDescription("SeaTunnel Schema Columns"); Option COLUMN_NAME = Options.key("name") .stringType() .noDefaultValue() .withDescription("SeaTunnel Schema Column Name"); Option TYPE = Options.key("type") .stringType() .noDefaultValue() .withDescription("SeaTunnel Schema Column Type"); Option COLUMN_SCALE = Options.key("columnScale") .intType() .noDefaultValue() .withDescription("SeaTunnel Schema Column scale"); Option COLUMN_LENGTH = Options.key("columnLength") .longType() .defaultValue(0L) .withDescription("SeaTunnel Schema Column Length"); Option NULLABLE = Options.key("nullable") .booleanType() .defaultValue(true) .withDescription("SeaTunnel Schema Column Nullable"); Option DEFAULT_VALUE = Options.key("defaultValue") .objectType(Object.class) .noDefaultValue() .withDescription("SeaTunnel Schema Column Default Value"); Option COLUMN_COMMENT = Options.key("comment") .stringType() .noDefaultValue() .withDescription("SeaTunnel Schema Column Comment"); Option SCHEMA_URL = Options.key("schema_url") .stringType() .noDefaultValue() .withDescription( "The http path of the schema, for example: http://localhost:8090/api/metalakes/laowang_test/catalogs/221-pgsql/schemas/ykw/tables/all_type"); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/options/table/ConstraintKeyOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.options.table; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; import org.apache.seatunnel.api.table.catalog.ConstraintKey; import java.util.List; import java.util.Map; public interface ConstraintKeyOptions { Option>> CONSTRAINT_KEYS = Options.key("constraintKeys") .type(new TypeReference>>() {}) .noDefaultValue() .withDescription( "SeaTunnel Schema Constraint Keys. e.g. [{name: \"xx_index\", type: \"KEY\", columnKeys: [{columnName: \"name\", sortType: \"ASC\"}]}]"); Option CONSTRAINT_KEY_NAME = Options.key("constraintName") .stringType() .noDefaultValue() .withDescription("SeaTunnel Schema Constraint Key Name"); Option CONSTRAINT_KEY_TYPE = Options.key("constraintType") .enumType(ConstraintKey.ConstraintType.class) .noDefaultValue() .withDescription( "SeaTunnel Schema Constraint Key Type, e.g. KEY, UNIQUE_KEY, FOREIGN_KEY"); Option>> CONSTRAINT_KEY_COLUMNS = Options.key("constraintColumns") .type(new TypeReference>>() {}) .noDefaultValue() .withDescription( "SeaTunnel Schema Constraint Key Columns. e.g. 
[{columnName: \"name\", sortType: \"ASC\"}]"); Option CONSTRAINT_KEY_COLUMN_NAME = Options.key("columnName") .stringType() .noDefaultValue() .withDescription("SeaTunnel Schema Constraint Key Column Name"); Option CONSTRAINT_KEY_COLUMN_SORT_TYPE = Options.key("sortType") .enumType(ConstraintKey.ColumnSortType.class) .defaultValue(ConstraintKey.ColumnSortType.ASC) .withDescription( "SeaTunnel Schema Constraint Key Column Sort Type, e.g. ASC, DESC"); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/options/table/FieldOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.options.table; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; import java.util.Map; // We should use ColumnOptions instead of FieldOptions @Deprecated public interface FieldOptions { Option> FIELDS = Options.key("schema.fields") .type(new TypeReference>() {}) .noDefaultValue() .withDescription("SeaTunnel Schema Fields"); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/options/table/FormatOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.options.table; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; import org.apache.seatunnel.common.utils.DateTimeUtils; import org.apache.seatunnel.common.utils.DateUtils; import org.apache.seatunnel.common.utils.TimeUtils; public interface FormatOptions { Option DATE_FORMAT_LEGACY = Options.key("date_format") .enumType(DateUtils.Formatter.class) .defaultValue(DateUtils.Formatter.YYYY_MM_DD) .withDescription("Date format"); Option DATETIME_FORMAT_LEGACY = Options.key("datetime_format") .enumType(DateTimeUtils.Formatter.class) .defaultValue(DateTimeUtils.Formatter.YYYY_MM_DD_HH_MM_SS) .withDescription("Datetime format"); Option TIME_FORMAT_LEGACY = Options.key("time_format") .enumType(TimeUtils.Formatter.class) .defaultValue(TimeUtils.Formatter.HH_MM_SS) .withDescription("Time format"); // Not used yet. Reserved for future use to support custom date/time format strings. Option DATE_FORMAT = Options.key("date_format") .stringType() .defaultValue("yyyy-MM-dd") .withDescription( "Date format string (e.g. 'yyyy-MM-dd'). " + "Must match one of the predefined values in the Formatter enum."); Option DATETIME_FORMAT = Options.key("datetime_format") .stringType() .defaultValue("yyyy-MM-dd HH:mm:ss") .withDescription( "Datetime format string (e.g. 'yyyy-MM-dd HH:mm:ss'). " + "Must match one of the predefined values in the Formatter enum."); // Not used yet. Reserved for future use to support custom date/time format strings. Option TIME_FORMAT = Options.key("time_format") .stringType() .defaultValue("HH:mm:ss") .withDescription( "Time format string (e.g. 'HH:mm:ss'). 
" + "Must match one of the predefined values in the Formatter enum."); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/options/table/PrimaryKeyOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.options.table; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; import java.util.List; import java.util.Map; public interface PrimaryKeyOptions { Option> PRIMARY_KEY = Options.key("primaryKey") .type(new TypeReference>() {}) .noDefaultValue() .withDescription("SeaTunnel Schema Fields"); Option PRIMARY_KEY_NAME = Options.key("name") .stringType() .noDefaultValue() .withDescription("SeaTunnel Schema Primary Key Name"); Option> PRIMARY_KEY_COLUMNS = Options.key("columnNames") .listType() .noDefaultValue() .withDescription("SeaTunnel Schema Primary Key Columns"); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/options/table/TableIdentifierOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.options.table; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; import java.util.List; public interface TableIdentifierOptions { Option SCHEMA_FIRST = Options.key("schema_first") .booleanType() .defaultValue(false) .withDescription("Parse Schema First from table"); Option TABLE = Options.key("table") .stringType() .noDefaultValue() .withDescription("SeaTunnel Schema Full Table Name"); Option TABLE_COMMENT = Options.key("comment") .stringType() .noDefaultValue() .withDescription("SeaTunnel Schema Table Comment"); Option DATABASE_NAME = Options.key("database_name") .stringType() .noDefaultValue() .withDescription("SeaTunnel Schema Database Name"); Option SCHEMA_NAME = Options.key("schema_name") .stringType() .noDefaultValue() .withDescription("SeaTunnel Schema Table Name"); Option TABLE_NAME = Options.key("table_name") .stringType() .noDefaultValue() .withDescription("SeaTunnel Schema Table Name"); Option> PARTITION_KEYS = Options.key("partition_keys") .listType() .noDefaultValue() .withDescription( "SeaTunnel Schema Partition Keys, used to specify partition keys for table creation"); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/options/table/TableSchemaOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.options.table; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; import org.apache.seatunnel.common.constants.MetaLakeType; import java.util.List; import java.util.Map; public interface TableSchemaOptions { Option> SCHEMA = Options.key("schema") .type(new TypeReference>() {}) .noDefaultValue() .withDescription("SeaTunnel Schema"); Option>> TABLE_CONFIGS = Options.key("tables_configs") .type(new TypeReference>>() {}) .noDefaultValue() .withDescription( "SeaTunnel Multi Table Schema, acts on structured and unstructured data sources. " + "such as file, assert, mongodb, jdbc, paimon, doris, etc"); Option METALAKE_TYPE = Options.key("metalake_type") .enumType(MetaLakeType.class) .defaultValue(MetaLakeType.GRAVITINO) .withDescription("Metadata lake type, for example: gravitino"); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/serialization/DefaultSerializer.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.serialization; import org.apache.seatunnel.common.utils.SerializationUtils; import java.io.IOException; import java.io.Serializable; public class DefaultSerializer implements Serializer { @Override public byte[] serialize(T obj) throws IOException { if (obj != null) { return SerializationUtils.serialize((Serializable) obj); } else { return null; } } @Override public T deserialize(byte[] serialized) throws IOException { if (serialized == null) { return null; } return SerializationUtils.deserialize(serialized); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/serialization/DeserializationSchema.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.serialization; import org.apache.seatunnel.api.source.Collector; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import java.io.IOException; import java.io.Serializable; public interface DeserializationSchema extends Serializable { /** * Deserializes the byte message. * * @param message The message, as a byte array. * @return The deserialized message as an SeaTunnel Row (null if the message cannot be * deserialized). */ T deserialize(byte[] message) throws IOException; default void deserialize(byte[] message, Collector out) throws IOException { T deserialize = deserialize(message); if (deserialize != null) { out.collect(deserialize); } } SeaTunnelDataType getProducedType(); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/serialization/SerializationSchema.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.serialization; import org.apache.seatunnel.api.table.type.SeaTunnelRow; import java.io.Serializable; public interface SerializationSchema extends Serializable { /** * Serializes the incoming element to a specified type. 
* * @param element The incoming element to be serialized * @return The serialized element. */ byte[] serialize(SeaTunnelRow element); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/serialization/Serializer.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.serialization; import java.io.IOException; public interface Serializer { /** * Serializes the given object. * * @param obj The object to serialize. * @return The serialized data (bytes). * @throws IOException Thrown, if the serialization fails. */ byte[] serialize(T obj) throws IOException; /** * De-serializes the given data (bytes). * * @param serialized The serialized data * @return The deserialized object * @throws IOException Thrown, if the deserialization fails. */ T deserialize(byte[] serialized) throws IOException; } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/DataSaveMode.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.sink; /** * The SaveMode for the Sink connectors that use table or other table structures to organize data */ public enum DataSaveMode { // Preserve database structure and delete data DROP_DATA, // Preserve database structure, preserve data APPEND_DATA, // User defined processing CUSTOM_PROCESSING, // When there exist data, an error will be reported ERROR_WHEN_DATA_EXISTS } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/DefaultSaveModeHandler.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.sink; import org.apache.seatunnel.api.table.catalog.Catalog; import org.apache.seatunnel.api.table.catalog.CatalogTable; import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; import lombok.extern.slf4j.Slf4j; import javax.annotation.Nonnull; import javax.annotation.Nullable; import java.util.Optional; import static org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode.SINK_TABLE_NOT_EXIST; import static org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode.SOURCE_ALREADY_HAS_DATA; @Slf4j public class DefaultSaveModeHandler implements SaveModeHandler { @Nonnull public SchemaSaveMode schemaSaveMode; @Nonnull public DataSaveMode dataSaveMode; @Nonnull public Catalog catalog; @Nonnull public TablePath tablePath; @Nullable public CatalogTable catalogTable; @Nullable public String customSql; private boolean isNewTableCreated = false; public DefaultSaveModeHandler( SchemaSaveMode schemaSaveMode, DataSaveMode dataSaveMode, Catalog catalog, CatalogTable catalogTable, String customSql) { this( schemaSaveMode, dataSaveMode, catalog, catalogTable.getTableId().toTablePath(), catalogTable, customSql); } public DefaultSaveModeHandler( SchemaSaveMode schemaSaveMode, DataSaveMode dataSaveMode, Catalog catalog, TablePath tablePath, CatalogTable catalogTable, String customSql) { this.schemaSaveMode = schemaSaveMode; this.dataSaveMode = dataSaveMode; this.catalog = catalog; this.tablePath = tablePath; this.catalogTable = catalogTable; this.customSql = customSql; } @Override public void open() { catalog.open(); } @Override public void handleSchemaSaveMode() { switch (schemaSaveMode) { case RECREATE_SCHEMA: recreateSchema(); break; case CREATE_SCHEMA_WHEN_NOT_EXIST: createSchemaWhenNotExist(); break; case ERROR_WHEN_SCHEMA_NOT_EXIST: errorWhenSchemaNotExist(); break; case 
IGNORE: break; default: throw new UnsupportedOperationException("Unsupported save mode: " + schemaSaveMode); } } @Override public void handleDataSaveMode() { switch (dataSaveMode) { case DROP_DATA: keepSchemaDropData(); break; case APPEND_DATA: keepSchemaAndData(); break; case CUSTOM_PROCESSING: customProcessing(); break; case ERROR_WHEN_DATA_EXISTS: errorWhenDataExists(); break; default: throw new UnsupportedOperationException("Unsupported save mode: " + dataSaveMode); } } @Override public void handleSchemaSaveModeWithRestore() { if (SchemaSaveMode.ERROR_WHEN_SCHEMA_NOT_EXIST == schemaSaveMode) { errorWhenSchemaNotExist(); } else if (SchemaSaveMode.CREATE_SCHEMA_WHEN_NOT_EXIST == schemaSaveMode || SchemaSaveMode.RECREATE_SCHEMA == schemaSaveMode) { createSchemaWhenNotExist(); } } protected void recreateSchema() { if (tableExists()) { dropTable(); } createTable(); } protected void createSchemaWhenNotExist() { if (!tableExists()) { createTable(); } } protected void errorWhenSchemaNotExist() { if (!tableExists()) { throw new SeaTunnelRuntimeException(SINK_TABLE_NOT_EXIST, "The sink table not exist"); } } protected void keepSchemaDropData() { if (tableExists() && !isNewTableCreated) { truncateTable(); } } protected void keepSchemaAndData() {} protected void customProcessing() { executeCustomSql(); } protected void errorWhenDataExists() { if (dataExists()) { throw new SeaTunnelRuntimeException( SOURCE_ALREADY_HAS_DATA, "The target data source already has data"); } } protected boolean tableExists() { return catalog.tableExists(tablePath); } protected void dropTable() { try { log.info( "Dropping table {} with action {}", tablePath, catalog.previewAction( Catalog.ActionType.DROP_TABLE, tablePath, Optional.empty())); } catch (UnsupportedOperationException ignore) { log.info("Dropping table {}", tablePath); } catalog.dropTable(tablePath, true); } protected void createTablePreCheck() { if (!catalog.databaseExists(tablePath.getDatabaseName())) { try { log.info( "Creating 
database {} with action {}", tablePath.getDatabaseName(), catalog.previewAction( Catalog.ActionType.CREATE_DATABASE, tablePath, Optional.empty())); } catch (UnsupportedOperationException ignore) { log.info("Creating database {}", tablePath.getDatabaseName()); } catalog.createDatabase(tablePath, true); } try { log.info( "Creating table {} with action {}", tablePath, catalog.previewAction( Catalog.ActionType.CREATE_TABLE, tablePath, Optional.ofNullable(catalogTable))); } catch (UnsupportedOperationException ignore) { log.info("Creating table {}", tablePath); } } protected void createTable() { createTablePreCheck(); catalog.createTable(tablePath, catalogTable, true); isNewTableCreated = true; } protected void truncateTable() { try { log.info( "Truncating table {} with action {}", tablePath, catalog.previewAction( Catalog.ActionType.TRUNCATE_TABLE, tablePath, Optional.empty())); } catch (UnsupportedOperationException ignore) { log.info("Truncating table {}", tablePath); } catalog.truncateTable(tablePath, true); } protected boolean dataExists() { return catalog.isExistsData(tablePath); } protected void executeCustomSql() { log.info("Executing custom SQL for table {} with SQL: {}", tablePath, customSql); catalog.executeSql(tablePath, customSql); } @Override public TablePath getHandleTablePath() { return tablePath; } @Override public Catalog getHandleCatalog() { return catalog; } @Override public SchemaSaveMode getSchemaSaveMode() { return schemaSaveMode; } @Override public DataSaveMode getDataSaveMode() { return dataSaveMode; } @Override public void close() throws Exception { catalog.close(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/DefaultSinkWriterContext.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.sink; import org.apache.seatunnel.api.common.metrics.AbstractMetricsContext; import org.apache.seatunnel.api.common.metrics.MetricsContext; import org.apache.seatunnel.api.event.DefaultEventProcessor; import org.apache.seatunnel.api.event.EventListener; /** The default {@link SinkWriter.Context} implement class. 
*/ public class DefaultSinkWriterContext implements SinkWriter.Context { private final int subtask; private final int numberOfParallelSubtasks; private final EventListener eventListener; public DefaultSinkWriterContext(int subtask, int parallelism) { this(subtask, parallelism, new DefaultEventProcessor()); } public DefaultSinkWriterContext(String jobId, int subtask, int parallelism) { this(subtask, parallelism, new DefaultEventProcessor(jobId)); } public DefaultSinkWriterContext( int subtask, int numberOfParallelSubtasks, EventListener eventListener) { this.subtask = subtask; this.numberOfParallelSubtasks = numberOfParallelSubtasks; this.eventListener = eventListener; } @Override public int getIndexOfSubtask() { return subtask; } public int getNumberOfParallelSubtasks() { return numberOfParallelSubtasks; } @Override public MetricsContext getMetricsContext() { // TODO Waiting for Flink and Spark to implement MetricsContext // https://github.com/apache/seatunnel/issues/3431 return new AbstractMetricsContext() {}; } @Override public EventListener getEventListener() { return eventListener; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/MultiTableResourceManager.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.sink; import java.util.Optional; /** The multi table resource manager */ public interface MultiTableResourceManager { default Optional getSharedResource() { return Optional.empty(); } default void close() {} } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SaveModeExecuteLocation.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.sink; public enum SaveModeExecuteLocation { @Deprecated CLIENT, CLUSTER } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SaveModeExecuteWrapper.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.sink; import lombok.extern.slf4j.Slf4j; @Slf4j public class SaveModeExecuteWrapper { public SaveModeExecuteWrapper(SaveModeHandler handler) { this.handler = handler; } public void execute() { log.info( "Executing save mode for table: {}, with SchemaSaveMode: {}, DataSaveMode: {} using Catalog: {}", handler.getHandleTablePath(), handler.getSchemaSaveMode(), handler.getDataSaveMode(), handler.getHandleCatalog().name()); handler.handleSaveMode(); } private final SaveModeHandler handler; } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SaveModeHandler.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
/**
 * Applies a {@link SchemaSaveMode} and a {@link DataSaveMode} to a sink table. The handler is
 * {@link AutoCloseable}.
 */
public interface SaveModeHandler extends AutoCloseable {

    /** Prepares the handler for use. */
    void open();

    /** Applies the schema save mode. */
    void handleSchemaSaveMode();

    /** Applies the data save mode. */
    void handleDataSaveMode();

    /** Applies the schema save mode in the restore case. */
    void handleSchemaSaveModeWithRestore();

    /** @return the schema save mode this handler applies. */
    SchemaSaveMode getSchemaSaveMode();

    /** @return the data save mode this handler applies. */
    DataSaveMode getDataSaveMode();

    /** @return the path of the table this handler works on. */
    TablePath getHandleTablePath();

    /** @return the catalog this handler works with. */
    Catalog getHandleCatalog();

    /** Applies the schema save mode first and the data save mode afterwards. */
    default void handleSaveMode() {
        handleSchemaSaveMode();
        handleDataSaveMode();
    }
}
/**
 * Placeholders of the form <code>${key}</code> that can appear in save-mode templates, each with a
 * human readable display name.
 */
public enum SaveModePlaceHolder {
    ROWTYPE_PRIMARY_KEY("rowtype_primary_key", "primary keys"),
    ROWTYPE_UNIQUE_KEY("rowtype_unique_key", "unique keys"),
    ROWTYPE_DUPLICATE_KEY("rowtype_duplicate_key", "duplicate keys"),
    ROWTYPE_FIELDS("rowtype_fields", "fields"),
    TABLE("table", "table"),
    DATABASE("database", "database"),
    COMMENT("comment", "comment"),
    /** @deprecated instead by {@link #TABLE} todo remove this enum */
    @Deprecated
    TABLE_NAME("table_name", "table name");

    /** Regex form of the placeholder, with <code>$</code>, <code>{</code> and <code>}</code> escaped. */
    private static final String REPLACE_PLACE_HOLDER = "\\$\\{%s\\}";

    /** Literal form of the placeholder. */
    private static final String PLACE_HOLDER = "${%s}";

    private final String keyValue;
    private final String display;

    SaveModePlaceHolder(String keyValue, String display) {
        this.keyValue = keyValue;
        this.display = display;
    }

    /**
     * Looks up the display name that belongs to a literal placeholder such as <code>${table}</code>.
     *
     * @param placeholder the literal placeholder text
     * @return the display name of the matching constant
     * @throws RuntimeException if no constant has this placeholder (including {@code null})
     */
    public static String getDisplay(String placeholder) {
        // Comparing from the constant's side keeps a null argument on the documented error path.
        Optional<SaveModePlaceHolder> match =
                Arrays.stream(SaveModePlaceHolder.values())
                        .filter(candidate -> candidate.getPlaceHolder().equals(placeholder))
                        .findFirst();
        return match.map(found -> found.display)
                .orElseThrow(
                        () ->
                                new RuntimeException(
                                        String.format(
                                                "Not support the placeholder: %s", placeholder)));
    }

    /** @return the bare key, e.g. {@code table}. */
    public String getPlaceHolderKey() {
        return this.keyValue;
    }

    /** @return the literal placeholder, e.g. <code>${table}</code>. */
    public String getPlaceHolder() {
        return String.format(PLACE_HOLDER, getPlaceHolderKey());
    }

    /** @return the placeholder as a regular expression that matches its literal form. */
    public String getReplacePlaceHolder() {
        return String.format(REPLACE_PLACE_HOLDER, getPlaceHolderKey());
    }
}
/** How the structure (schema) of the sink table is handled before writing starts. */
public enum SchemaSaveMode {

    /**
     * The table is created when it does not exist; when it already exists it is deleted and
     * rebuilt.
     */
    RECREATE_SCHEMA,

    /** The table is created when it does not exist; creation is skipped when it already exists. */
    CREATE_SCHEMA_WHEN_NOT_EXIST,

    /** An error is reported when the table does not exist. */
    ERROR_WHEN_SCHEMA_NOT_EXIST,

    /** Ignore creation. */
    IGNORE
}
*/ package org.apache.seatunnel.api.sink; import org.apache.seatunnel.api.common.PluginIdentifierInterface; import org.apache.seatunnel.api.common.SeaTunnelPluginLifeCycle; import org.apache.seatunnel.api.serialization.Serializer; import org.apache.seatunnel.api.source.SeaTunnelJobAware; import org.apache.seatunnel.api.table.catalog.CatalogTable; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; import java.io.IOException; import java.io.Serializable; import java.util.List; import java.util.Optional; /** * The SeaTunnel sink interface, developer should implement this class when create a sink connector. * * @param The data class by sink accept. Only support {@link * org.apache.seatunnel.api.table.type.SeaTunnelRow} at now. * @param The state should be saved when job execute, this class should implement interface * {@link Serializable}. * @param The commit message class return by {@link SinkWriter#prepareCommit()}, then * {@link SinkCommitter} or {@link SinkAggregatedCommitter} and handle it, this class should * implement interface {@link Serializable}. * @param The aggregated commit message class, combine by {@link * CommitInfoT}. {@link SinkAggregatedCommitter} handle it, this class should implement * interface {@link Serializable}. */ public interface SeaTunnelSink extends Serializable, PluginIdentifierInterface, SeaTunnelPluginLifeCycle, SeaTunnelJobAware { /** * Set the row type info of sink row data. This method will be automatically called by * translation. * * @deprecated instead by {@link org.apache.seatunnel.api.table.factory.Factory} * @param seaTunnelRowType The row type info of sink. */ @Deprecated default void setTypeInfo(SeaTunnelRowType seaTunnelRowType) { throw new UnsupportedOperationException("setTypeInfo method is not supported"); } /** * Get the data type of the records consumed by this sink. 
* * @deprecated instead by {@link org.apache.seatunnel.api.table.factory.Factory} * @return SeaTunnel data type. */ @Deprecated default SeaTunnelDataType getConsumedType() { throw new UnsupportedOperationException("getConsumedType method is not supported"); } /** * This method will be called to creat {@link SinkWriter} * * @param context The sink context * @return Return sink writer instance * @throws IOException throws IOException when createWriter failed. */ SinkWriter createWriter(SinkWriter.Context context) throws IOException; default SinkWriter restoreWriter( SinkWriter.Context context, List states) throws IOException { return createWriter(context); } /** * Get {@link StateT} serializer. So that {@link StateT} can be transferred across processes * * @return Serializer of {@link StateT} */ default Optional> getWriterStateSerializer() { return Optional.empty(); } /** * This method will be called to create {@link SinkCommitter} * * @return Return sink committer instance * @throws IOException throws IOException when createCommitter failed. */ default Optional> createCommitter() throws IOException { return Optional.empty(); } /** * Get {@link CommitInfoT} serializer. So that {@link CommitInfoT} can be transferred across * processes * * @return Serializer of {@link CommitInfoT} */ default Optional> getCommitInfoSerializer() { return Optional.empty(); } /** * This method will be called to create {@link SinkAggregatedCommitter} * * @return Return sink aggregated committer instance * @throws IOException throws IOException when createAggregatedCommitter failed. */ default Optional> createAggregatedCommitter() throws IOException { return Optional.empty(); } /** * Get {@link AggregatedCommitInfoT} serializer. So that {@link AggregatedCommitInfoT} can be * transferred across processes * * @return Serializer of {@link AggregatedCommitInfoT} */ default Optional> getAggregatedCommitInfoSerializer() { return Optional.empty(); } /** * Get the catalog table of the sink. 
* * @return Optional of catalog table. */ default Optional getWriteCatalogTable() { return Optional.empty(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SinkAggregatedCommitter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.sink; import java.io.IOException; import java.io.Serializable; import java.util.List; /** * The committer combine taskManager/Worker Commit message. Then commit it uses {@link * SinkAggregatedCommitter#commit(List)}. This class will execute in single thread. * *

    See Also {@link SinkCommitter} * * @param The type of commit message. * @param The type of commit message after combine. */ public interface SinkAggregatedCommitter extends Serializable { /** * init sink aggregated committer, this method will be called not once. Each retry will call * this. */ default void init() {}; /** Re-commit message to third party data receiver, The method need to achieve idempotency. */ default List restoreCommit( List aggregatedCommitInfo) throws IOException { return commit(aggregatedCommitInfo); } /** * Commit message to third party data receiver, The method need to achieve idempotency. * * @param aggregatedCommitInfo The list of combine commit message. * @return The commit message which need retry. * @throws IOException throw IOException when commit failed. */ List commit(List aggregatedCommitInfo) throws IOException; /** * The logic about how to combine commit message. * * @param commitInfos The list of commit message. * @return The commit message after combine. */ AggregatedCommitInfoT combine(List commitInfos); /** * If {@link #commit(List)} failed, this method will be called (**Only** on Spark engine at * now). * * @param aggregatedCommitInfo The list of combine commit message. * @throws Exception throw Exception when abort failed. */ void abort(List aggregatedCommitInfo) throws Exception; /** * Close this resource. * * @throws IOException throw IOException when close failed. */ void close() throws IOException; } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SinkCommitter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.sink; import java.io.IOException; import java.io.Serializable; import java.util.List; /** * The committer to commit message. We strongly recommend implementing {@link * SinkAggregatedCommitter} first, as the current version of {@link SinkAggregatedCommitter} can * provide more consistent behavior. * *

    See Also {@link SinkAggregatedCommitter} * * @param The type of commit message. */ public interface SinkCommitter extends Serializable { /** * Commit message to third party data receiver, The method need to achieve idempotency. * * @param commitInfos The list of commit message * @return The commit message need retry. * @throws IOException throw IOException when commit failed. */ List commit(List commitInfos) throws IOException; /** * Abort the transaction, this method will be called (**Only** on Spark engine) when the commit * is failed. * * @param commitInfos The list of commit message, used to abort the commit. * @throws IOException throw IOException when close failed. */ void abort(List commitInfos) throws IOException; } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SinkWriter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.sink; import org.apache.seatunnel.api.common.metrics.MetricsContext; import org.apache.seatunnel.api.event.EventListener; import org.apache.seatunnel.api.table.schema.event.SchemaChangeEvent; import java.io.IOException; import java.io.Serializable; import java.util.Collections; import java.util.List; import java.util.Optional; /** * The sink writer use to write data to third party data receiver. This class will run on * taskManger/Worker. * * @param The data class by sink accept. Only support {@link * org.apache.seatunnel.api.table.type.SeaTunnelRow} at now. * @param The type of commit message. * @param The type of state. */ public interface SinkWriter { /** * write data to third party data receiver. * * @param element the data need be written. * @throws IOException throw IOException when write data failed. */ void write(T element) throws IOException; /** @deprecated instead by {@link SupportSchemaEvolutionSinkWriter} TODO: remove this method */ @Deprecated default void applySchemaChange(SchemaChangeEvent event) throws IOException {} /** * prepare the commit, will be called before {@link #snapshotState(long checkpointId)}. If you * need to use 2pc, you can return the commit info in this method, and receive the commit info * in {@link SinkCommitter#commit(List)}. If this method failed (by throw exception), **Only** * Spark engine will call {@link #abortPrepare()} * * @return the commit info need to commit */ @Deprecated Optional prepareCommit() throws IOException; /** * prepare the commit, will be called before {@link #snapshotState(long checkpointId)}. If you * need to use 2pc, you can return the commit info in this method, and receive the commit info * in {@link SinkCommitter#commit(List)}. 
If this method failed (by throw exception), **Only** * Spark engine will call {@link #abortPrepare()} * * @param checkpointId checkpointId * @return the commit info need to commit * @throws IOException If fail to prepareCommit */ default Optional prepareCommit(long checkpointId) throws IOException { return prepareCommit(); } /** * @return The writer's state. * @throws IOException if fail to snapshot writer's state. */ default List snapshotState(long checkpointId) throws IOException { return Collections.emptyList(); } /** * Used to abort the {@link #prepareCommit()}, if the prepareCommit failed, there is no * CommitInfoT, so the rollback work cannot be done by {@link SinkCommitter}. But we can use * this method to rollback side effects of {@link #prepareCommit()}. Only use it in Spark engine * at now. */ void abortPrepare(); /** * call it when SinkWriter close * * @throws IOException if close failed */ void close() throws IOException; interface Context extends Serializable { /** @return The index of this subtask. */ int getIndexOfSubtask(); /** @return parallelism of this writer. */ default int getNumberOfParallelSubtasks() { return 1; } /** @return metricsContext of this reader. */ MetricsContext getMetricsContext(); /** * Get the {@link EventListener} of this writer. * * @return */ EventListener getEventListener(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SupportMultiTableSink.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
/**
 * The Sink Connectors which support multi table should implement this interface.
 *
 * <p>This is a marker interface: it declares no methods.
 */
public interface SupportMultiTableSink {}
*/

package org.apache.seatunnel.api.sink;

/**
 * Implemented by sink aggregated committers that support multiple tables.
 *
 * <p>Adds no members of its own; everything it offers is inherited from {@link
 * SupportResourceShare}.
 */
public interface SupportMultiTableSinkAggregatedCommitter extends SupportResourceShare {}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SupportMultiTableSinkWriter.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.sink;

import java.util.Optional;

/**
 * Implemented by sink writers that support multiple tables.
 *
 * <p>Inherits the resource-sharing hooks of {@link SupportResourceShare} and adds an optional
 * primary-key hint.
 */
public interface SupportMultiTableSinkWriter extends SupportResourceShare {
    /**
     * The primary key index of the table in SeaTunnelRow, used to make sure rows with the same key
     * value are written to the same sink writer.
     *
     * @return the primary key index; the default implementation returns an empty {@link Optional}
     */
    default Optional primaryKey() {
        return Optional.empty();
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SupportResourceShare.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.
See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.sink;

/**
 * Hooks that let a component create a {@link MultiTableResourceManager} and receive one back.
 *
 * <p>Both methods have no-op defaults, so implementors only override what they need.
 */
public interface SupportResourceShare {

    /**
     * Creates the resource manager to be shared.
     *
     * @param tableSize number of tables (presumably the tables served by the multi-table sink —
     *     confirm against callers)
     * @param queueSize number of queues (presumably the writer queue count — confirm against
     *     callers)
     * @return the resource manager; the default implementation returns {@code null}, meaning
     *     nothing is shared
     */
    default MultiTableResourceManager initMultiTableResourceManager(
            int tableSize, int queueSize) {
        return null;
    }

    /**
     * Hands a previously created resource manager to this component. The default implementation
     * ignores it.
     *
     * @param multiTableResourceManager the resource manager to use
     * @param queueIndex index of the queue this component is bound to
     */
    default void setMultiTableResourceManager(
            MultiTableResourceManager multiTableResourceManager, int queueIndex) {}
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SupportSaveMode.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.seatunnel.api.sink;

import java.util.Optional;

/** Implemented by sink connectors that support schema and data save modes. */
public interface SupportSaveMode {

    /** Name of the data save mode key. */
    String DATA_SAVE_MODE_KEY = "data_save_mode";

    /** Name of the schema save mode key. */
    String SCHEMA_SAVE_MODE_KEY = "schema_save_mode";

    /**
     * Returns the handler that carries out this sink's save mode.
     *
     * @return the save mode handler, wrapped in an {@link Optional}
     */
    Optional getSaveModeHandler();
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SupportSchemaEvolutionSink.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.sink;

import org.apache.seatunnel.api.table.schema.SchemaChangeType;

import java.util.List;

/** Implemented by sink connectors that can report which schema changes they support. */
public interface SupportSchemaEvolutionSink {
    /**
     * Lists the schema evolution types this sink connector supports.
     *
     * @return the supported schema change types
     */
    List supports();
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SupportSchemaEvolutionSinkWriter.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.
See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.sink;

import org.apache.seatunnel.api.table.schema.event.SchemaChangeEvent;

import java.io.IOException;

/** Implemented by sink writers that can apply schema change events. */
public interface SupportSchemaEvolutionSinkWriter {
    /**
     * Applies a schema change to the third-party data receiver.
     *
     * @param event the schema change to apply
     * @throws IOException if the change cannot be applied
     */
    void applySchemaChange(SchemaChangeEvent event) throws IOException;
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/TablePlaceholder.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.sink; import java.util.HashSet; import java.util.Set; public enum TablePlaceholder { // Placeholder ${database_name} or${database_name:default_value} REPLACE_DATABASE_NAME_KEY("database_name"), // Placeholder ${schema_name} or${schema_name:default_value} REPLACE_SCHEMA_NAME_KEY("schema_name"), // Placeholder ${schema_full_name} or${schema_full_name:default_value} REPLACE_SCHEMA_FULL_NAME_KEY("schema_full_name"), // Placeholder ${table_name} or${table_name:default_value} REPLACE_TABLE_NAME_KEY("table_name"), // Placeholder ${table_full_name} or${table_full_name:default_value} REPLACE_TABLE_FULL_NAME_KEY("table_full_name"), // Placeholder ${primary_key} or${primary_key:default_value} REPLACE_PRIMARY_KEY("primary_key"), // Placeholder ${unique_key} or${unique_key:default_value} REPLACE_UNIQUE_KEY("unique_key"), // Placeholder ${field_names} or${field_names:default_value} REPLACE_FIELD_NAMES_KEY("field_names"), // Placeholder ${partition_keys} or${partition_keys:default_value} REPLACE_PARTITION_KEYS_KEY("partition_keys"); private static Set PLACEHOLDER_KEYS = new HashSet<>(); static { // O(1) complexity, using static to load all system placeholders for (TablePlaceholder placeholder : TablePlaceholder.values()) { PLACEHOLDER_KEYS.add(placeholder.getPlaceholder()); } } private final String key; TablePlaceholder(String placeholder) { this.key = placeholder; } public String getPlaceholder() { return key; } public static boolean isSystemPlaceholder(String str) { return PLACEHOLDER_KEYS.contains(str); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/TablePlaceholderProcessor.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.sink; import org.apache.seatunnel.shade.org.apache.commons.lang3.ObjectUtils; import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.api.table.catalog.CatalogTable; import org.apache.seatunnel.api.table.catalog.ConstraintKey; import org.apache.seatunnel.api.table.catalog.PrimaryKey; import org.apache.seatunnel.api.table.catalog.TableIdentifier; import org.apache.seatunnel.api.table.catalog.TableSchema; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; import static org.apache.seatunnel.common.utils.PlaceholderUtils.replacePlaceholders; public class TablePlaceholderProcessor { public static final String NAME_DELIMITER = "."; public static final String FIELD_DELIMITER = ","; private static String replaceTableIdentifier( String placeholder, TableIdentifier identifier, String defaultValue) { placeholder = replacePlaceholders( placeholder, TablePlaceholder.REPLACE_DATABASE_NAME_KEY.getPlaceholder(), identifier.getDatabaseName(), defaultValue); placeholder = replacePlaceholders( placeholder, TablePlaceholder.REPLACE_SCHEMA_NAME_KEY.getPlaceholder(), identifier.getSchemaName(), defaultValue); placeholder = replacePlaceholders( placeholder, 
TablePlaceholder.REPLACE_TABLE_NAME_KEY.getPlaceholder(), identifier.getTableName(), defaultValue); List fullPath = new ArrayList<>(); if (identifier.getDatabaseName() != null) { fullPath.add(identifier.getDatabaseName()); } if (identifier.getSchemaName() != null) { fullPath.add(identifier.getSchemaName()); } if (!fullPath.isEmpty()) { placeholder = replacePlaceholders( placeholder, TablePlaceholder.REPLACE_SCHEMA_FULL_NAME_KEY.getPlaceholder(), String.join(NAME_DELIMITER, fullPath), defaultValue); } if (identifier.getTableName() != null) { fullPath.add(identifier.getTableName()); } if (!fullPath.isEmpty()) { placeholder = replacePlaceholders( placeholder, TablePlaceholder.REPLACE_TABLE_FULL_NAME_KEY.getPlaceholder(), String.join(NAME_DELIMITER, fullPath), defaultValue); } return placeholder; } public static String replaceTableIdentifier(String placeholder, TableIdentifier identifier) { return replaceTableIdentifier(placeholder, identifier, ""); } public static String replaceTablePrimaryKey(String placeholder, PrimaryKey primaryKey) { if (primaryKey != null && !primaryKey.getColumnNames().isEmpty()) { String pkFieldsString = String.join(FIELD_DELIMITER, primaryKey.getColumnNames()); return replacePlaceholders( placeholder, TablePlaceholder.REPLACE_PRIMARY_KEY.getPlaceholder(), pkFieldsString); } return placeholder; } public static String replaceTableUniqueKey( String placeholder, List constraintKeys) { Optional ukFieldsString = constraintKeys.stream() .filter( e -> e.getConstraintType() .equals(ConstraintKey.ConstraintType.UNIQUE_KEY)) .findFirst() .map( e -> e.getColumnNames().stream() .map(f -> f.getColumnName()) .collect(Collectors.joining(FIELD_DELIMITER))); if (ukFieldsString.isPresent()) { return replacePlaceholders( placeholder, TablePlaceholder.REPLACE_UNIQUE_KEY.getPlaceholder(), ukFieldsString.get()); } return placeholder; } public static String replaceTableFieldNames(String placeholder, TableSchema schema) { return replacePlaceholders( placeholder, 
TablePlaceholder.REPLACE_FIELD_NAMES_KEY.getPlaceholder(), String.join(FIELD_DELIMITER, schema.getFieldNames())); } public static String replaceTablePartitionKeys(String placeholder, List partitionKeys) { if (partitionKeys != null && !partitionKeys.isEmpty()) { String partitionKeysString = String.join(FIELD_DELIMITER, partitionKeys); return replacePlaceholders( placeholder, TablePlaceholder.REPLACE_PARTITION_KEYS_KEY.getPlaceholder(), partitionKeysString); } return placeholder; } public static ReadonlyConfig replaceTablePlaceholder( ReadonlyConfig config, CatalogTable table) { return replaceTablePlaceholder(config, table, Collections.emptyList()); } public static ReadonlyConfig replaceTablePlaceholder( ReadonlyConfig config, CatalogTable table, Collection excludeKeys) { Map copyOnWriteData = ObjectUtils.clone(config.getSourceMap()); for (String key : copyOnWriteData.keySet()) { if (excludeKeys.contains(key)) { continue; } Object value = copyOnWriteData.get(key); if (value != null) { if (value instanceof String) { String strValue = (String) value; strValue = replaceTableIdentifier(strValue, table.getTableId()); strValue = replaceTablePrimaryKey( strValue, table.getTableSchema().getPrimaryKey()); strValue = replaceTableUniqueKey( strValue, table.getTableSchema().getConstraintKeys()); strValue = replaceTableFieldNames(strValue, table.getTableSchema()); strValue = replaceTablePartitionKeys(strValue, table.getPartitionKeys()); copyOnWriteData.put(key, strValue); } else if (value instanceof List) { List listValue = (List) value; if (listValue.size() == 1 && listValue.get(0) instanceof String) { String strValue = (String) listValue.get(0); if (strValue.equals( "${" + TablePlaceholder.REPLACE_PRIMARY_KEY.getPlaceholder() + "}")) { strValue = replaceTablePrimaryKey( strValue, table.getTableSchema().getPrimaryKey()); listValue = Arrays.asList(strValue.split(FIELD_DELIMITER)); } else if (strValue.equals( "${" + TablePlaceholder.REPLACE_UNIQUE_KEY.getPlaceholder() + "}")) { 
strValue = replaceTableUniqueKey( strValue, table.getTableSchema().getConstraintKeys()); listValue = Arrays.asList(strValue.split(FIELD_DELIMITER)); } else if (strValue.equals( "${" + TablePlaceholder.REPLACE_FIELD_NAMES_KEY.getPlaceholder() + "}")) { strValue = replaceTableFieldNames(strValue, table.getTableSchema()); listValue = Arrays.asList(strValue.split(FIELD_DELIMITER)); } else if (strValue.equals( "${" + TablePlaceholder.REPLACE_PARTITION_KEYS_KEY .getPlaceholder() + "}")) { List partitionKeys = table.getPartitionKeys(); if (partitionKeys != null && !partitionKeys.isEmpty()) { listValue = new ArrayList<>(partitionKeys); } } copyOnWriteData.put(key, listValue); } } } } return ReadonlyConfig.fromMap(copyOnWriteData); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/event/WriterCloseEvent.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/

package org.apache.seatunnel.api.sink.event;

import org.apache.seatunnel.api.event.EventType;
import org.apache.seatunnel.api.event.LifecycleEvent;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;

/**
 * Lifecycle event describing the close of a sink writer.
 *
 * <p>Accessors, {@code toString} and the all-arguments constructor are generated by Lombok.
 */
@Getter
@Setter
@ToString
@AllArgsConstructor
public class WriterCloseEvent implements LifecycleEvent {
    /** Creation time of the event, in epoch milliseconds. */
    private long createdTime;

    /** Id of the job the event belongs to; not set by the no-argument constructor. */
    private String jobId;

    /** Event type; defaults to {@link EventType#LIFECYCLE_WRITER_CLOSE}. */
    private EventType eventType = EventType.LIFECYCLE_WRITER_CLOSE;

    /** Creates an event stamped with the current system time. */
    public WriterCloseEvent() {
        this.createdTime = System.currentTimeMillis();
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/multitablesink/MultiTableAggregatedCommitInfo.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.seatunnel.api.sink.multitablesink;

import lombok.AllArgsConstructor;
import lombok.Getter;

import java.io.Serializable;
import java.util.Map;

/**
 * Serializable holder for the aggregated commit information of a multi-table sink.
 *
 * <p>The getter and the single-argument constructor are generated by Lombok.
 */
@Getter
@AllArgsConstructor
public class MultiTableAggregatedCommitInfo implements Serializable {
    // MultiTableSinkAggregatedCommitter stores one entry per table identifier string here, the
    // value being that table's own aggregated commit info.
    private Map commitInfo;
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/multitablesink/MultiTableCommitInfo.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.sink.multitablesink;

import lombok.AllArgsConstructor;
import lombok.Getter;

import java.io.Serializable;
import java.util.concurrent.ConcurrentMap;

/**
 * Serializable holder for the commit information produced by a multi-table sink writer.
 *
 * <p>The getter and the single-argument constructor are generated by Lombok.
 */
@Getter
@AllArgsConstructor
public class MultiTableCommitInfo implements Serializable {
    // The committers in this package read the entry keys through getTableIdentifier() to select
    // the entries belonging to one table; the values are the per-writer commit infos.
    private ConcurrentMap commitInfo;
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/multitablesink/MultiTableSink.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.sink.multitablesink; import org.apache.seatunnel.api.common.JobContext; import org.apache.seatunnel.api.options.SinkConnectorCommonOptions; import org.apache.seatunnel.api.serialization.DefaultSerializer; import org.apache.seatunnel.api.serialization.Serializer; import org.apache.seatunnel.api.sink.SeaTunnelSink; import org.apache.seatunnel.api.sink.SinkAggregatedCommitter; import org.apache.seatunnel.api.sink.SinkCommitter; import org.apache.seatunnel.api.sink.SinkWriter; import org.apache.seatunnel.api.sink.SupportSchemaEvolutionSink; import org.apache.seatunnel.api.table.catalog.CatalogTable; import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.api.table.factory.MultiTableFactoryContext; import org.apache.seatunnel.api.table.schema.SchemaChangeType; import org.apache.seatunnel.api.table.type.SeaTunnelRow; import lombok.Getter; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; public class MultiTableSink implements SeaTunnelSink< SeaTunnelRow, MultiTableState, MultiTableCommitInfo, MultiTableAggregatedCommitInfo>, SupportSchemaEvolutionSink { @Getter private final Map sinks; private final int replicaNum; public 
MultiTableSink(MultiTableFactoryContext context) { this.sinks = context.getSinks(); this.replicaNum = context.getOptions().get(SinkConnectorCommonOptions.MULTI_TABLE_SINK_REPLICA); } @Override public String getPluginName() { return "MultiTableSink"; } @Override public SinkWriter createWriter( SinkWriter.Context context) throws IOException { Map> writers = new HashMap<>(); Map sinkWritersContext = new HashMap<>(); for (int i = 0; i < replicaNum; i++) { for (TablePath tablePath : sinks.keySet()) { SeaTunnelSink sink = sinks.get(tablePath); int index = context.getIndexOfSubtask() * replicaNum + i; String tableIdentifier = tablePath.toString(); writers.put( SinkIdentifier.of(tableIdentifier, index), sink.createWriter(new SinkContextProxy(index, replicaNum, context))); sinkWritersContext.put(SinkIdentifier.of(tableIdentifier, index), context); } } return new MultiTableSinkWriter(writers, replicaNum, sinkWritersContext); } @Override public SinkWriter restoreWriter( SinkWriter.Context context, List states) throws IOException { Map> writers = new HashMap<>(); Map sinkWritersContext = new HashMap<>(); for (int i = 0; i < replicaNum; i++) { for (TablePath tablePath : sinks.keySet()) { SeaTunnelSink sink = sinks.get(tablePath); int index = context.getIndexOfSubtask() * replicaNum + i; SinkIdentifier sinkIdentifier = SinkIdentifier.of(tablePath.toString(), index); List state = states.stream() .map( multiTableState -> multiTableState.getStates().get(sinkIdentifier)) .filter(Objects::nonNull) .flatMap(Collection::stream) .collect(Collectors.toList()); if (state.isEmpty()) { writers.put( sinkIdentifier, sink.createWriter(new SinkContextProxy(index, replicaNum, context))); } else { writers.put( sinkIdentifier, sink.restoreWriter( new SinkContextProxy(index, replicaNum, context), state)); } sinkWritersContext.put(sinkIdentifier, context); } } return new MultiTableSinkWriter(writers, replicaNum, sinkWritersContext); } @Override public Optional> getWriterStateSerializer() { return 
Optional.of(new DefaultSerializer<>()); } @Override public Optional> createCommitter() throws IOException { Map> committers = new HashMap<>(); for (TablePath tablePath : sinks.keySet()) { SeaTunnelSink sink = sinks.get(tablePath); sink.createCommitter() .ifPresent( committer -> committers.put( tablePath.toString(), (SinkCommitter) committer)); } if (committers.isEmpty()) { return Optional.empty(); } return Optional.of(new MultiTableSinkCommitter(committers)); } @Override public Optional> getCommitInfoSerializer() { return Optional.of(new DefaultSerializer<>()); } @Override public Optional> createAggregatedCommitter() throws IOException { Map> aggCommitters = new HashMap<>(); for (TablePath tablePath : sinks.keySet()) { SeaTunnelSink sink = sinks.get(tablePath); Optional> sinkOptional = sink.createAggregatedCommitter(); sinkOptional.ifPresent( sinkAggregatedCommitter -> aggCommitters.put(tablePath.toString(), sinkAggregatedCommitter)); } if (aggCommitters.isEmpty()) { return Optional.empty(); } return Optional.of(new MultiTableSinkAggregatedCommitter(aggCommitters)); } public List getSinkTables() { List tablePaths = new ArrayList<>(); List values = new ArrayList<>(sinks.values()); for (int i = 0; i < values.size(); i++) { if (values.get(i).getWriteCatalogTable().isPresent()) { tablePaths.add( ((CatalogTable) values.get(i).getWriteCatalogTable().get()).getTablePath()); } else { tablePaths.add(sinks.keySet().toArray(new TablePath[0])[i]); } } return tablePaths; } @Override public Optional> getAggregatedCommitInfoSerializer() { return Optional.of(new DefaultSerializer<>()); } @Override public void setJobContext(JobContext jobContext) { sinks.values().forEach(sink -> sink.setJobContext(jobContext)); } @Override public Optional getWriteCatalogTable() { return SeaTunnelSink.super.getWriteCatalogTable(); } @Override public List supports() { SeaTunnelSink firstSink = sinks.entrySet().iterator().next().getValue(); if (firstSink instanceof SupportSchemaEvolutionSink) { return 
((SupportSchemaEvolutionSink) firstSink).supports(); } return Collections.emptyList(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/multitablesink/MultiTableSinkAggregatedCommitter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.sink.multitablesink; import org.apache.seatunnel.api.sink.MultiTableResourceManager; import org.apache.seatunnel.api.sink.SinkAggregatedCommitter; import org.apache.seatunnel.api.sink.SupportMultiTableSinkAggregatedCommitter; import lombok.extern.slf4j.Slf4j; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.stream.Collectors; @Slf4j public class MultiTableSinkAggregatedCommitter implements SinkAggregatedCommitter { private final Map> aggCommitters; private transient MultiTableResourceManager resourceManager = null; public MultiTableSinkAggregatedCommitter( Map> aggCommitters) { this.aggCommitters = aggCommitters; } @Override public void init() { initResourceManager(); } private void initResourceManager() { for (String tableIdentifier : aggCommitters.keySet()) { SinkAggregatedCommitter aggCommitter = aggCommitters.get(tableIdentifier); if (!(aggCommitter instanceof SupportMultiTableSinkAggregatedCommitter)) { break; } resourceManager = ((SupportMultiTableSinkAggregatedCommitter) aggCommitter) .initMultiTableResourceManager(aggCommitters.size(), 1); break; } for (SinkAggregatedCommitter aggCommitter : aggCommitters.values()) { aggCommitter.init(); if (resourceManager != null) { ((SupportMultiTableSinkAggregatedCommitter) aggCommitter) .setMultiTableResourceManager(resourceManager, 0); } } } @Override public List commit( List aggregatedCommitInfo) throws IOException { List errorList = new ArrayList<>(); for (String sinkIdentifier : aggCommitters.keySet()) { SinkAggregatedCommitter sinkCommitter = aggCommitters.get(sinkIdentifier); if (sinkCommitter != null) { List commitInfo = aggregatedCommitInfo.stream() .map( multiTableCommitInfo -> multiTableCommitInfo .getCommitInfo() .get(sinkIdentifier)) .filter(Objects::nonNull) .collect(Collectors.toList()); List errCommitList = sinkCommitter.commit(commitInfo); if 
(errCommitList.size() == 0) { continue; } for (int i = 0; i < errCommitList.size(); i++) { if (errorList.size() < i + 1) { errorList.add(i, new MultiTableAggregatedCommitInfo(new HashMap<>())); } errorList.get(i).getCommitInfo().put(sinkIdentifier, errCommitList.get(i)); } } } return errorList; } @Override public MultiTableAggregatedCommitInfo combine(List commitInfos) { Map commitInfo = new HashMap<>(); for (String sinkIdentifier : aggCommitters.keySet()) { SinkAggregatedCommitter sinkCommitter = aggCommitters.get(sinkIdentifier); if (sinkCommitter != null) { List commits = commitInfos.stream() .flatMap( multiTableCommitInfo -> multiTableCommitInfo.getCommitInfo().entrySet() .stream() .filter( m -> m.getKey() .getTableIdentifier() .equals( sinkIdentifier)) .map(Map.Entry::getValue)) .collect(Collectors.toList()); commitInfo.put(sinkIdentifier, sinkCommitter.combine(commits)); } } return new MultiTableAggregatedCommitInfo(commitInfo); } @Override public void abort(List aggregatedCommitInfo) throws Exception { Throwable firstE = null; for (String sinkIdentifier : aggCommitters.keySet()) { SinkAggregatedCommitter sinkCommitter = aggCommitters.get(sinkIdentifier); if (sinkCommitter != null) { List commitInfo = aggregatedCommitInfo.stream() .map( multiTableCommitInfo -> multiTableCommitInfo .getCommitInfo() .get(sinkIdentifier)) .filter(Objects::nonNull) .collect(Collectors.toList()); try { sinkCommitter.abort(commitInfo); } catch (Throwable e) { log.error("abort sink committer error", e); if (firstE == null) { firstE = e; } } } } if (firstE != null) { throw new RuntimeException(firstE); } } @Override public void close() throws IOException { Throwable firstE = null; for (String sinkIdentifier : aggCommitters.keySet()) { SinkAggregatedCommitter sinkCommitter = aggCommitters.get(sinkIdentifier); if (sinkCommitter != null) { try { sinkCommitter.close(); } catch (Throwable e) { log.error("close sink committer error", e); if (firstE == null) { firstE = e; } } } } if (firstE 
!= null) { throw new RuntimeException(firstE); } try { if (resourceManager != null) { resourceManager.close(); } } catch (Throwable e) { log.error("close resourceManager error", e); } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/multitablesink/MultiTableSinkCommitter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/

package org.apache.seatunnel.api.sink.multitablesink;

import org.apache.seatunnel.api.sink.SinkCommitter;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * A {@link SinkCommitter} that fans commit information out to one sub-committer per table.
 *
 * <p>For every key of {@code sinkCommitters}, the commit-info entries whose key reports the same
 * table identifier are collected and handed to the sub-committer stored under that key.
 *
 * <p>NOTE(review): generic type arguments are missing from the declarations in this listing
 * (for example the field type below); they look stripped by the extraction and are left as found.
 */
public class MultiTableSinkCommitter implements SinkCommitter {

    /** Sub-committers keyed by the table identifier string they are responsible for. */
    private final Map> sinkCommitters;

    /**
     * @param sinkCommitters sub-committers keyed by table identifier; the map is stored as-is, not
     *     copied.
     */
    public MultiTableSinkCommitter(Map> sinkCommitters) {
        this.sinkCommitters = sinkCommitters;
    }

    /**
     * Commits, per table, the commit information that belongs to that table.
     *
     * @param commitInfos elements exposing {@code getCommitInfo()}, a map whose keys expose {@code
     *     getTableIdentifier()}.
     * @return always a new, empty list.
     * @throws IOException declared by the interface; nothing in this method throws it directly.
     */
    @Override
    public List commit(List commitInfos) throws IOException {
        for (String sinkIdentifier : sinkCommitters.keySet()) {
            SinkCommitter sinkCommitter = sinkCommitters.get(sinkIdentifier);
            // A key may be mapped to null; such tables are silently skipped.
            if (sinkCommitter != null) {
                // Flatten all entries of all commit infos and keep the values whose key names
                // this table.
                List commitInfo =
                        commitInfos.stream()
                                .flatMap(
                                        multiTableCommitInfo ->
                                                multiTableCommitInfo.getCommitInfo().entrySet()
                                                        .stream()
                                                        .filter(
                                                                entry ->
                                                                        entry.getKey()
                                                                                .getTableIdentifier()
                                                                                .equals(
                                                                                        sinkIdentifier)))
                                .map(Map.Entry::getValue)
                                .collect(Collectors.toList());
                // NOTE(review): the value returned by the sub-committer is discarded here, so
                // whatever it reports back never reaches the caller - confirm this is intended.
                sinkCommitter.commit(commitInfo);
            }
        }
        return new ArrayList<>();
    }

    /**
     * Aborts, per table, the commit information that belongs to that table. The selection logic
     * is the same as in {@link #commit(List)}.
     *
     * @param commitInfos elements exposing {@code getCommitInfo()}, as for {@link #commit(List)}.
     * @throws IOException declared by the interface; nothing in this method throws it directly.
     */
    @Override
    public void abort(List commitInfos) throws IOException {
        for (String sinkIdentifier : sinkCommitters.keySet()) {
            SinkCommitter sinkCommitter = sinkCommitters.get(sinkIdentifier);
            if (sinkCommitter != null) {
                List commitInfo =
                        commitInfos.stream()
                                .flatMap(
                                        multiTableCommitInfo ->
                                                multiTableCommitInfo.getCommitInfo().entrySet()
                                                        .stream()
                                                        .filter(
                                                                entry ->
                                                                        entry.getKey()
                                                                                .getTableIdentifier()
                                                                                .equals(
                                                                                        sinkIdentifier)))
                                .map(Map.Entry::getValue)
                                .collect(Collectors.toList());
                sinkCommitter.abort(commitInfo);
            }
        }
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/multitablesink/MultiTableSinkFactory.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.sink.multitablesink; import org.apache.seatunnel.api.configuration.util.OptionRule; import org.apache.seatunnel.api.table.connector.TableSink; import org.apache.seatunnel.api.table.factory.Factory; import org.apache.seatunnel.api.table.factory.MultiTableFactoryContext; import org.apache.seatunnel.api.table.factory.TableSinkFactory; import org.apache.seatunnel.api.table.factory.TableSinkFactoryContext; import com.google.auto.service.AutoService; @AutoService(Factory.class) public class MultiTableSinkFactory implements TableSinkFactory { @Override public String factoryIdentifier() { return "MultiTableSink"; } @Override public TableSink createSink(TableSinkFactoryContext context) { if (context instanceof MultiTableFactoryContext) { return () -> new MultiTableSink((MultiTableFactoryContext) context); } else { throw new UnsupportedOperationException( "MultiTableSinkFactory only support MultiTableFactoryContext"); } } @Override public OptionRule optionRule() { return OptionRule.builder().build(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/multitablesink/MultiTableSinkWriter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.sink.multitablesink; import org.apache.seatunnel.api.sink.MultiTableResourceManager; import org.apache.seatunnel.api.sink.SinkWriter; import org.apache.seatunnel.api.sink.SupportMultiTableSinkWriter; import org.apache.seatunnel.api.sink.SupportSchemaEvolutionSinkWriter; import org.apache.seatunnel.api.table.schema.event.SchemaChangeEvent; import org.apache.seatunnel.api.table.type.SeaTunnelRow; import org.apache.seatunnel.api.tracing.MDCTracer; import lombok.extern.slf4j.Slf4j; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Random; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @Slf4j public class MultiTableSinkWriter implements SinkWriter, SupportSchemaEvolutionSinkWriter { private final Map> sinkWriters; private final Map sinkWritersContext; private final Map> sinkPrimaryKeys = new HashMap<>(); private final 
List>> sinkWritersWithIndex; private final List runnable = new ArrayList<>(); private final Random random = new Random(); private final List> blockingQueues = new ArrayList<>(); private final ExecutorService executorService; private MultiTableResourceManager resourceManager; private volatile boolean submitted = false; public MultiTableSinkWriter( Map> sinkWriters, int queueSize, Map sinkWritersContext) { this.sinkWriters = sinkWriters; this.sinkWritersContext = sinkWritersContext; AtomicInteger cnt = new AtomicInteger(0); executorService = MDCTracer.tracing( Executors.newFixedThreadPool( // we use it in `MultiTableWriterRunnable` and `prepare commit // task`, so it // should be double. queueSize * 2, runnable -> { Thread thread = new Thread(runnable); thread.setDaemon(true); thread.setName( "st-multi-table-sink-writer" + "-" + cnt.incrementAndGet()); return thread; })); sinkWritersWithIndex = new ArrayList<>(); for (int i = 0; i < queueSize; i++) { BlockingQueue queue = new LinkedBlockingQueue<>(1024); Map> tableIdWriterMap = new HashMap<>(); ConcurrentMap> sinkIdentifierMap = new ConcurrentHashMap<>(); int queueIndex = i; sinkWriters.entrySet().stream() .filter(entry -> entry.getKey().getIndex() % queueSize == queueIndex) .forEach( entry -> { tableIdWriterMap.put( entry.getKey().getTableIdentifier(), entry.getValue()); sinkIdentifierMap.put(entry.getKey(), entry.getValue()); }); sinkWritersWithIndex.add(sinkIdentifierMap); blockingQueues.add(queue); MultiTableWriterRunnable r = new MultiTableWriterRunnable(tableIdWriterMap, queue); runnable.add(r); } log.info("init multi table sink writer, queue size: {}", queueSize); initResourceManager(queueSize); } private void initResourceManager(int queueSize) { for (SinkIdentifier tableIdentifier : sinkWriters.keySet()) { SinkWriter sink = sinkWriters.get(tableIdentifier); resourceManager = ((SupportMultiTableSinkWriter) sink) .initMultiTableResourceManager(sinkWriters.size(), queueSize); break; } for (int i = 0; i < 
sinkWritersWithIndex.size(); i++) { Map> writerMap = sinkWritersWithIndex.get(i); for (Map.Entry> entry : writerMap.entrySet()) { SupportMultiTableSinkWriter sink = ((SupportMultiTableSinkWriter) entry.getValue()); sink.setMultiTableResourceManager(resourceManager, i); sinkPrimaryKeys.put(entry.getKey().getTableIdentifier(), sink.primaryKey()); } } } private void subSinkErrorCheck() { for (MultiTableWriterRunnable writerRunnable : runnable) { if (writerRunnable.getThrowable() != null) { throw new RuntimeException( String.format( "table %s sink throw error", writerRunnable.getCurrentTableId()), writerRunnable.getThrowable()); } } } @Override public void applySchemaChange(SchemaChangeEvent event) throws IOException { subSinkErrorCheck(); for (int i = 0; i < sinkWritersWithIndex.size(); i++) { for (Map.Entry> sinkWriterEntry : sinkWritersWithIndex.get(i).entrySet()) { if (sinkWriterEntry .getKey() .getTableIdentifier() .equals(event.tablePath().getFullName())) { log.info( "Start apply schema change for table {} sub-writer {}", sinkWriterEntry.getKey().getTableIdentifier(), sinkWriterEntry.getKey().getIndex()); synchronized (runnable.get(i)) { if (sinkWriterEntry.getValue() instanceof SupportSchemaEvolutionSinkWriter) { ((SupportSchemaEvolutionSinkWriter) sinkWriterEntry.getValue()) .applySchemaChange(event); } else { // TODO remove deprecated method sinkWriterEntry.getValue().applySchemaChange(event); } } log.info( "Finish apply schema change for table {} sub-writer {}", sinkWriterEntry.getKey().getTableIdentifier(), sinkWriterEntry.getKey().getIndex()); } } } } @Override public void write(SeaTunnelRow element) throws IOException { if (element != null && element.getOptions() != null) { if (element.getOptions().containsKey("flush_event") || element.getOptions().containsKey("schema_change_event")) { log.debug("Skipping schema change event row: {}", element.getOptions().keySet()); return; } } if (!submitted) { submitted = true; runnable.forEach(executorService::submit); 
} subSinkErrorCheck(); Optional primaryKey = sinkPrimaryKeys.get(element.getTableId()); try { if ((primaryKey == null && sinkPrimaryKeys.size() == 1) || (primaryKey != null && !primaryKey.isPresent())) { int index = random.nextInt(blockingQueues.size()); BlockingQueue queue = blockingQueues.get(index); while (!queue.offer(element, 500, TimeUnit.MILLISECONDS)) { subSinkErrorCheck(); } } else if (primaryKey == null) { throw new RuntimeException( "multi table sink can not write table: " + element.getTableId()); } else { Object object = element.getField(primaryKey.get()); int index = 0; if (object != null) { index = Math.abs(object.hashCode()) % blockingQueues.size(); } BlockingQueue queue = blockingQueues.get(index); while (!queue.offer(element, 500, TimeUnit.MILLISECONDS)) { subSinkErrorCheck(); } } } catch (InterruptedException e) { throw new IOException(e); } } @Override public List snapshotState(long checkpointId) throws IOException { checkQueueRemain(); subSinkErrorCheck(); List multiTableStates = new ArrayList<>(); MultiTableState multiTableState = new MultiTableState(new HashMap<>()); for (int i = 0; i < sinkWritersWithIndex.size(); i++) { for (Map.Entry> sinkWriterEntry : sinkWritersWithIndex.get(i).entrySet()) { synchronized (runnable.get(i)) { List states = sinkWriterEntry.getValue().snapshotState(checkpointId); multiTableState.getStates().put(sinkWriterEntry.getKey(), states); } } } multiTableStates.add(multiTableState); return multiTableStates; } @Override public Optional prepareCommit() throws IOException { return Optional.empty(); } @Override public Optional prepareCommit(long checkpointId) throws IOException { checkQueueRemain(); subSinkErrorCheck(); MultiTableCommitInfo multiTableCommitInfo = new MultiTableCommitInfo(new ConcurrentHashMap<>()); List> futures = new ArrayList<>(); for (int i = 0; i < sinkWritersWithIndex.size(); i++) { int subWriterIndex = i; futures.add( executorService.submit( () -> { synchronized (runnable.get(subWriterIndex)) { for 
(Map.Entry> sinkWriterEntry : sinkWritersWithIndex .get(subWriterIndex) .entrySet()) { Optional commit; try { SinkWriter sinkWriter = sinkWriterEntry.getValue(); commit = sinkWriter.prepareCommit(checkpointId); } catch (IOException e) { throw new RuntimeException(e); } commit.ifPresent( o -> multiTableCommitInfo .getCommitInfo() .put(sinkWriterEntry.getKey(), o)); } } })); } for (Future future : futures) { try { future.get(); } catch (Exception e) { throw new RuntimeException(e); } } if (multiTableCommitInfo.getCommitInfo().isEmpty()) { return Optional.empty(); } return Optional.of(multiTableCommitInfo); } @Override public void abortPrepare() { Throwable firstE = null; try { checkQueueRemain(); } catch (Exception e) { firstE = e; } for (int i = 0; i < sinkWritersWithIndex.size(); i++) { synchronized (runnable.get(i)) { for (SinkWriter sinkWriter : sinkWritersWithIndex.get(i).values()) { try { sinkWriter.abortPrepare(); } catch (Throwable e) { if (firstE == null) { firstE = e; } log.error("abortPrepare error", e); } } } } if (firstE != null) { throw new RuntimeException(firstE); } } @Override public void close() throws IOException { // The variables used in lambda expressions should be final or valid final, so they are // modified to arrays final Throwable[] firstE = {null}; try { checkQueueRemain(); } catch (Exception e) { firstE[0] = e; } executorService.shutdownNow(); for (int i = 0; i < sinkWritersWithIndex.size(); i++) { synchronized (runnable.get(i)) { Map> sinkIdentifierSinkWriterMap = sinkWritersWithIndex.get(i); sinkIdentifierSinkWriterMap.forEach( (identifier, sinkWriter) -> { try { sinkWriter.close(); } catch (Throwable e) { if (firstE[0] == null) { firstE[0] = e; } log.error("close error", e); } }); } } try { if (resourceManager != null) { resourceManager.close(); } } catch (Throwable e) { log.error("close resourceManager error", e); } if (firstE[0] != null) { throw new RuntimeException(firstE[0]); } } private void checkQueueRemain() { try { for 
(BlockingQueue blockingQueue : blockingQueues) { while (!blockingQueue.isEmpty()) { Thread.sleep(100); subSinkErrorCheck(); } } } catch (InterruptedException e) { throw new RuntimeException(e); } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/multitablesink/MultiTableState.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.sink.multitablesink; import lombok.AllArgsConstructor; import lombok.Getter; import java.io.Serializable; import java.util.List; import java.util.Map; @Getter @AllArgsConstructor public class MultiTableState implements Serializable { private Map> states; } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/multitablesink/MultiTableWriterRunnable.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.sink.multitablesink;

import org.apache.seatunnel.api.sink.SinkWriter;
import org.apache.seatunnel.api.table.type.SeaTunnelRow;

import lombok.extern.slf4j.Slf4j;

import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;

/**
 * Drains one queue of rows and writes each row to the {@link SinkWriter} registered for the row's
 * table id. The loop ends on the first failure, which is kept in {@link #getThrowable()}.
 *
 * <p>NOTE(review): generic type arguments are missing from the declarations in this listing (they
 * look stripped by the extraction); those tokens are left as found.
 */
@Slf4j
public class MultiTableWriterRunnable implements Runnable {

    /** Writers served by this runnable, keyed by table id. */
    private final Map> tableIdWriterMap;

    /** Source of the rows to write. */
    private final BlockingQueue queue;

    /** First failure that ended {@link #run()}, or {@code null}; volatile as it is read via a getter. */
    private volatile Throwable throwable;

    /** Table id of the row most recently selected for writing. */
    private volatile String currentTableId;

    /**
     * @param tableIdWriterMap writers keyed by table id.
     * @param queue queue this runnable polls rows from.
     */
    public MultiTableWriterRunnable(
            Map> tableIdWriterMap, BlockingQueue queue) {
        this.tableIdWriterMap = tableIdWriterMap;
        this.queue = queue;
    }

    /**
     * Polls the queue in 100 ms steps until a failure or an interrupt occurs. Each write happens
     * while holding this object's monitor.
     */
    @Override
    public void run() {
        while (true) {
            SeaTunnelRow row = null;
            try {
                row = queue.poll(100, TimeUnit.MILLISECONDS);
                // Timed out without a row: poll again.
                if (row == null) {
                    continue;
                }
                // control rows used for schema evolution / coordination
                // are represented as SeaTunnelRow with zero fields (arity == 0)
                if (row.getArity() == 0) {
                    log.debug(
                            "Skip control SeaTunnelRow with zero arity in MultiTableWriterRunnable: {}",
                            row);
                    continue;
                }
                SinkWriter writer = tableIdWriterMap.get(row.getTableId());
                if (writer == null) {
                    // Unknown table id: fall back to the only writer if exactly one is
                    // registered, otherwise fail.
                    if (tableIdWriterMap.size() == 1) {
                        writer = tableIdWriterMap.values().stream().findFirst().get();
                        currentTableId = tableIdWriterMap.keySet().stream().findFirst().get();
                    } else {
                        throw new RuntimeException(
                                "MultiTableWriterRunnable can't find writer for tableId: "
                                        + row.getTableId());
                    }
                } else {
                    currentTableId = row.getTableId();
                }
                synchronized (this) {
                    writer.write(row);
                }
            } catch (InterruptedException e) {
                // When the job finished, the thread will be interrupted, so we ignore this
                // exception.
                // NOTE(review): despite the remark above, the exception is still stored in
                // `throwable`, so it is visible through getThrowable() - confirm that is wanted.
                throwable = e;
                break;
            } catch (Throwable e) {
                log.error(
                        String.format("MultiTableWriterRunnable error when write row %s", row), e);
                throwable = e;
                break;
            }
        }
    }

    /** @return the failure that stopped {@link #run()}, or {@code null} if none was recorded. */
    public Throwable getThrowable() {
        return throwable;
    }

    /** @return the table id most recently selected for writing, or {@code null} if none yet. */
    public String getCurrentTableId() {
        return currentTableId;
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/multitablesink/SinkContextProxy.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.sink.multitablesink; import org.apache.seatunnel.api.common.metrics.MetricsContext; import org.apache.seatunnel.api.event.EventListener; import org.apache.seatunnel.api.sink.SinkWriter; public class SinkContextProxy implements SinkWriter.Context { private final int index; private final int replicaNum; private final SinkWriter.Context context; public SinkContextProxy(int index, int replicaNum, SinkWriter.Context context) { this.index = index; this.replicaNum = replicaNum; this.context = context; } @Override public int getIndexOfSubtask() { return index; } @Override public int getNumberOfParallelSubtasks() { return context.getNumberOfParallelSubtasks() * replicaNum; } @Override public MetricsContext getMetricsContext() { return context.getMetricsContext(); } @Override public EventListener getEventListener() { return context.getEventListener(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/multitablesink/SinkIdentifier.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/

package org.apache.seatunnel.api.sink.multitablesink;

import lombok.EqualsAndHashCode;
import lombok.Getter;

import java.io.Serializable;

/**
 * Immutable, serializable pair of a table identifier and an integer index. Instances are created
 * through {@link #of(String, int)}; getters, {@code equals} and {@code hashCode} are generated by
 * Lombok.
 */
@Getter
@EqualsAndHashCode
public class SinkIdentifier implements Serializable {
    // Use jvm default serial version uid
    private static final long serialVersionUID = 5378869132870084393L;

    private final String tableIdentifier;
    private final int index;

    // Private: instances are only created through of(...).
    private SinkIdentifier(String tableIdentifier, int index) {
        this.tableIdentifier = tableIdentifier;
        this.index = index;
    }

    /**
     * Creates an identifier.
     *
     * @param tableIdentifier table identifier; stored as given, not validated.
     * @param index index stored alongside the table identifier.
     * @return a new {@link SinkIdentifier}.
     */
    public static SinkIdentifier of(String tableIdentifier, int index) {
        return new SinkIdentifier(tableIdentifier, index);
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/Boundedness.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.source;

/**
 * Used to define the boundedness of a source. In batch mode, the source is {@link
 * Boundedness#BOUNDED}. In streaming mode, the source is {@link Boundedness#UNBOUNDED}.
 */
public enum Boundedness {
    /** A BOUNDED stream is a stream with finite records. */
    BOUNDED,
    /** An UNBOUNDED stream is a stream with infinite records. */
    UNBOUNDED
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/Collector.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.source;

import org.apache.seatunnel.api.table.schema.event.SchemaChangeEvent;

/**
 * A {@link Collector} is used to collect data from {@link SourceReader}.
 *
 * <p>Only {@link #collect(Object)} and {@link #getCheckpointLock()} must be implemented; every
 * other method has a default that does nothing (or returns {@code false}).
 *
 * @param <T> data type.
 */
public interface Collector {

    /**
     * Collects one record.
     *
     * @param record the record to collect.
     */
    void collect(T record);

    /** Hook with an empty default implementation. */
    default void markSchemaChangeBeforeCheckpoint() {}

    /**
     * Collects a schema change event; the default implementation ignores it.
     *
     * @param event the schema change event.
     */
    default void collect(SchemaChangeEvent event) {}

    /** Hook with an empty default implementation. */
    default void markSchemaChangeAfterCheckpoint() {}

    /**
     * Returns the checkpoint lock.
     *
     * @return The object to use as the lock
     */
    Object getCheckpointLock();

    /** @return {@code false} by default. */
    default boolean isEmptyThisPollNext() {
        return false;
    }

    /** Hook with an empty default implementation. */
    default void resetEmptyThisPollNext() {}
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SeaTunnelJobAware.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.source;

import org.apache.seatunnel.api.common.JobContext;

/**
 * Implemented by components that can be given the {@link JobContext} of the SeaTunnel job they
 * belong to.
 */
public interface SeaTunnelJobAware {

    /**
     * Receives the job context. The default implementation ignores it, so implementors only
     * override this when they need the context.
     *
     * @param jobContext the context of the running job.
     */
    default void setJobContext(JobContext jobContext) {
        // nothing
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SeaTunnelSource.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.seatunnel.api.source;

import org.apache.seatunnel.api.common.PluginIdentifierInterface;
import org.apache.seatunnel.api.common.SeaTunnelPluginLifeCycle;
import org.apache.seatunnel.api.serialization.DefaultSerializer;
import org.apache.seatunnel.api.serialization.Serializer;
import org.apache.seatunnel.api.table.catalog.CatalogTable;
import org.apache.seatunnel.api.table.type.SeaTunnelDataType;

import java.io.Serializable;
import java.util.List;

/**
 * The interface for Source. It acts like a factory class that helps construct the {@link
 * SourceSplitEnumerator} and {@link SourceReader} and corresponding serializers.
 *
 * @param The type of records produced by the source.
 * @param The type of splits handled by the source.
 * @param The type of checkpoint states.
 */
public interface SeaTunnelSource
        extends Serializable,
                PluginIdentifierInterface,
                SeaTunnelPluginLifeCycle,
                SeaTunnelJobAware {

    /**
     * Get the boundedness of this source.
     *
     * @return the boundedness of this source.
     */
    Boundedness getBoundedness();

    /**
     * Get the data type of the records produced by this source.
     *
     * <p>The default implementation returns the row type of the first table returned by {@link
     * #getProducedCatalogTables}, so it fails unless that method is overridden.
     *
     * @deprecated Please use {@link #getProducedCatalogTables}
     * @return SeaTunnel data type.
     */
    @Deprecated
    default SeaTunnelDataType getProducedType() {
        return (SeaTunnelDataType) getProducedCatalogTables().get(0).getSeaTunnelRowType();
    }

    /**
     * Get the catalog tables output by this source. It is recommended that all connectors
     * implement this method instead of {@link #getProducedType}. CatalogTable contains more
     * information to help downstream support more accurate and complete synchronization
     * capabilities.
     *
     * @return the catalog tables produced by this source.
     * @throws UnsupportedOperationException always, in the default implementation.
     */
    default List getProducedCatalogTables() {
        throw new UnsupportedOperationException(
                "getProducedCatalogTables method has not been implemented.");
    }

    /**
     * Create source reader, used to produce data.
     *
     * @param readerContext reader context.
     * @return source reader.
     * @throws Exception when create reader failed.
     */
    SourceReader createReader(SourceReader.Context readerContext) throws Exception;

    /**
     * Create split serializer, used to serialize/deserialize splits generated by {@link
     * SourceSplitEnumerator}. The default is a {@link DefaultSerializer}.
     *
     * @return split serializer.
     */
    default Serializer getSplitSerializer() {
        return new DefaultSerializer<>();
    }

    /**
     * Create source split enumerator, used to generate splits. This method will be called only
     * once when starting a source.
     *
     * @param enumeratorContext enumerator context.
     * @return source split enumerator.
     * @throws Exception when create enumerator failed.
     */
    SourceSplitEnumerator createEnumerator(
            SourceSplitEnumerator.Context enumeratorContext) throws Exception;

    /**
     * Create source split enumerator, used to generate splits. This method will be called when
     * restoring from a checkpoint.
     *
     * @param enumeratorContext enumerator context.
     * @param checkpointState checkpoint state.
     * @return source split enumerator.
     * @throws Exception when create enumerator failed.
     */
    SourceSplitEnumerator restoreEnumerator(
            SourceSplitEnumerator.Context enumeratorContext, StateT checkpointState)
            throws Exception;

    /**
     * Create enumerator state serializer, used to serialize/deserialize checkpoint state. The
     * default is a {@link DefaultSerializer}.
     *
     * @return enumerator state serializer.
     */
    default Serializer getEnumeratorStateSerializer() {
        return new DefaultSerializer<>();
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SourceEvent.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.source;

import java.io.Serializable;

/**
 * Marker interface for the events passed between the {@link SourceReader} and {@link
 * SourceSplitEnumerator}. It declares no methods of its own.
 */
public interface SourceEvent extends Serializable {}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SourceReader.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.source;

import org.apache.seatunnel.api.common.metrics.MetricsContext;
import org.apache.seatunnel.api.event.EventListener;
import org.apache.seatunnel.api.state.CheckpointListener;

import java.io.IOException;
import java.util.List;

/**
 * The {@link SourceReader} is used to generate source record, and it will be running at worker.
 *
 * @param <T> record type.
 * @param <SplitT> source split type.
 */
public interface SourceReader<T, SplitT extends SourceSplit>
        extends AutoCloseable, CheckpointListener {

    /**
     * Open the source reader.
     *
     * @throws Exception if the reader cannot be opened.
     */
    void open() throws Exception;

    /**
     * Called to close the reader, in case it holds on to any resources, like threads or network
     * connections.
     *
     * @throws IOException if releasing the resources fails.
     */
    @Override
    void close() throws IOException;

    /**
     * Generate the next batch of records.
     *
     * @param output output collector.
     * @throws Exception if error occurs.
     */
    void pollNext(Collector<T> output) throws Exception;

    /**
     * Get the current split checkpoint state by checkpointId.
     *
     * <p>If the source is bounded, checkpoint is not triggered.
     *
     * @param checkpointId checkpoint Id.
     * @return split checkpoint state.
     * @throws Exception if error occurs.
     */
    List<SplitT> snapshotState(long checkpointId) throws Exception;

    /**
     * Add the split checkpoint state to reader.
     *
     * @param splits split checkpoint state.
     */
    void addSplits(List<SplitT> splits);

    /**
     * This method is called when the reader is notified that it will not receive any further
     * splits.
     *
     * <p>It is triggered when the enumerator calls {@link
     * SourceSplitEnumerator.Context#signalNoMoreSplits(int)} with the reader's parallel subtask.
     */
    void handleNoMoreSplits();

    /**
     * Handle the source event from {@link SourceSplitEnumerator}. The default implementation
     * ignores the event.
     *
     * @param sourceEvent source event.
     */
    default void handleSourceEvent(SourceEvent sourceEvent) {}

    /** Runtime services made available to a {@link SourceReader}. */
    interface Context {

        /** @return The index of this subtask. */
        int getIndexOfSubtask();

        /** @return boundedness of this reader. */
        Boundedness getBoundedness();

        /** Indicator that the input has reached the end of data. Then will cancel this reader. */
        void signalNoMoreElement();

        /**
         * Sends a split request to the source's {@link SourceSplitEnumerator}. This will result in
         * a call to the {@link SourceSplitEnumerator#handleSplitRequest(int)} method, with this
         * reader's parallel subtask id.
         */
        void sendSplitRequest();

        /**
         * Send a source event to the source coordinator.
         *
         * @param sourceEvent the source event to coordinator.
         */
        void sendSourceEventToEnumerator(SourceEvent sourceEvent);

        /** @return metricsContext of this reader. */
        MetricsContext getMetricsContext();

        /**
         * Get the {@link EventListener} of this reader.
         *
         * @return the event listener of this reader.
         */
        EventListener getEventListener();
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SourceSplit.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.source;

import java.io.Serializable;

/**
 * An interface for all the Split types to extend.
 *
 * <p>Splits are {@link Serializable}; the only contract declared here is that each split exposes
 * a string identifier.
 */
public interface SourceSplit extends Serializable {

    /**
     * Get the split id of this source split.
     *
     * @return id of this source split.
     */
    String splitId();
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SourceSplitEnumerator.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.source; import org.apache.seatunnel.api.common.metrics.MetricsContext; import org.apache.seatunnel.api.event.EventListener; import org.apache.seatunnel.api.state.CheckpointListener; import java.io.IOException; import java.util.Collections; import java.util.List; import java.util.Set; /** * The {@link SourceSplitEnumerator} is responsible for enumerating the splits of a source. It will * run at master. * * @param source split type * @param source split state type */ public interface SourceSplitEnumerator extends AutoCloseable, CheckpointListener { void open(); /** * Executes engine setup steps in a fixed, non‑concurrent sequence. * *

    Before the first {@link #run()} invocation, methods are called in this order: * *

      *
    1. {@link #open()} *
    2. {@link #addSplitsBack(List, int)} *
    3. {@link #registerReader(int)} *
    * *

    {@implNote The engine guarantees this invocation order and ensures there are no * concurrency issues between these calls.} */ void run() throws Exception; /** * Called to close the enumerator, in case it holds on to any resources, like threads or network * connections. */ @Override void close() throws IOException; /** * Add a split back to the split enumerator. It will only happen when a {@link SourceReader} * fails and there are splits assigned to it after the last successful checkpoint. * * @param splits The split to add back to the enumerator for reassignment. * @param subtaskId The id of the subtask to which the returned splits belong. */ void addSplitsBack(List splits, int subtaskId); int currentUnassignedSplitSize(); void handleSplitRequest(int subtaskId); void registerReader(int subtaskId); /** * Used to snapshot the state of the enumerator. * *

    Concurrency Consideration:
    * This method and {@link #run()} can be invoked concurrently by different threads. * Systematically manage shared state access to prevent race conditions. */ StateT snapshotState(long checkpointId) throws Exception; /** * Handle the source event from {@link SourceReader}. * * @param subtaskId The id of the subtask to which the source event from. * @param sourceEvent source event. */ default void handleSourceEvent(int subtaskId, SourceEvent sourceEvent) {} interface Context { int currentParallelism(); /** * Get the currently registered readers. The mapping is from subtask id to the reader info. * * @return the currently registered readers. */ Set registeredReaders(); /** Assign the splits. */ void assignSplit(int subtaskId, List splits); /** * Assigns a single split. * *

    When assigning multiple splits, it is more efficient to assign all of them in a single * call to the {@link #assignSplit} method. * * @param split The new split * @param subtaskId The index of the operator's parallel subtask that shall receive the * split. */ default void assignSplit(int subtaskId, SplitT split) { assignSplit(subtaskId, Collections.singletonList(split)); } /** * Signals a subtask that it will not receive any further split. * * @param subtask The index of the operator's parallel subtask that shall be signaled it * will not receive any further split. */ void signalNoMoreSplits(int subtask); /** * Send a source event to a source reader. The source reader is identified by its subtask * id. * * @param subtaskId the subtask id of the source reader to send this event to. * @param event the source event to send. */ void sendEventToSourceReader(int subtaskId, SourceEvent event); /** @return metricsContext of this reader. */ MetricsContext getMetricsContext(); /** * Get the {@link EventListener} of this enumerator. * * @return */ EventListener getEventListener(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SupportColumnProjection.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.source;

/**
 * Marker interface: a Source connector implements it to declare that it supports column
 * projection. It declares no methods.
 */
public interface SupportColumnProjection {}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SupportCoordinate.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.source;

/**
 * Marker interface used to mark whether the implementing type supports coordination. It declares
 * no methods.
 */
public interface SupportCoordinate {}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SupportParallelism.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.source; /** Mark whether the Source connector supports parallelism */ public interface SupportParallelism {} ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/SupportSchemaEvolution.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.source; import org.apache.seatunnel.api.table.schema.SchemaChangeType; import java.util.List; public interface SupportSchemaEvolution { /** * Whether the source connector supports schema evolution. 
* * @return the supported schema change types */ List supports(); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/event/EnumeratorCloseEvent.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.source.event; import org.apache.seatunnel.api.event.EventType; import org.apache.seatunnel.api.event.LifecycleEvent; import lombok.AllArgsConstructor; import lombok.Getter; import lombok.Setter; import lombok.ToString; @Getter @Setter @ToString @AllArgsConstructor public class EnumeratorCloseEvent implements LifecycleEvent { private long createdTime; private String jobId; private EventType eventType = EventType.LIFECYCLE_ENUMERATOR_CLOSE; public EnumeratorCloseEvent() { this.createdTime = System.currentTimeMillis(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/event/EnumeratorOpenEvent.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.source.event;

import org.apache.seatunnel.api.event.EventType;
import org.apache.seatunnel.api.event.LifecycleEvent;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;

/**
 * Lifecycle event whose event type defaults to {@link EventType#LIFECYCLE_ENUMERATOR_OPEN}.
 *
 * <p>Accessors, {@code toString()} and the all-arguments constructor are generated by Lombok; the
 * all-arguments constructor takes the fields in declaration order.
 */
@Getter
@Setter
@ToString
@AllArgsConstructor
public class EnumeratorOpenEvent implements LifecycleEvent {
    private long createdTime;
    private String jobId;
    private EventType eventType = EventType.LIFECYCLE_ENUMERATOR_OPEN;

    /** Creates the event stamped with the current system time; {@code jobId} is left unset. */
    public EnumeratorOpenEvent() {
        this.createdTime = System.currentTimeMillis();
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/event/MessageDelayedEvent.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.source.event;

import org.apache.seatunnel.api.event.Event;
import org.apache.seatunnel.api.event.EventType;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import lombok.ToString;

/**
 * Event whose event type defaults to {@link EventType#READER_MESSAGE_DELAYED}; it carries a delay
 * time and, optionally, a string form of the record concerned.
 *
 * <p>Accessors, {@code toString()}, the all-arguments constructor and the no-argument constructor
 * are generated by Lombok. Only the two explicit constructors below set {@code createdTime}; the
 * generated no-argument constructor leaves it at {@code 0}.
 */
@Getter
@Setter
@ToString
@AllArgsConstructor
@NoArgsConstructor
public class MessageDelayedEvent implements Event {
    private long createdTime;
    private String jobId;
    private EventType eventType = EventType.READER_MESSAGE_DELAYED;

    // NOTE(review): the time unit of delayTime is not visible in this file — confirm with callers.
    private long delayTime;
    private String record;

    /**
     * Creates an event with the given delay and no record.
     *
     * @param delayTime the delay time.
     */
    public MessageDelayedEvent(long delayTime) {
        this(delayTime, null);
    }

    /**
     * Creates an event stamped with the current system time.
     *
     * @param delayTime the delay time.
     * @param record the record, may be {@code null}.
     */
    public MessageDelayedEvent(long delayTime, String record) {
        this.delayTime = delayTime;
        this.record = record;
        this.createdTime = System.currentTimeMillis();
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/event/ReaderCloseEvent.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.source.event;

import org.apache.seatunnel.api.event.EventType;
import org.apache.seatunnel.api.event.LifecycleEvent;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;

/**
 * Lifecycle event whose event type defaults to {@link EventType#LIFECYCLE_READER_CLOSE}.
 *
 * <p>Accessors, {@code toString()} and the all-arguments constructor are generated by Lombok; the
 * all-arguments constructor takes the fields in declaration order.
 */
@Getter
@Setter
@ToString
@AllArgsConstructor
public class ReaderCloseEvent implements LifecycleEvent {
    private long createdTime;
    private String jobId;
    private EventType eventType = EventType.LIFECYCLE_READER_CLOSE;

    /** Creates the event stamped with the current system time; {@code jobId} is left unset. */
    public ReaderCloseEvent() {
        this.createdTime = System.currentTimeMillis();
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/source/event/ReaderOpenEvent.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.seatunnel.api.source.event;

import org.apache.seatunnel.api.event.EventType;
import org.apache.seatunnel.api.event.LifecycleEvent;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.Setter;
import lombok.ToString;

/**
 * Lifecycle event whose event type defaults to {@link EventType#LIFECYCLE_READER_OPEN}.
 *
 * <p>Accessors, {@code toString()} and the all-arguments constructor are generated by Lombok; the
 * all-arguments constructor takes the fields in declaration order.
 */
@Getter
@Setter
@ToString
@AllArgsConstructor
public class ReaderOpenEvent implements LifecycleEvent {
    private long createdTime;
    private String jobId;
    private EventType eventType = EventType.LIFECYCLE_READER_OPEN;

    /** Creates the event stamped with the current system time; {@code jobId} is left unset. */
    public ReaderOpenEvent() {
        this.createdTime = System.currentTimeMillis();
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/state/CheckpointListener.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.state;

/**
 * Callbacks about the outcome of a checkpoint.
 *
 * <p>If the data flow is bounded, checkpoint is not triggered.
 */
public interface CheckpointListener {

    /**
     * Callback for a completed checkpoint.
     *
     * @param checkpointId id of the checkpoint.
     * @throws Exception if handling the notification fails.
     */
    void notifyCheckpointComplete(long checkpointId) throws Exception;

    /**
     * Callback for an aborted checkpoint. The default implementation does nothing.
     *
     * @param checkpointId id of the checkpoint.
     * @throws Exception if handling the notification fails.
     */
    default void notifyCheckpointAborted(long checkpointId) throws Exception {}
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/AbstractSchema.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.table.catalog;

import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
import org.apache.seatunnel.api.table.type.SeaTunnelRowType;

import lombok.AccessLevel;
import lombok.Data;
import lombok.Getter;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

/** Represent a physical table schema.
*/ @Data public class AbstractSchema implements Serializable { private static final long serialVersionUID = 1L; protected final List columns; @Getter(AccessLevel.PRIVATE) protected final List columnNames; public AbstractSchema(List columns) { this.columns = columns; this.columnNames = columns.stream().map(Column::getName).collect(Collectors.toList()); } // Lombok requires a no-arg constructor for @Data annotation to work properly private AbstractSchema() { this.columns = new ArrayList<>(); this.columnNames = new ArrayList<>(); } public SeaTunnelRowType toPhysicalRowDataType() { SeaTunnelDataType[] fieldTypes = columns.stream() .filter(Column::isPhysical) .map(Column::getDataType) .toArray(SeaTunnelDataType[]::new); String[] fields = columns.stream() .filter(Column::isPhysical) .map(Column::getName) .toArray(String[]::new); return new SeaTunnelRowType(fields, fieldTypes); } public String[] getFieldNames() { return columnNames.toArray(new String[0]); } public int indexOf(String columnName) { return columnNames.indexOf(columnName); } public Column getColumn(String columnName) { return columns.get(indexOf(columnName)); } public boolean contains(String columnName) { return columnNames.contains(columnName); } public List getColumns() { return Collections.unmodifiableList(columns); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/Catalog.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog; import org.apache.seatunnel.shade.org.apache.commons.lang3.StringUtils; import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.api.options.ConnectorCommonOptions; import org.apache.seatunnel.api.table.catalog.exception.CatalogException; import org.apache.seatunnel.api.table.catalog.exception.DatabaseAlreadyExistException; import org.apache.seatunnel.api.table.catalog.exception.DatabaseNotExistException; import org.apache.seatunnel.api.table.catalog.exception.TableAlreadyExistException; import org.apache.seatunnel.api.table.catalog.exception.TableNotExistException; import org.apache.seatunnel.api.table.factory.Factory; import org.apache.seatunnel.common.exception.CommonError; import org.apache.seatunnel.common.exception.CommonErrorCode; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.function.Function; import java.util.regex.Pattern; import java.util.stream.Collectors; /** * Interface for reading and writing table metadata from SeaTunnel. Each connector need to contain * the implementation of Catalog. */ public interface Catalog extends AutoCloseable { default Optional getFactory() { return Optional.empty(); } /** * Open the catalog. Used for any required preparation in initialization phase. 
* * @throws CatalogException in case of any runtime exception */ void open() throws CatalogException; /** * Close the catalog when it is no longer needed and release any resource that it might be * holding. * * @throws CatalogException in case of any runtime exception */ void close() throws CatalogException; /** Get the name of the catalog. */ String name(); // -------------------------------------------------------------------------------------------- // database // -------------------------------------------------------------------------------------------- /** * Get the name of the default database for this catalog. The default database will be the * current database for the catalog when user's session doesn't specify a current database. The * value probably comes from configuration, will not change for the life time of the catalog * instance. * * @return the name of the current database * @throws CatalogException in case of any runtime exception */ String getDefaultDatabase() throws CatalogException; /** * Check if a database exists in this catalog. * * @param databaseName Name of the database * @return true if the given database exists in the catalog false otherwise * @throws CatalogException in case of any runtime exception */ boolean databaseExists(String databaseName) throws CatalogException; /** * Get the names of all databases in this catalog. * * @return a list of the names of all databases * @throws CatalogException in case of any runtime exception */ List listDatabases() throws CatalogException; // -------------------------------------------------------------------------------------------- // table // -------------------------------------------------------------------------------------------- /** * Get names of all tables under this database. An empty list is returned if none exists. 
* * @return a list of the names of all tables in this database * @throws CatalogException in case of any runtime exception */ List listTables(String databaseName) throws CatalogException, DatabaseNotExistException; /** * Check if a table exist in this catalog. * * @param tablePath Path of the table * @return true if the given table exists in the catalog false otherwise * @throws CatalogException in case of any runtime exception */ boolean tableExists(TablePath tablePath) throws CatalogException; /** * Return a {@link CatalogTable} identified by the given {@link TablePath}. The framework will * resolve the metadata objects when necessary. * * @param tablePath Path of the table * @return The requested table * @throws CatalogException in case of any runtime exception */ CatalogTable getTable(TablePath tablePath) throws CatalogException, TableNotExistException; /** * Return a {@link CatalogTable} identified by the given {@link TablePath} and field names. The * framework will resolve the metadata objects when necessary. 
*
     * @param tablePath Path of the table
     * @param fieldNames The field names need read
     * @return The requested table
     * @throws CatalogException in case of any runtime exception
     */
    default CatalogTable getTable(TablePath tablePath, List fieldNames)
            throws CatalogException, TableNotExistException {
        // Default implementation: looking a table up restricted to a set of field names is
        // reported as an unsupported operation of this catalog.
        throw CommonError.unsupportedOperation(
                name(),
                "get table with tablePath " + tablePath + ", fieldNames: " + fieldNames);
    }

    /**
     * Resolve the catalog tables selected by the given connector config.
     *
     * <p>When {@code ConnectorCommonOptions.TABLE_NAMES} is set and non-empty, every name is
     * converted with {@code TablePath.of}, paths for which {@code tableExists} returns false are
     * dropped, and the remaining tables are loaded. Otherwise {@code TABLE_PATTERN} is used: a
     * blank pattern yields an empty list; else every database whose name fully matches {@code
     * DATABASE_PATTERN} is scanned and each table whose {@code databaseName + "." +
     * schemaAndTableName} fully matches the table pattern is loaded.
     *
     * @param config connector config carrying the table name / pattern options
     * @return the selected catalog tables, possibly empty
     * @throws CatalogException in case of any runtime exception
     */
    default List getTables(ReadonlyConfig config) throws CatalogException {
        // Explicitly listed tables take precedence over the pattern options.
        List tableNames = config.get(ConnectorCommonOptions.TABLE_NAMES);
        if (tableNames != null && !tableNames.isEmpty()) {
            Iterator tablePaths =
                    tableNames.stream().map(TablePath::of).filter(this::tableExists).iterator();
            return buildCatalogTablesWithErrorCheck(tablePaths);
        }

        // No explicit list: fall back to the database / table regular expressions.
        String tablePatternStr = config.get(ConnectorCommonOptions.TABLE_PATTERN);
        if (StringUtils.isBlank(tablePatternStr)) {
            return Collections.emptyList();
        }
        Pattern databasePattern =
                Pattern.compile(config.get(ConnectorCommonOptions.DATABASE_PATTERN));
        Pattern tablePattern = Pattern.compile(config.get(ConnectorCommonOptions.TABLE_PATTERN));
        List allDatabase = this.listDatabases();
        // NOTE(review): this filters the list returned by listDatabases() in place, so it relies
        // on implementations returning a modifiable list — confirm.
        allDatabase.removeIf(s -> !databasePattern.matcher(s).matches());
        List tablePaths = new ArrayList<>();
        for (String databaseName : allDatabase) {
            List paths = this.listTablePaths(databaseName);
            tablePaths.addAll(
                    paths.stream()
                            .filter(
                                    path ->
                                            tablePattern
                                                    .matcher(
                                                            path.getDatabaseName()
                                                                    + "."
                                                                    + path.getSchemaAndTableName())
                                                    .matches())
                            .collect(Collectors.toList()));
        }
        return buildCatalogTablesWithErrorCheck(tablePaths.iterator());
    }

    /**
     * List the tables of a database as {@link TablePath}s.
     *
     * <p>Each name returned by {@code listTables(databaseName)} is split on {@code '.'}. When the
     * split yields more than one part, the first two parts are passed to {@code TablePath.of}
     * after the database name (presumably as schema and table name); otherwise {@code null} and
     * the whole name are passed.
     *
     * @param databaseName name of the database to list
     * @return one table path per listed table name
     * @throws CatalogException in case of any runtime exception
     * @throws DatabaseNotExistException declared for the case that the database does not exist
     */
    default List listTablePaths(String databaseName)
            throws CatalogException, DatabaseNotExistException {
        List tableNames = listTables(databaseName);
        return tableNames.stream()
                .map(
                        tableName -> {
                            String[] parts = tableName.split("\\.");
                            if (parts.length > 1) {
                                // Parts beyond the second one are not used.
                                return TablePath.of(databaseName, parts[0], parts[1]);
                            } else {
                                return TablePath.of(databaseName, null, tableName);
                            }
                        })
                .collect(Collectors.toList());
    }

    /**
     * Load every table of the iterator with {@code getTable}, collecting "unsupported type"
     * failures instead of stopping at the first one.
     *
     * <p>A {@link SeaTunnelRuntimeException} whose error code equals {@code
     * GET_CATALOG_TABLE_WITH_UNSUPPORTED_TYPE_ERROR} is recorded under its {@code "tableName"}
     * param together with its {@code "fieldWithDataTypes"} param; any other {@link
     * SeaTunnelRuntimeException} is rethrown immediately. If anything was recorded, a single
     * error built by {@code CommonError.getCatalogTablesWithUnsupportedType} is thrown after all
     * paths have been visited.
     *
     * @param tablePaths paths of the tables to load
     * @return the loaded tables, in iteration order
     */
    default List buildCatalogTablesWithErrorCheck(Iterator tablePaths) {
        // NOTE(review): the type arguments of this declaration look truncated in this listing —
        // confirm against the original file.
        Map> unsupportedTable = new LinkedHashMap<>();
        List catalogTables = new ArrayList<>();
        while (tablePaths.hasNext()) {
            try {
                catalogTables.add(getTable(tablePaths.next()));
            } catch (SeaTunnelRuntimeException e) {
                if (e.getSeaTunnelErrorCode()
                        .equals(CommonErrorCode.GET_CATALOG_TABLE_WITH_UNSUPPORTED_TYPE_ERROR)) {
                    unsupportedTable.put(
                            e.getParams().get("tableName"),
                            e.getParamsValueAsMap("fieldWithDataTypes"));
                } else {
                    throw e;
                }
            }
        }
        if (!unsupportedTable.isEmpty()) {
            throw CommonError.getCatalogTablesWithUnsupportedType(name(), unsupportedTable);
        }
        return catalogTables;
    }

    /**
     * Add one column per key to the schema builder, collecting type-conversion failures instead
     * of stopping at the first one.
     *
     * <p>A {@link SeaTunnelRuntimeException} whose error code equals {@code
     * CONVERT_TO_SEATUNNEL_TYPE_ERROR_SIMPLE} is recorded as {@code "field" -> "dataType"} (both
     * read from the exception params); any other {@link SeaTunnelRuntimeException} is rethrown
     * immediately. If anything was recorded, a single error built by {@code
     * CommonError.getCatalogTableWithUnsupportedType} is thrown after all keys were visited.
     *
     * @param tablePath path of the table being built, used in the error
     * @param builder schema builder receiving the converted columns
     * @param keys keys to convert, one per column
     * @param getColumn converts a key into the column handed to {@code builder.column}
     */
    default void buildColumnsWithErrorCheck(
            TablePath tablePath,
            TableSchema.Builder builder,
            Iterator keys,
            Function getColumn) {
        Map unsupported = new LinkedHashMap<>();
        while (keys.hasNext()) {
            try {
                builder.column(getColumn.apply(keys.next()));
            } catch (SeaTunnelRuntimeException e) {
                if (e.getSeaTunnelErrorCode()
                        .equals(CommonErrorCode.CONVERT_TO_SEATUNNEL_TYPE_ERROR_SIMPLE)) {
                    unsupported.put(e.getParams().get("field"), e.getParams().get("dataType"));
                } else {
                    throw e;
                }
            }
        }
        if (!unsupported.isEmpty()) {
            throw CommonError.getCatalogTableWithUnsupportedType(
                    name(), tablePath.getFullName(), unsupported);
        }
    }

    /**
     * Create a new table in this catalog.
     *
     * @param tablePath Path of the table
     * @param table The table definition
     * @param ignoreIfExists Flag to specify behavior when a table with the given name already
     *     exists
     * @throws TableAlreadyExistException thrown if the table already exists in the catalog and
     *     ignoreIfExists is false
     * @throws DatabaseNotExistException thrown if the database in tablePath doesn't exist in the
     *     catalog
     * @throws CatalogException in case of any runtime exception
     */
    void createTable(TablePath tablePath, CatalogTable table, boolean ignoreIfExists)
            throws TableAlreadyExistException, DatabaseNotExistException, CatalogException;

    /**
     * Create a new table in this catalog.
     *
     * <p>The default implementation ignores {@code createIndex} and delegates to {@link
     * #createTable(TablePath, CatalogTable, boolean)}.
     *
     * @param tablePath Path of the table
     * @param table The table definition
     * @param ignoreIfExists Flag to specify behavior when a table with the given name already
     *     exists
     * @param createIndex If you want to create index or not
     * @throws TableAlreadyExistException thrown if the table already exists in the catalog and
     *     ignoreIfExists is false
     * @throws DatabaseNotExistException thrown if the database in tablePath doesn't exist in the
     *     catalog
     * @throws CatalogException in case of any runtime exception
     */
    default void createTable(
            TablePath tablePath, CatalogTable table, boolean ignoreIfExists, boolean createIndex)
            throws TableAlreadyExistException, DatabaseNotExistException, CatalogException {
        createTable(tablePath, table, ignoreIfExists);
    }

    /**
     * Drop an existing table in this catalog.
     *
     * @param tablePath Path of the table
     * @param ignoreIfNotExists Flag to specify behavior when a table with the given name doesn't
     *     exist
     * @throws TableNotExistException thrown if the table doesn't exist in the catalog and
     *     ignoreIfNotExists is false
     * @throws CatalogException in case of any runtime exception
     */
    void dropTable(TablePath tablePath, boolean ignoreIfNotExists)
            throws TableNotExistException, CatalogException;

    /**
     * Create a database in this catalog.
     *
     * @param tablePath path whose database part names the database (presumably; the other parts
     *     are implementation specific — confirm)
     * @param ignoreIfExists Flag to specify behavior when the database already exists
     * @throws DatabaseAlreadyExistException declared for the case that the database already exists
     * @throws CatalogException in case of any runtime exception
     */
    void createDatabase(TablePath tablePath, boolean ignoreIfExists)
            throws DatabaseAlreadyExistException, CatalogException;

    /**
     * Drop a database of this catalog.
     *
     * @param tablePath path whose database part names the database (presumably — confirm)
     * @param ignoreIfNotExists Flag to specify behavior when the database doesn't exist
     * @throws DatabaseNotExistException declared for the case that the database doesn't exist
     * @throws CatalogException in case of any runtime exception
     */
    void dropDatabase(TablePath tablePath, boolean ignoreIfNotExists)
            throws DatabaseNotExistException, CatalogException;

    /**
     * Truncate an existing table data in this catalog.
     *
     * <p>The default implementation does nothing.
     *
     * @param tablePath Path of the table
     * @param ignoreIfNotExists Flag to specify behavior when a table with the given name doesn't
     *     exist
     * @throws TableNotExistException thrown if the table doesn't exist in the catalog and
     *     ignoreIfNotExists is false
     * @throws CatalogException in case of any runtime exception
     */
    default void truncateTable(TablePath tablePath, boolean ignoreIfNotExists)
            throws TableNotExistException, CatalogException {}

    /**
     * Tell whether the table holds data. The default implementation always returns {@code false}.
     *
     * @param tablePath Path of the table
     * @return {@code false} unless overridden
     */
    default boolean isExistsData(TablePath tablePath) {
        return false;
    }

    /**
     * Execute a SQL statement for the given table. The default implementation does nothing.
     *
     * @param tablePath Path of the table
     * @param sql the statement to execute
     */
    default void executeSql(TablePath tablePath, String sql) {}

    /**
     * Preview what the given action would do. The default implementation always throws.
     *
     * @param actionType the action to preview
     * @param tablePath Path of the table
     * @param catalogTable optional table definition for the action
     * @return a preview of the action
     * @throws UnsupportedOperationException always, unless overridden
     */
    default PreviewResult previewAction(
            ActionType actionType, TablePath tablePath, Optional catalogTable) {
        throw new UnsupportedOperationException("Preview action is not supported");
    }

    /** Kinds of catalog actions that can be passed to {@code previewAction}. */
    enum ActionType {
        CREATE_TABLE,
        CREATE_DATABASE,
        DROP_TABLE,
        DROP_DATABASE,
        TRUNCATE_TABLE
    }

    // todo: Support for update table metadata
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/CatalogTable.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; /** Represent the table metadata in SeaTunnel. */ public final class CatalogTable implements Serializable { private static final long serialVersionUID = 1L; /** Used to identify the table. */ private final TableIdentifier tableId; /** The table schema metadata. 
*/ private final TableSchema tableSchema; private final Map options; private final List partitionKeys; private final MetadataSchema metadata; private final String comment; private final String catalogName; public static CatalogTable of(TableIdentifier tableId, CatalogTable catalogTable) { CatalogTable newTable = catalogTable.copy(); return new CatalogTable( tableId, newTable.getTableSchema(), newTable.getOptions(), newTable.getPartitionKeys(), newTable.getComment(), newTable.getCatalogName(), newTable.getMetadataSchema()); } public static CatalogTable of( TableIdentifier tableId, TableSchema tableSchema, Map options, List partitionKeys, String comment) { return new CatalogTable( tableId, tableSchema, options, partitionKeys, comment, tableId.getCatalogName(), MetadataSchema.builder().build()); } public static CatalogTable of( TableIdentifier tableId, TableSchema tableSchema, Map options, List partitionKeys, String comment, String catalogName) { return new CatalogTable( tableId, tableSchema, options, partitionKeys, comment, catalogName, MetadataSchema.builder().build()); } public static CatalogTable of( TableIdentifier tableId, TableSchema tableSchema, Map options, List partitionKeys, String comment, String catalogName, MetadataSchema metadata) { return new CatalogTable( tableId, tableSchema, options, partitionKeys, comment, catalogName, metadata); } public static CatalogTable withMetadata(CatalogTable catalogTable, MetadataSchema metadata) { return new CatalogTable( catalogTable.getTableId(), catalogTable.getTableSchema(), catalogTable.getOptions(), catalogTable.getPartitionKeys(), catalogTable.getComment(), catalogTable.getCatalogName(), metadata); } private CatalogTable( TableIdentifier tableId, TableSchema tableSchema, Map options, List partitionKeys, String comment, String catalogName, MetadataSchema metadata) { this.tableId = tableId; this.tableSchema = tableSchema; // Make sure the options and partitionKeys are mutable this.options = new HashMap<>(options); 
this.partitionKeys = new ArrayList<>(partitionKeys); this.comment = comment; this.catalogName = catalogName; this.metadata = metadata; } public CatalogTable copy() { return new CatalogTable( tableId.copy(), tableSchema.copy(), new HashMap<>(options), new ArrayList<>(partitionKeys), comment, catalogName, metadata); } public TableIdentifier getTableId() { return tableId; } public TablePath getTablePath() { return tableId.toTablePath(); } public TableSchema getTableSchema() { return tableSchema; } public SeaTunnelRowType getSeaTunnelRowType() { return tableSchema.toPhysicalRowDataType(); } public Map getOptions() { return options; } public List getPartitionKeys() { return partitionKeys; } public String getComment() { return comment; } public String getCatalogName() { return catalogName; } public MetadataSchema getMetadataSchema() { return metadata; } @Override public String toString() { return "CatalogTable{" + "tableId=" + tableId + ", tableSchema=" + tableSchema + ", options=" + options + ", partitionKeys=" + partitionKeys + ", comment='" + comment + '\'' + ", catalogName='" + catalogName + '\'' + ", metadata=" + metadata + '}'; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/CatalogTableUtil.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.table.catalog;

import org.apache.seatunnel.shade.com.typesafe.config.Config;
import org.apache.seatunnel.shade.org.apache.commons.lang3.StringUtils;

import org.apache.seatunnel.api.configuration.ReadonlyConfig;
import org.apache.seatunnel.api.options.ConnectorCommonOptions;
import org.apache.seatunnel.api.table.catalog.schema.ReadonlyConfigParser;
import org.apache.seatunnel.api.table.factory.FactoryUtil;
import org.apache.seatunnel.api.table.type.BasicType;
import org.apache.seatunnel.api.table.type.MultipleRowType;
import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
import org.apache.seatunnel.api.table.type.SeaTunnelRow;
import org.apache.seatunnel.api.table.type.SeaTunnelRowType;
import org.apache.seatunnel.common.utils.SeaTunnelException;

import lombok.extern.slf4j.Slf4j;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * Static helpers for constructing {@link CatalogTable}s from row types, connector configs and
 * catalogs.
 *
 * <p>NOTE(review): several declarations below carry no generic type arguments in this listing;
 * they look stripped rather than intentionally raw — confirm against the original file.
 */
@Slf4j
public class CatalogTableUtil implements Serializable {

    /** Single-column row type ({@code content}: string) returned by buildSimpleTextSchema(). */
    private static final SeaTunnelRowType SIMPLE_SCHEMA =
            new SeaTunnelRowType(
                    new String[] {"content"}, new SeaTunnelDataType[] {BasicType.STRING_TYPE});

    /**
     * Build a catalog table from a row type, using the fixed catalog name {@code "schema"}, the
     * database name {@code "default"} and no schema name.
     *
     * @param tableName name of the table
     * @param rowType row type providing the columns
     * @return the catalog table
     */
    @Deprecated
    public static CatalogTable getCatalogTable(String tableName, SeaTunnelRowType rowType) {
        return getCatalogTable("schema", "default", null, tableName, rowType);
    }

    /**
     * Build a catalog table that only carries column information taken from a row type.
     *
     * <p>One {@link PhysicalColumn} is created per field of {@code rowType}, each with the fixed
     * extra arguments {@code 0, true, null, null}. The resulting table has empty options, no
     * partition keys and a fixed comment.
     *
     * @param catalog catalog name of the identifier
     * @param database database name of the identifier
     * @param schema schema name of the identifier, may be null
     * @param tableName table name of the identifier
     * @param rowType row type providing field names and types
     * @return the catalog table
     */
    public static CatalogTable getCatalogTable(
            String catalog,
            String database,
            String schema,
            String tableName,
            SeaTunnelRowType rowType) {
        TableSchema.Builder schemaBuilder = TableSchema.builder();
        for (int i = 0; i < rowType.getTotalFields(); i++) {
            PhysicalColumn column =
                    PhysicalColumn.of(
                            rowType.getFieldName(i), rowType.getFieldType(i), 0, true, null, null);
            schemaBuilder.column(column);
        }
        return CatalogTable.of(
                TableIdentifier.of(catalog, database, schema, tableName),
                schemaBuilder.build(),
                new HashMap<>(),
                new ArrayList<>(),
                "It is converted from RowType and only has column information.");
    }

    /**
     * Get catalog table from config, if schema is specified, return a catalog table with specified
     * schema, otherwise, return a catalog table with schema from catalog.
     *
     * <p>The factory id is the configured plugin name with every {@code "-CDC"} occurrence
     * removed.
     *
     * @deprecated DO NOT invoke it in any new TableSourceFactory/TableSinkFactory, please directly
     *     use TableSourceFactory/TableSinkFactory instance to get CatalogTable. We just use it to
     *     transition the old CatalogTable creation logic. Details please check
     */
    @Deprecated
    public static List getCatalogTables(
            ReadonlyConfig readonlyConfig, ClassLoader classLoader) {
        // We use plugin_name as factoryId, so MySQL-CDC should be MySQL
        String factoryId =
                readonlyConfig.get(ConnectorCommonOptions.PLUGIN_NAME).replace("-CDC", "");
        return getCatalogTables(factoryId, readonlyConfig, classLoader);
    }

    /**
     * Get the catalog tables for a connector config.
     *
     * <p>If the {@code schema} option is present, a single table is built from it with {@code
     * factoryId} as catalog name (an empty schema map is rejected). Otherwise a catalog is
     * created for {@code factoryId}, opened, asked for its tables and closed again.
     *
     * @param factoryId factory identifier used to look up the catalog
     * @param readonlyConfig connector config
     * @param classLoader class loader handed to the catalog lookup
     * @return the resolved catalog tables, never empty
     * @throws SeaTunnelException if the schema map is empty, no catalog is found for {@code
     *     factoryId}, or the catalog returns no tables
     */
    @Deprecated
    public static List getCatalogTables(
            String factoryId, ReadonlyConfig readonlyConfig, ClassLoader classLoader) {
        // Highest priority: specified schema
        Map schemaMap = readonlyConfig.get(ConnectorCommonOptions.SCHEMA);
        if (schemaMap != null) {
            if (schemaMap.isEmpty()) {
                throw new SeaTunnelException("Schema config can not be empty");
            }
            CatalogTable catalogTable = CatalogTableUtil.buildWithConfig(factoryId, readonlyConfig);
            return Collections.singletonList(catalogTable);
        }

        Optional optionalCatalog =
                FactoryUtil.createOptionalCatalog(
                        factoryId, readonlyConfig, classLoader, factoryId);
        return optionalCatalog
                .map(
                        c -> {
                            // try-with-resources closes the catalog once the tables are read.
                            try (Catalog catalog = c) {
                                long startTime = System.currentTimeMillis();
                                catalog.open();
                                List catalogTables = catalog.getTables(readonlyConfig);
                                log.info(
                                        String.format(
                                                "Get catalog tables, cost time: %d ms",
                                                System.currentTimeMillis() - startTime));
                                if (catalogTables.isEmpty()) {
                                    throw new SeaTunnelException(
                                            String.format(
                                                    "Can not find catalog table with factoryId [%s]",
                                                    factoryId));
                                }
                                return catalogTables;
                            }
                        })
                .orElseThrow(
                        () ->
                                new SeaTunnelException(
                                        String.format(
                                                "Can not find catalog with factoryId [%s]",
                                                factoryId)));
    }

    /**
     * Build a catalog table from a raw {@link Config}; the catalog name of the result is the
     * empty string.
     *
     * @param config config containing the {@code schema} option
     * @return the catalog table
     */
    public static CatalogTable buildWithConfig(Config config) {
        ReadonlyConfig readonlyConfig = ReadonlyConfig.fromConfig(config);
        return buildWithConfig(readonlyConfig);
    }

    /**
     * Convert catalog tables into one data type: the physical row type of the only table when
     * the list has exactly one element, otherwise a {@link MultipleRowType} covering all tables.
     *
     * @param catalogTables tables to convert
     * @return the resulting data type
     */
    public static SeaTunnelDataType convertToDataType(
            List catalogTables) {
        if (catalogTables.size() == 1) {
            return catalogTables.get(0).getTableSchema().toPhysicalRowDataType();
        } else {
            return convertToMultipleRowType(catalogTables);
        }
    }

    /**
     * Build a {@link MultipleRowType} that maps the string form of every table path to the
     * physical row type of that table.
     */
    @Deprecated
    private static MultipleRowType convertToMultipleRowType(List catalogTables) {
        Map rowTypeMap = new HashMap<>();
        for (CatalogTable catalogTable : catalogTables) {
            String tableId = catalogTable.getTableId().toTablePath().toString();
            rowTypeMap.put(tableId, catalogTable.getTableSchema().toPhysicalRowDataType());
        }
        return new MultipleRowType(rowTypeMap);
    }

    // We need to use buildWithConfig(String catalogName, ReadonlyConfig readonlyConfig);
    // Since this method will not inject the correct catalogName into CatalogTable
    /**
     * Convert a data type back into catalog tables: one table per table id for a {@link
     * MultipleRowType}, otherwise a single table named {@code tableId} built from the data type
     * cast to {@link SeaTunnelRowType}.
     *
     * @param seaTunnelDataType a {@link MultipleRowType} or a {@link SeaTunnelRowType}
     * @param tableId table name used when the data type is not a {@link MultipleRowType}
     * @return the catalog tables
     */
    @Deprecated
    public static List convertDataTypeToCatalogTables(
            SeaTunnelDataType seaTunnelDataType, String tableId) {
        List catalogTables;
        if (seaTunnelDataType instanceof MultipleRowType) {
            catalogTables = new ArrayList<>();
            for (String id : ((MultipleRowType) seaTunnelDataType).getTableIds()) {
                catalogTables.add(
                        CatalogTableUtil.getCatalogTable(
                                id, ((MultipleRowType) seaTunnelDataType).getRowType(id)));
            }
        } else {
            catalogTables =
                    Collections.singletonList(
                            CatalogTableUtil.getCatalogTable(
                                    tableId, (SeaTunnelRowType) seaTunnelDataType));
        }
        return catalogTables;
    }

    /**
     * Build a catalog table from the {@code schema} option of the config, with the empty string
     * as catalog name.
     *
     * @param readonlyConfig config containing the {@code schema} option
     * @return the catalog table
     */
    public static CatalogTable buildWithConfig(ReadonlyConfig readonlyConfig) {
        return buildWithConfig("", readonlyConfig);
    }

    /**
     * Build a catalog table from the {@code schema} option of the config.
     *
     * <p>The table path comes from the {@code TABLE} option inside the schema config (combined
     * with {@code SCHEMA_FIRST}) when it is not empty; otherwise from the {@code PLUGIN_OUTPUT}
     * option of the outer config, falling back to {@code TablePath.DEFAULT}. Partition keys
     * default to an empty list and the table options are always empty.
     *
     * @param catalogName catalog name put into the table identifier
     * @param readonlyConfig config containing the {@code schema} option
     * @return the catalog table
     * @throws RuntimeException if the {@code schema} option is absent
     */
    public static CatalogTable buildWithConfig(String catalogName, ReadonlyConfig readonlyConfig) {
        if (readonlyConfig.get(ConnectorCommonOptions.SCHEMA) == null) {
            throw new RuntimeException(
                    "Schema config need option [schema], please correct your config first");
        }
        TableSchema tableSchema = new ReadonlyConfigParser().parse(readonlyConfig);

        // Re-wrap the schema map so its nested options can be read with typed accessors.
        ReadonlyConfig schemaConfig =
                readonlyConfig
                        .getOptional(ConnectorCommonOptions.SCHEMA)
                        .map(ReadonlyConfig::fromMap)
                        .orElseThrow(
                                () -> new IllegalArgumentException("Schema config can't be null"));

        TablePath tablePath;
        if (StringUtils.isNotEmpty(schemaConfig.get(ConnectorCommonOptions.TABLE))) {
            tablePath =
                    TablePath.of(
                            schemaConfig.get(ConnectorCommonOptions.TABLE),
                            schemaConfig.get(ConnectorCommonOptions.SCHEMA_FIRST));
        } else {
            Optional pluginOutputIdentifierOptional =
                    readonlyConfig.getOptional(ConnectorCommonOptions.PLUGIN_OUTPUT);
            tablePath =
                    pluginOutputIdentifierOptional.map(TablePath::of).orElse(TablePath.DEFAULT);
        }

        List partitionKeys =
                schemaConfig
                        .getOptional(ConnectorCommonOptions.PARTITION_KEYS)
                        .orElseGet(Collections::emptyList);

        return CatalogTable.of(
                TableIdentifier.of(catalogName, tablePath),
                tableSchema,
                new HashMap<>(),
                partitionKeys,
                readonlyConfig.get(ConnectorCommonOptions.TABLE_COMMENT));
    }

    /** Returns the shared single-column ({@code content}: string) row type. */
    public static SeaTunnelRowType buildSimpleTextSchema() {
        return SIMPLE_SCHEMA;
    }

    /** Returns a catalog table named {@code "default"} that uses the simple text schema. */
    public static CatalogTable buildSimpleTextTable() {
        return getCatalogTable("default", buildSimpleTextSchema());
    }

    /**
     * Derive a new catalog table whose columns follow the given row type.
     *
     * <p>For every field of {@code seaTunnelRowType}, the column of the same name from the
     * original schema is reused when present; otherwise a new {@link PhysicalColumn} is created
     * from the field name and type. Primary key, constraint keys, identifier, options, partition
     * keys, comment and catalog name are taken from {@code catalogTable}.
     *
     * <p>NOTE(review): the table is created through the {@code CatalogTable.of} overload without
     * a metadata argument, so the metadata schema of {@code catalogTable} is not carried over —
     * confirm that this is intended.
     *
     * @param catalogTable table providing everything but the column list
     * @param seaTunnelRowType row type defining the columns of the result and their order
     * @return the derived catalog table
     */
    public static CatalogTable newCatalogTable(
            CatalogTable catalogTable, SeaTunnelRowType seaTunnelRowType) {
        TableSchema tableSchema = catalogTable.getTableSchema();

        // Index the existing columns by name; Collectors.toMap rejects duplicate names.
        Map columnMap =
                tableSchema.getColumns().stream()
                        .collect(Collectors.toMap(Column::getName, Function.identity()));
        String[] fieldNames = seaTunnelRowType.getFieldNames();
        SeaTunnelDataType[] fieldTypes = seaTunnelRowType.getFieldTypes();

        List finalColumns = new ArrayList<>();
        for (int i = 0; i < fieldNames.length; i++) {
            Column column = columnMap.get(fieldNames[i]);
            if (column != null) {
                finalColumns.add(column);
            } else {
                finalColumns.add(
                        PhysicalColumn.of(fieldNames[i], fieldTypes[i], 0, true, null, null));
            }
        }

        TableSchema finalSchema =
                TableSchema.builder()
                        .columns(finalColumns)
                        .primaryKey(tableSchema.getPrimaryKey())
                        .constraintKey(tableSchema.getConstraintKeys())
                        .build();

        return CatalogTable.of(
                catalogTable.getTableId(),
                finalSchema,
                catalogTable.getOptions(),
                catalogTable.getPartitionKeys(),
                catalogTable.getComment(),
                catalogTable.getCatalogName());
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/Column.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license
agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.table.catalog;

import org.apache.seatunnel.api.table.type.SeaTunnelDataType;

import lombok.AllArgsConstructor;
import lombok.Data;

import java.io.Serializable;
import java.util.Map;

/**
 * Represent the column of {@link TableSchema}.
 *
 * @see PhysicalColumn
 * @see MetadataColumn
 */
@Data
@AllArgsConstructor
@SuppressWarnings("PMD.AbstractClassShouldStartWithAbstractNamingRule")
public abstract class Column implements Serializable {

    private static final long serialVersionUID = -1L;

    /** Column name. */
    protected final String name;

    /** Data type of the column. */
    // todo: use generic type
    protected final SeaTunnelDataType dataType;

    /**
     * Designated column's specified column size.
     *
     * <p>For numeric data, this is the maximum precision. For character/binary data, this is the
     * length in bytes.
     *
     * <p>Null is returned for data types where the column size is not applicable.
     */
    protected final Long columnLength;

    /**
     * Number of digits to right of the decimal point.
     *
     * <p>For decimal data, this is the maximum scale. For time/timestamp data, this is the maximum
     * allowed precision of the fractional seconds component. For vector data, this is the vector
     * dimension.
     *
     * <p>Null is returned for data types where the scale is not applicable.
     */
    protected final Integer scale;

    /** Whether the column can be null. */
    protected final boolean nullable;

    // todo: use generic type
    /** The default value of the column. */
    protected final Object defaultValue;

    /** Comment of the column. */
    protected final String comment;

    /**
     * Field type in the database. For example: varchar is varchar(50), DECIMAL is DECIMAL(20,5),
     * int is int. Each database can customize the sourceType according to its own
     * characteristics.
     */
    protected final String sourceType;

    /**
     * The data type used to store the target database, typically specified in transform or sink
     * scenarios.
     */
    protected String sinkType;

    /** Extra options attached to the column. */
    protected final Map options;

    // TODO Waiting for migration to complete before remove
    @Deprecated protected boolean isUnsigned;
    @Deprecated protected boolean isZeroFill;
    @Deprecated protected Long bitLen;
    @Deprecated protected Long longColumnLength;

    /**
     * Creates a nullable column without default value, comment, source type or options.
     *
     * @param name column name
     * @param dataType data type of the column
     * @param columnLength column size, may be null
     * @param scale scale, may be null
     */
    protected Column(String name, SeaTunnelDataType dataType, Long columnLength, Integer scale) {
        this(name, dataType, columnLength, scale, true, null, null, null, null);
    }

    /**
     * Creates a column without scale, source type or options.
     *
     * @param name column name
     * @param dataType data type of the column
     * @param columnLength column size, may be null
     * @param nullable whether the column can be null
     * @param defaultValue default value of the column
     * @param comment comment of the column
     */
    protected Column(
            String name,
            SeaTunnelDataType dataType,
            Long columnLength,
            boolean nullable,
            Object defaultValue,
            String comment) {
        this(name, dataType, columnLength, null, nullable, defaultValue, comment, null, null);
    }

    /**
     * Creates a column including its sink type.
     *
     * <p>The deprecated mirror fields are derived here: {@code bitLen} is {@code columnLength * 8}
     * (or 0 when the length is null), {@code longColumnLength} equals {@code columnLength}, and
     * {@code isUnsigned} / {@code isZeroFill} are false.
     */
    protected Column(
            String name,
            SeaTunnelDataType dataType,
            Long columnLength,
            Integer scale,
            boolean nullable,
            Object defaultValue,
            String comment,
            String sinkType,
            String sourceType,
            Map options) {
        this.name = name;
        this.dataType = dataType;
        this.columnLength = columnLength;
        this.scale = scale;
        this.nullable = nullable;
        this.defaultValue = defaultValue;
        this.comment = comment;
        this.sourceType = sourceType;
        this.sinkType = sinkType;
        this.options = options;
        this.bitLen = columnLength != null ? columnLength * 8 : 0;
        this.longColumnLength = columnLength;
        this.isUnsigned = false;
        this.isZeroFill = false;
    }

    /**
     * Creates a column without assigning a sink type; {@code sinkType} keeps its default value.
     *
     * <p>The deprecated mirror fields are derived exactly as in the constructor that takes a sink
     * type.
     */
    protected Column(
            String name,
            SeaTunnelDataType dataType,
            Long columnLength,
            Integer scale,
            boolean nullable,
            Object defaultValue,
            String comment,
            String sourceType,
            Map options) {
        this.name = name;
        this.dataType = dataType;
        this.columnLength = columnLength;
        this.scale = scale;
        this.nullable = nullable;
        this.defaultValue = defaultValue;
        this.comment = comment;
        this.sourceType = sourceType;
        this.options = options;
        // TODO Waiting for migration to complete before remove
        this.bitLen = columnLength != null ? columnLength * 8 : 0;
        this.longColumnLength = columnLength;
        this.isUnsigned = false;
        this.isZeroFill = false;
    }

    /**
     * Variant taking the column length as {@link Integer}; it is widened to {@link Long} (null
     * stays null) and forwarded to the {@code Long}-based constructor.
     */
    @Deprecated
    protected Column(
            String name,
            SeaTunnelDataType dataType,
            Integer columnLength,
            boolean nullable,
            Object defaultValue,
            String comment) {
        this(
                name,
                dataType,
                columnLength == null ? null : columnLength.longValue(),
                nullable,
                defaultValue,
                comment);
    }

    /**
     * Variant that takes the deprecated mirror fields explicitly instead of deriving them; the
     * scale is set to null and the sink type is not assigned.
     */
    @Deprecated
    protected Column(
            String name,
            SeaTunnelDataType dataType,
            Integer columnLength,
            boolean nullable,
            Object defaultValue,
            String comment,
            String sourceType,
            boolean isUnsigned,
            boolean isZeroFill,
            Long bitLen,
            Long longColumnLength,
            Map options) {
        this.name = name;
        this.dataType = dataType;
        this.columnLength = columnLength == null ? null : columnLength.longValue();
        this.scale = null;
        this.nullable = nullable;
        this.defaultValue = defaultValue;
        this.comment = comment;
        this.sourceType = sourceType;
        this.isUnsigned = isUnsigned;
        this.isZeroFill = isZeroFill;
        this.bitLen = bitLen;
        this.longColumnLength = longColumnLength;
        this.options = options;
    }

    /**
     * Returns whether the given column is a physical column of a table; neither computed nor
     * metadata.
     */
    public abstract boolean isPhysical();

    /** Returns a copy of the column with a replaced {@link SeaTunnelDataType}. */
    public abstract Column copy(SeaTunnelDataType newType);

    /** Returns a copy of the column. */
    public abstract Column copy();

    /** Returns a copy of the column with a replaced name. */
    public abstract Column rename(String newColumnName);

    /** Returns a copy of the column with a replaced sourceType. */
    public abstract Column reSourceType(String sourceType);
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/ConstraintKey.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.table.catalog; import lombok.AllArgsConstructor; import lombok.Data; import java.io.Serializable; import java.util.List; import java.util.stream.Collectors; import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull; @Data public class ConstraintKey implements Serializable { private static final long serialVersionUID = 1L; private final ConstraintType constraintType; private final String constraintName; private final List columnNames; private ConstraintKey( ConstraintType constraintType, String constraintName, List columnNames) { checkNotNull(constraintType, "constraintType must not be null"); this.constraintType = constraintType; this.constraintName = constraintName; this.columnNames = columnNames; } public static ConstraintKey of( ConstraintType constraintType, String constraintName, List columnNames) { return new ConstraintKey(constraintType, constraintName, columnNames); } @Data @AllArgsConstructor public static class ConstraintKeyColumn implements Serializable { private final String columnName; private final ColumnSortType sortType; public static ConstraintKeyColumn of(String columnName, ColumnSortType sortType) { return new ConstraintKeyColumn(columnName, sortType); } public ConstraintKeyColumn copy() { return ConstraintKeyColumn.of(columnName, sortType); } } public enum ConstraintType { INDEX_KEY, UNIQUE_KEY, FOREIGN_KEY, VECTOR_INDEX_KEY } public enum ColumnSortType { ASC, DESC } public ConstraintKey copy() { List collect = columnNames.stream().map(ConstraintKeyColumn::copy).collect(Collectors.toList()); return ConstraintKey.of(constraintType, constraintName, collect); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/DataTypeConvertor.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import java.util.Map; /** * @deprecated instead by {@link org.apache.seatunnel.api.table.converter.TypeConverter} * @param */ @Deprecated public interface DataTypeConvertor { /** * Transfer the data type from connector to SeaTunnel. * * @param field The field name of the column * @param connectorDataType e.g. "int", "varchar(255)" * @return the data type of SeaTunnel */ SeaTunnelDataType toSeaTunnelType(String field, String connectorDataType); /** * Transfer the data type from connector to SeaTunnel. * * @param field The field name of the column * @param connectorDataType origin data type * @param dataTypeProperties origin data type properties, e.g. precision, scale, length * @return SeaTunnel data type */ // todo: If the origin data type contains the properties, we can remove the dataTypeProperties. SeaTunnelDataType toSeaTunnelType( String field, T connectorDataType, Map dataTypeProperties); /** * Transfer the data type from SeaTunnel to connector. * * @param field The field name of the column * @param seaTunnelDataType seaTunnel data type * @param dataTypeProperties seaTunnel data type properties, e.g. 
precision, scale, length * @return origin data type */ // todo: If the SeaTunnel data type contains the properties, we can remove the // dataTypeProperties. T toConnectorType( String field, SeaTunnelDataType seaTunnelDataType, Map dataTypeProperties); String getIdentity(); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/InfoPreviewResult.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog; public class InfoPreviewResult extends PreviewResult { private final String info; public String getInfo() { return info; } public InfoPreviewResult(String info) { super(Type.INFO); this.info = info; } @Override public String toString() { return info; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/MetadataColumn.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. 
The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import lombok.EqualsAndHashCode; import lombok.ToString; /** Representation of a metadata column. */ @EqualsAndHashCode(callSuper = true) @ToString(callSuper = true) public class MetadataColumn extends Column { private static final long serialVersionUID = 1L; protected MetadataColumn( String name, SeaTunnelDataType dataType, Long columnLength, boolean nullable, Object defaultValue, String comment) { super(name, dataType, columnLength, nullable, defaultValue, comment); } public static MetadataColumn of( String name, SeaTunnelDataType dataType, Long columnLength, boolean nullable, Object defaultValue, String comment) { return new MetadataColumn(name, dataType, columnLength, nullable, defaultValue, comment); } @Override public boolean isPhysical() { return false; } @Override public Column copy(SeaTunnelDataType newType) { return MetadataColumn.of(name, newType, columnLength, nullable, defaultValue, comment); } @Override public Column copy() { return MetadataColumn.of(name, dataType, columnLength, nullable, defaultValue, comment); } @Override public Column rename(String newColumnName) { return MetadataColumn.of( newColumnName, dataType, columnLength, nullable, defaultValue, comment); } public PhysicalColumn toPhysicalColumn() { return PhysicalColumn.of( name, dataType, columnLength, scale, nullable, 
defaultValue, comment); } @Override public Column reSourceType(String sourceType) { throw new UnsupportedOperationException("Not implemented"); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/MetadataSchema.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog; import lombok.Data; import lombok.EqualsAndHashCode; import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; /** Represent a physical table schema. 
*/ @EqualsAndHashCode(callSuper = true) @Data public final class MetadataSchema extends AbstractSchema { private static final long serialVersionUID = 1L; public MetadataSchema(List columns) { super(columns); } public static Builder builder() { return new Builder(); } public static final class Builder { private final List columns = new ArrayList<>(); public Builder columns(List columns) { this.columns.addAll(columns); return this; } public Builder column(Column column) { this.columns.add(column); return this; } public MetadataSchema build() { return new MetadataSchema(columns); } } public MetadataSchema copy() { List copyColumns = columns.stream().map(Column::copy).collect(Collectors.toList()); return MetadataSchema.builder().columns(copyColumns).build(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/PhysicalColumn.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.table.catalog; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import lombok.Builder; import lombok.EqualsAndHashCode; import lombok.ToString; import java.util.HashMap; import java.util.Map; /** Representation of a physical column. */ @EqualsAndHashCode(callSuper = true) @ToString(callSuper = true) public class PhysicalColumn extends Column { private static final long serialVersionUID = 1L; protected PhysicalColumn( String name, SeaTunnelDataType dataType, Long columnLength, Integer scale) { super(name, dataType, columnLength, scale); } public PhysicalColumn( String name, SeaTunnelDataType dataType, Long columnLength, Integer scale, boolean nullable, Object defaultValue, String comment, String sinkType, String sourceType, Map options) { super( name, dataType, columnLength, scale, nullable, defaultValue, comment, sinkType, sourceType, options); } protected PhysicalColumn( String name, SeaTunnelDataType dataType, Long columnLength, boolean nullable, Object defaultValue, String comment) { super(name, dataType, columnLength, nullable, defaultValue, comment); } public PhysicalColumn( String name, SeaTunnelDataType dataType, Long columnLength, Integer scale, boolean nullable, Object defaultValue, String comment) { super( name, dataType, columnLength, scale, nullable, defaultValue, comment, null, new HashMap<>()); } public PhysicalColumn( String name, SeaTunnelDataType dataType, Long columnLength, boolean nullable, Object defaultValue, String comment, String sourceType, Map options) { super( name, dataType, columnLength, null, nullable, defaultValue, comment, sourceType, options); } @Builder public PhysicalColumn( String name, SeaTunnelDataType dataType, Long columnLength, Integer scale, boolean nullable, Object defaultValue, String comment, String sourceType, Map options) { super( name, dataType, columnLength, scale, nullable, defaultValue, comment, sourceType, options); } @Deprecated protected PhysicalColumn( String 
name, SeaTunnelDataType dataType, Integer columnLength, boolean nullable, Object defaultValue, String comment) { super(name, dataType, columnLength, nullable, defaultValue, comment); } @Deprecated protected PhysicalColumn( String name, SeaTunnelDataType dataType, Integer columnLength, boolean nullable, Object defaultValue, String comment, String sourceType, boolean isUnsigned, boolean isZeroFill, Long bitLen, Long longColumnLength, Map options) { super( name, dataType, columnLength, nullable, defaultValue, comment, sourceType, isUnsigned, isZeroFill, bitLen, longColumnLength, options); } @Deprecated public PhysicalColumn( String name, SeaTunnelDataType dataType, Long columnLength, Integer scale, boolean nullable, Object defaultValue, String comment, String sourceType, String sinkType, Map options, boolean isUnsigned, boolean isZeroFill, Long bitLen, Long longColumnLength) { super( name, dataType, columnLength, scale, nullable, defaultValue, comment, sourceType, sinkType, options, isUnsigned, isZeroFill, bitLen, longColumnLength); } public static PhysicalColumn of( String name, SeaTunnelDataType dataType, Long columnLength, boolean nullable, Object defaultValue, String comment) { return new PhysicalColumn(name, dataType, columnLength, nullable, defaultValue, comment); } public static PhysicalColumn of( String name, SeaTunnelDataType dataType, Long columnLength, Integer scale, boolean nullable, Object defaultValue, String comment) { return new PhysicalColumn( name, dataType, columnLength, scale, nullable, defaultValue, comment); } public static PhysicalColumn of( String name, SeaTunnelDataType dataType, Long columnLength, boolean nullable, Object defaultValue, String comment, String sourceType, Map options) { return new PhysicalColumn( name, dataType, columnLength, nullable, defaultValue, comment, sourceType, options); } public static PhysicalColumn of( String name, SeaTunnelDataType dataType, Long columnLength, Integer scale, boolean nullable, Object defaultValue, 
String comment, String sourceType, Map options) { return new PhysicalColumn( name, dataType, columnLength, scale, nullable, defaultValue, comment, sourceType, options); } public static PhysicalColumn of( String name, SeaTunnelDataType dataType, Long columnLength, Integer scale, boolean nullable, Object defaultValue, String comment, String sinkType, String sourceType) { return new PhysicalColumn( name, dataType, columnLength, scale, nullable, defaultValue, comment, sinkType, sourceType, null); } @Deprecated public static PhysicalColumn of( String name, SeaTunnelDataType dataType, Integer columnLength, boolean nullable, Object defaultValue, String comment) { return new PhysicalColumn(name, dataType, columnLength, nullable, defaultValue, comment); } @Deprecated public static PhysicalColumn of( String name, SeaTunnelDataType dataType, Integer columnLength, boolean nullable, Object defaultValue, String comment, String sourceType, boolean isUnsigned, boolean isZeroFill, Long bitLen, Map options, Long longColumnLength) { return new PhysicalColumn( name, dataType, columnLength, nullable, defaultValue, comment, sourceType, isUnsigned, isZeroFill, bitLen, longColumnLength, options); } @Override public boolean isPhysical() { return true; } @Override public Column copy(SeaTunnelDataType newType) { return new PhysicalColumn( name, newType, columnLength, scale, nullable, defaultValue, comment, sourceType, sinkType, options, isUnsigned, isZeroFill, bitLen, longColumnLength); } @Override public Column copy() { return new PhysicalColumn( name, dataType, columnLength, scale, nullable, defaultValue, comment, sourceType, sinkType, options, isUnsigned, isZeroFill, bitLen, longColumnLength); } @Override public Column rename(String newColumnName) { return new PhysicalColumn( newColumnName, dataType, columnLength, scale, nullable, defaultValue, comment, sourceType, sinkType, options, isUnsigned, isZeroFill, bitLen, longColumnLength); } @Override public Column reSourceType(String 
newSourceType) { return new PhysicalColumn( name, dataType, columnLength, scale, nullable, defaultValue, comment, newSourceType, sinkType, options, isUnsigned, isZeroFill, bitLen, longColumnLength); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/PreviewResult.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog; /** The result of a SQL preview action in {@link Catalog#previewAction}. */ public abstract class PreviewResult { private final Type type; public PreviewResult(Type type) { this.type = type; } public Type getType() { return type; } public enum Type { SQL, INFO, OTHER } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/PrimaryKey.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog; import lombok.AllArgsConstructor; import lombok.Data; import java.io.Serializable; import java.util.ArrayList; import java.util.List; @Data @AllArgsConstructor public class PrimaryKey implements Serializable { private static final long serialVersionUID = 1L; // This field is not used now private final String primaryKey; private final List columnNames; private Boolean enableAutoId; public PrimaryKey(String primaryKey, List columnNames) { this.primaryKey = primaryKey; this.columnNames = columnNames; this.enableAutoId = null; } public static boolean isPrimaryKeyField(PrimaryKey primaryKey, String fieldName) { if (primaryKey == null || primaryKey.getColumnNames() == null) { return false; } return primaryKey.getColumnNames().contains(fieldName); } public static PrimaryKey of(String primaryKey, List columnNames, Boolean autoId) { return new PrimaryKey(primaryKey, columnNames, autoId); } public static PrimaryKey of(String primaryKey, List columnNames) { return new PrimaryKey(primaryKey, columnNames); } public PrimaryKey copy() { return PrimaryKey.of(primaryKey, new ArrayList<>(columnNames)); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/SQLPreviewResult.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * 
contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog; public class SQLPreviewResult extends PreviewResult { private final String sql; public String getSql() { return sql; } public SQLPreviewResult(String sql) { super(Type.SQL); this.sql = sql; } @Override public String toString() { return sql; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/SeaTunnelDataTypeConvertorUtil.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
 */ package org.apache.seatunnel.api.table.catalog; import org.apache.seatunnel.shade.com.typesafe.config.Config; import org.apache.seatunnel.shade.com.typesafe.config.ConfigFactory; import org.apache.seatunnel.shade.com.typesafe.config.ConfigObject; import org.apache.seatunnel.shade.com.typesafe.config.ConfigValue; import org.apache.seatunnel.api.table.type.ArrayType; import org.apache.seatunnel.api.table.type.BasicType; import org.apache.seatunnel.api.table.type.DecimalType; import org.apache.seatunnel.api.table.type.LocalTimeType; import org.apache.seatunnel.api.table.type.MapType; import org.apache.seatunnel.api.table.type.PrimitiveByteArrayType; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; import org.apache.seatunnel.api.table.type.SqlType; import org.apache.seatunnel.api.table.type.VectorType; import org.apache.seatunnel.common.exception.CommonError; public class SeaTunnelDataTypeConvertorUtil { /** * @param field name of the field being declared; it is passed on to nested parsing and to the unsupported-type error. * @param columnType column type, should be {@link SeaTunnelDataType#toString}. * @return {@link SeaTunnelDataType} instance. 
*/ public static SeaTunnelDataType deserializeSeaTunnelDataType( String field, String columnType) { SqlType sqlType = null; try { String compatible = compatibleTypeDeclare(columnType); sqlType = SqlType.valueOf(compatible.toUpperCase().replace(" ", "")); } catch (IllegalArgumentException e) { // nothing } if (sqlType == null) { return parseComplexDataType(field, columnType); } switch (sqlType) { case STRING: return BasicType.STRING_TYPE; case BOOLEAN: return BasicType.BOOLEAN_TYPE; case TINYINT: return BasicType.BYTE_TYPE; case BYTES: return PrimitiveByteArrayType.INSTANCE; case SMALLINT: return BasicType.SHORT_TYPE; case INT: return BasicType.INT_TYPE; case BIGINT: return BasicType.LONG_TYPE; case FLOAT: return BasicType.FLOAT_TYPE; case DOUBLE: return BasicType.DOUBLE_TYPE; case NULL: return BasicType.VOID_TYPE; case DATE: return LocalTimeType.LOCAL_DATE_TYPE; case TIME: return LocalTimeType.LOCAL_TIME_TYPE; case TIMESTAMP: return LocalTimeType.LOCAL_DATE_TIME_TYPE; case TIMESTAMP_TZ: return LocalTimeType.OFFSET_DATE_TIME_TYPE; case MAP: return parseMapType(field, columnType); case BINARY_VECTOR: return VectorType.VECTOR_BINARY_TYPE; case FLOAT_VECTOR: return VectorType.VECTOR_FLOAT_TYPE; case FLOAT16_VECTOR: return VectorType.VECTOR_FLOAT16_TYPE; case BFLOAT16_VECTOR: return VectorType.VECTOR_BFLOAT16_TYPE; case SPARSE_FLOAT_VECTOR: return VectorType.VECTOR_SPARSE_FLOAT_TYPE; default: throw CommonError.unsupportedDataType("SeaTunnel", columnType, field); } } /** * User-facing data type declarations will adhere to the specifications outlined in * schema-feature.md. To maintain backward compatibility, this function will transform type * declarations into standard form, including: long -> bigint, * short -> smallint, and byte -> tinyint. * *

    In a future version, user-facing data type declarations will strictly follow the * specifications, and this function will be removed. * * @param declare * @return compatible type */ @Deprecated private static String compatibleTypeDeclare(String declare) { switch (declare.trim().toUpperCase()) { case "LONG": return "BIGINT"; case "SHORT": return "SMALLINT"; case "BYTE": return "TINYINT"; default: return declare; } } private static SeaTunnelDataType parseComplexDataType(String field, String columnStr) { String column = columnStr.toUpperCase().replace(" ", ""); if (column.startsWith(SqlType.MAP.name())) { return parseMapType(field, columnStr); } if (column.startsWith(SqlType.ARRAY.name())) { return parseArrayType(field, columnStr); } if (column.startsWith(SqlType.DECIMAL.name())) { return parseDecimalType(columnStr); } if (column.trim().startsWith("{")) { return parseRowType(columnStr); } throw CommonError.unsupportedDataType("SeaTunnel", columnStr, field); } private static SeaTunnelDataType parseRowType(String columnStr) { String confPayload = "{conf = " + columnStr + "}"; Config conf; try { conf = ConfigFactory.parseString(confPayload); } catch (RuntimeException e) { throw new IllegalArgumentException( String.format("HOCON Config parse from %s failed.", confPayload), e); } return parseRowType(conf.getObject("conf")); } private static SeaTunnelDataType parseRowType(ConfigObject conf) { String[] fieldNames = new String[conf.size()]; SeaTunnelDataType[] fieldTypes = new SeaTunnelDataType[conf.size()]; conf.keySet().toArray(fieldNames); for (int idx = 0; idx < fieldNames.length; idx++) { String fieldName = fieldNames[idx]; ConfigValue typeVal = conf.get(fieldName); switch (typeVal.valueType()) { case STRING: { fieldTypes[idx] = deserializeSeaTunnelDataType( fieldNames[idx], (String) typeVal.unwrapped()); } break; case OBJECT: { fieldTypes[idx] = parseRowType((ConfigObject) typeVal); } break; case LIST: case NUMBER: case BOOLEAN: case NULL: default: throw new 
IllegalArgumentException( String.format( "Unsupported parse SeaTunnel Type from '%s'.", typeVal.unwrapped())); } } return new SeaTunnelRowType(fieldNames, fieldTypes); } private static SeaTunnelDataType parseMapType(String field, String columnStr) { String genericType = getGenericType(columnStr).trim(); int index = genericType.toUpperCase().startsWith(SqlType.DECIMAL.name()) ? // if map key is decimal, we should find the index of second ',' genericType.indexOf(",", genericType.indexOf(",") + 1) : // if map key is not decimal, we should find the index of first ',' genericType.indexOf(","); String keyGenericType = genericType.substring(0, index).trim(); String valueGenericType = genericType.substring(index + 1).trim(); return new MapType<>( deserializeSeaTunnelDataType(field, keyGenericType), deserializeSeaTunnelDataType(field, valueGenericType)); } private static String getGenericType(String columnStr) { // get the content between '<' and '>' return columnStr.substring(columnStr.indexOf("<") + 1, columnStr.lastIndexOf(">")); } private static SeaTunnelDataType parseArrayType(String field, String columnStr) { String genericType = getGenericType(columnStr).trim(); SeaTunnelDataType dataType = deserializeSeaTunnelDataType(field, genericType); switch (dataType.getSqlType()) { case STRING: return ArrayType.STRING_ARRAY_TYPE; case BOOLEAN: return ArrayType.BOOLEAN_ARRAY_TYPE; case TINYINT: return ArrayType.BYTE_ARRAY_TYPE; case SMALLINT: return ArrayType.SHORT_ARRAY_TYPE; case INT: return ArrayType.INT_ARRAY_TYPE; case BIGINT: return ArrayType.LONG_ARRAY_TYPE; case FLOAT: return ArrayType.FLOAT_ARRAY_TYPE; case DOUBLE: return ArrayType.DOUBLE_ARRAY_TYPE; case MAP: MapType mapType = (MapType) dataType; return new ArrayType<>(MapType.class, mapType); default: throw CommonError.unsupportedDataType("SeaTunnel", genericType, field); } } private static SeaTunnelDataType parseDecimalType(String columnStr) { String[] decimalInfos = columnStr.split(","); if (decimalInfos.length < 
2) { throw new RuntimeException( "Decimal type should assign precision and scale information"); } int precision = Integer.parseInt(decimalInfos[0].replaceAll("\\D", "")); int scale = Integer.parseInt(decimalInfos[1].replaceAll("\\D", "")); return new DecimalType(precision, scale); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/TableIdentifier.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.table.catalog; import org.apache.seatunnel.shade.org.apache.commons.lang3.StringUtils; import lombok.EqualsAndHashCode; import lombok.Getter; import lombok.NonNull; import java.io.Serializable; @Getter @EqualsAndHashCode public final class TableIdentifier implements Serializable { private static final long serialVersionUID = 1L; private final String catalogName; private final String databaseName; private final String schemaName; @NonNull private final String tableName; public TableIdentifier( String catalogName, String databaseName, String schemaName, @NonNull String tableName) { this.catalogName = catalogName; this.databaseName = databaseName; this.schemaName = schemaName; this.tableName = tableName; if (StringUtils.isEmpty(tableName)) { throw new IllegalArgumentException("tableName cannot be empty"); } } public static TableIdentifier of(String catalogName, String databaseName, String tableName) { return new TableIdentifier(catalogName, databaseName, null, tableName); } public static TableIdentifier of(String catalogName, TablePath tablePath) { return new TableIdentifier( catalogName, tablePath.getDatabaseName(), tablePath.getSchemaName(), tablePath.getTableName()); } public static TableIdentifier of( String catalogName, String databaseName, String schemaName, String tableName) { return new TableIdentifier(catalogName, databaseName, schemaName, tableName); } public TablePath toTablePath() { return TablePath.of(databaseName, schemaName, tableName); } public TableIdentifier copy() { return TableIdentifier.of(catalogName, databaseName, schemaName, tableName); } @Override public String toString() { if (schemaName == null) { return String.join(".", catalogName, databaseName, tableName); } return String.join(".", catalogName, databaseName, schemaName, tableName); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/TablePath.java 
================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog; import org.apache.seatunnel.shade.org.apache.commons.lang3.StringUtils; import lombok.EqualsAndHashCode; import lombok.Getter; import lombok.NonNull; import java.io.Serializable; import java.util.ArrayList; import java.util.List; @Getter @EqualsAndHashCode public final class TablePath implements Serializable { private static final long serialVersionUID = 1L; private final String databaseName; private final String schemaName; @NonNull private final String tableName; public TablePath(String databaseName, String schemaName, @NonNull String tableName) { this.databaseName = databaseName; this.schemaName = schemaName; this.tableName = tableName; if (StringUtils.isEmpty(tableName)) { throw new IllegalArgumentException("tableName cannot be empty"); } } public static final TablePath DEFAULT = TablePath.of("default", "default", "default"); public static TablePath of(String fullName) { return of(fullName, false); } public static TablePath of(String fullName, boolean schemaFirst) { String[] paths = fullName.split("\\."); if (paths.length == 1) { return of(null, paths[0]); } if (paths.length == 2) { if (schemaFirst) 
{ return of(null, paths[0], paths[1]); } return of(paths[0], null, paths[1]); } if (paths.length == 3) { return of(paths[0], paths[1], paths[2]); } throw new IllegalArgumentException( String.format("Cannot get split '%s' to get databaseName and tableName", fullName)); } public static TablePath of(String databaseName, String tableName) { return of(databaseName, null, tableName); } public static TablePath of(String databaseName, String schemaName, String tableName) { return new TablePath(databaseName, schemaName, tableName); } public String getSchemaAndTableName() { return getNameCommon(null, schemaName, tableName, null, null); } public String getSchemaAndTableName(String quote) { return getNameCommon(null, schemaName, tableName, quote, quote); } public String getFullName() { return getNameCommon(databaseName, schemaName, tableName, null, null); } public String getFullNameWithQuoted() { return getFullNameWithQuoted("`"); } public String getFullNameWithQuoted(String quote) { return getNameCommon(databaseName, schemaName, tableName, quote, quote); } public String getFullNameWithQuoted(String quoteLeft, String quoteRight) { return getNameCommon(databaseName, schemaName, tableName, quoteLeft, quoteRight); } private String getNameCommon( String databaseName, String schemaName, String tableName, String quoteLeft, String quoteRight) { List joinList = new ArrayList<>(); quoteLeft = quoteLeft == null ? "" : quoteLeft; quoteRight = quoteRight == null ? 
"" : quoteRight; if (databaseName != null) { joinList.add(quoteLeft + databaseName + quoteRight); } if (schemaName != null) { joinList.add(quoteLeft + schemaName + quoteRight); } if (tableName != null) { joinList.add(quoteLeft + tableName + quoteRight); } return String.join(".", joinList); } @Override public String toString() { return getFullName(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/TableSchema.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog; import lombok.Data; import lombok.EqualsAndHashCode; import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; /** Represent a physical table schema. 
*/ @EqualsAndHashCode(callSuper = true) @Data public final class TableSchema extends AbstractSchema { private static final long serialVersionUID = 1L; private final PrimaryKey primaryKey; private final List constraintKeys; public TableSchema( List columns, PrimaryKey primaryKey, List constraintKeys) { super(columns); this.primaryKey = primaryKey; this.constraintKeys = constraintKeys; } public static Builder builder() { return new Builder(); } public static final class Builder { private final List columns = new ArrayList<>(); private PrimaryKey primaryKey; private final List constraintKeys = new ArrayList<>(); public Builder columns(List columns) { this.columns.addAll(columns); return this; } public Builder column(Column column) { this.columns.add(column); return this; } public Builder primaryKey(PrimaryKey primaryKey) { this.primaryKey = primaryKey; return this; } public Builder constraintKey(ConstraintKey constraintKey) { this.constraintKeys.add(constraintKey); return this; } public Builder constraintKey(List constraintKeys) { this.constraintKeys.addAll(constraintKeys); return this; } public TableSchema build() { return new TableSchema(columns, primaryKey, constraintKeys); } } public TableSchema copy() { List copyColumns = columns.stream().map(Column::copy).collect(Collectors.toList()); List copyConstraintKeys = constraintKeys.stream().map(ConstraintKey::copy).collect(Collectors.toList()); return TableSchema.builder() .constraintKey(copyConstraintKeys) .columns(copyColumns) .primaryKey(primaryKey == null ? null : primaryKey.copy()) .build(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/VectorIndex.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package org.apache.seatunnel.api.table.catalog;

import lombok.EqualsAndHashCode;
import lombok.Getter;

import java.io.Serializable;
import java.util.Locale;

/**
 * Index definition for a vector column; vector databases need a dedicated index on their vector
 * fields.
 *
 * <p>The index is modelled as a {@link ConstraintKey.ConstraintKeyColumn} that additionally
 * carries an index name, an {@link IndexType} and a {@link MetricType}.
 */
@EqualsAndHashCode(callSuper = true)
@Getter
public class VectorIndex extends ConstraintKey.ConstraintKeyColumn implements Serializable {

    /** Vector index name */
    private final String indexName;

    /** Vector indexType, such as IVF_FLAT, HNSW, DISKANN */
    private final IndexType indexType;

    /** Vector index metricType, such as L2, IP, COSINE */
    private final MetricType metricType;

    /**
     * Creates an index from textual type names, resolved case-insensitively through {@link
     * IndexType#of(String)} and {@link MetricType#of(String)}.
     *
     * @param indexName name of the index
     * @param columnName name of the indexed column
     * @param indexType index type name, e.g. {@code "hnsw"}
     * @param metricType metric type name, e.g. {@code "cosine"}
     * @throws IllegalArgumentException if a type name matches no enum constant
     * @throws NullPointerException if {@code indexType} or {@code metricType} is {@code null}
     */
    public VectorIndex(String indexName, String columnName, String indexType, String metricType) {
        // null is the second super-constructor argument (presumably the column sort type).
        super(columnName, null);
        this.indexName = indexName;
        this.indexType = IndexType.of(indexType);
        this.metricType = MetricType.of(metricType);
    }

    /**
     * Creates an index from already resolved enum values.
     *
     * @param indexName name of the index
     * @param columnName name of the indexed column
     * @param indexType index type
     * @param metricType metric type
     */
    public VectorIndex(
            String indexName, String columnName, IndexType indexType, MetricType metricType) {
        super(columnName, null);
        this.indexName = indexName;
        this.indexType = indexType;
        this.metricType = metricType;
    }

    /** Returns a new {@code VectorIndex} with the same name, column, index type and metric type. */
    @Override
    public ConstraintKey.ConstraintKeyColumn copy() {
        return new VectorIndex(indexName, getColumnName(), indexType, metricType);
    }

    /** Supported index types. */
    public enum IndexType {
        FLAT,
        IVF_FLAT,
        IVF_SQ8,
        IVF_PQ,
        HNSW,
        DISKANN,
        AUTOINDEX,
        SCANN,

        // GPU indexes only for float vectors
        GPU_IVF_FLAT,
        GPU_IVF_PQ,
        GPU_BRUTE_FORCE,
        GPU_CAGRA,

        // Only supported for binary vectors
        BIN_FLAT,
        BIN_IVF_FLAT,

        // Only for varchar type field
        TRIE,
        // Only for scalar type field
        STL_SORT, // only for numeric type field
        INVERTED, // works for all scalar fields except JSON type field

        // Only for sparse vectors
        SPARSE_INVERTED_INDEX,
        SPARSE_WAND,
        ;

        /**
         * Resolves an index type from its name, ignoring case.
         *
         * @param name constant name in any letter case
         * @return the matching constant
         * @throws IllegalArgumentException if no constant has that name
         * @throws NullPointerException if {@code name} is {@code null}
         */
        public static IndexType of(String name) {
            // Locale.ROOT keeps the lookup independent of the JVM default locale: under a
            // Turkish locale "ivf_flat".toUpperCase() produces a dotted capital I and the
            // valueOf lookup would fail.
            return valueOf(name.toUpperCase(Locale.ROOT));
        }
    }

    /** Supported distance metrics. */
    public enum MetricType {
        // Only for float vectors
        L2,
        IP,
        COSINE,

        // Only for binary vectors
        HAMMING,
        JACCARD,
        ;

        /**
         * Resolves a metric type from its name, ignoring case.
         *
         * @param name constant name in any letter case
         * @return the matching constant
         * @throws IllegalArgumentException if no constant has that name
         * @throws NullPointerException if {@code name} is {@code null}
         */
        public static MetricType of(String name) {
            // Locale.ROOT: see IndexType#of; "cosine" and "ip" are affected the same way.
            return valueOf(name.toUpperCase(Locale.ROOT));
        }
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/exception/CatalogException.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog.exception; import org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; /** A catalog-related, runtime exception. */ public class CatalogException extends SeaTunnelRuntimeException { /** @param message the detail message. */ public CatalogException(String message) { super(SeaTunnelAPIErrorCode.CATALOG_INITIALIZE_FAILED, message); } /** @param cause the cause.
*/
    public CatalogException(Throwable cause) {
        // Reported under the CATALOG_INITIALIZE_FAILED error code.
        super(SeaTunnelAPIErrorCode.CATALOG_INITIALIZE_FAILED, cause);
    }

    /**
     * Creates a catalog exception carrying both a detail message and the underlying cause,
     * reported under the {@code CATALOG_INITIALIZE_FAILED} error code.
     *
     * @param message the detail message.
     * @param cause the cause.
     */
    public CatalogException(String message, Throwable cause) {
        super(SeaTunnelAPIErrorCode.CATALOG_INITIALIZE_FAILED, message, cause);
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/exception/DatabaseAlreadyExistException.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/ package org.apache.seatunnel.api.table.catalog.exception; import org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; public class DatabaseAlreadyExistException extends SeaTunnelRuntimeException { private static final String MSG = "Database %s already exist in Catalog %s."; public DatabaseAlreadyExistException(String catalogName, String databaseName) { this(catalogName, databaseName, null); } public DatabaseAlreadyExistException(String catalogName, String databaseName, Throwable cause) { super( SeaTunnelAPIErrorCode.DATABASE_ALREADY_EXISTED, String.format(MSG, databaseName, catalogName), cause); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/exception/DatabaseNotExistException.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog.exception; import org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; /** Exception for trying to operate on a database that doesn't exist. 
*/
public class DatabaseNotExistException extends SeaTunnelRuntimeException {

    /** Message template; its arguments are the database name followed by the catalog name. */
    private static final String MESSAGE_TEMPLATE = "Database %s does not exist in Catalog %s.";

    /**
     * @param catalogName name of the catalog that was searched
     * @param databaseName name of the missing database
     */
    public DatabaseNotExistException(String catalogName, String databaseName) {
        this(catalogName, databaseName, (Throwable) null);
    }

    /**
     * @param catalogName name of the catalog that was searched
     * @param databaseName name of the missing database
     * @param cause underlying cause, may be {@code null}
     */
    public DatabaseNotExistException(String catalogName, String databaseName, Throwable cause) {
        super(
                SeaTunnelAPIErrorCode.DATABASE_NOT_EXISTED,
                describe(catalogName, databaseName),
                cause);
    }

    /** Renders the detail message for the given catalog and database. */
    private static String describe(String catalogName, String databaseName) {
        return String.format(MESSAGE_TEMPLATE, databaseName, catalogName);
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/exception/TableAlreadyExistException.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/ package org.apache.seatunnel.api.table.catalog.exception; import org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode; import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; public class TableAlreadyExistException extends SeaTunnelRuntimeException { private static final String MSG = "Table %s already exist in Catalog %s."; public TableAlreadyExistException(String catalogName, TablePath tablePath) { this(catalogName, tablePath, null); } public TableAlreadyExistException(String catalogName, TablePath tablePath, Throwable cause) { super( SeaTunnelAPIErrorCode.TABLE_ALREADY_EXISTED, String.format(MSG, tablePath.getFullName(), catalogName), cause); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/exception/TableNotExistException.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.table.catalog.exception; import org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode; import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; /** Exception for trying to operate on a table that doesn't exist. */ public class TableNotExistException extends SeaTunnelRuntimeException { private static final String MSG = "Table %s does not exist in Catalog %s."; public TableNotExistException(String catalogName, TablePath tablePath) { this(catalogName, tablePath, null); } public TableNotExistException(String catalogName, TablePath tablePath, Throwable cause) { super( SeaTunnelAPIErrorCode.TABLE_NOT_EXISTED, String.format(MSG, tablePath.getFullName(), catalogName), cause); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/schema/ReadonlyConfigParser.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/
package org.apache.seatunnel.api.table.catalog.schema;

import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode;

import org.apache.seatunnel.api.configuration.ReadonlyConfig;
import org.apache.seatunnel.api.options.ConnectorCommonOptions;
import org.apache.seatunnel.api.table.catalog.Column;
import org.apache.seatunnel.api.table.catalog.ConstraintKey;
import org.apache.seatunnel.api.table.catalog.PhysicalColumn;
import org.apache.seatunnel.api.table.catalog.PrimaryKey;
import org.apache.seatunnel.api.table.catalog.SeaTunnelDataTypeConvertorUtil;
import org.apache.seatunnel.api.table.catalog.TableSchema;
import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
import org.apache.seatunnel.common.utils.JsonUtils;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * {@link TableSchemaParser} that builds a {@link TableSchema} from the {@code schema} option of a
 * {@link ReadonlyConfig}.
 *
 * <p>Columns come either from the {@code fields} option or from the {@code columns} option;
 * declaring both is rejected. Primary key and constraint keys are parsed when present.
 */
public class ReadonlyConfigParser implements TableSchemaParser {
    // One delegate per schema section; each is an instance of the private nested parser below.
    private final TableSchemaParser.ColumnParser columnParser = new ColumnParser();
    private final TableSchemaParser.FieldParser fieldParser = new FieldParser();
    private final TableSchemaParser.ConstraintKeyParser constraintKeyParser =
            new ConstraintKeyParser();
    private final TableSchemaParser.PrimaryKeyParser primaryKeyParser = new PrimaryKeyParser();

    /**
     * Parses the schema section of the given config.
     *
     * @param readonlyConfig config that contains the {@code schema} option
     * @return the table schema built from the configured columns and keys
     * @throws IllegalArgumentException if the schema option is absent, or if both {@code fields}
     *     and {@code columns} are configured
     */
    @Override
    public TableSchema parse(ReadonlyConfig readonlyConfig) {
        ReadonlyConfig schemaConfig =
                readonlyConfig
                        .getOptional(ConnectorCommonOptions.SCHEMA)
                        .map(ReadonlyConfig::fromMap)
                        .orElseThrow(
                                () ->
                                        new IllegalArgumentException(
                                                "Schema config can't be null"));

        // FIELDS is looked up on the outer config, COLUMNS on the nested schema config.
        if (readonlyConfig.getOptional(ConnectorCommonOptions.FIELDS).isPresent()
                && schemaConfig.getOptional(ConnectorCommonOptions.COLUMNS).isPresent()) {
            throw new IllegalArgumentException(
                    "Schema config can't contains both [fields] and [columns], please correct your config first");
        }
        TableSchema.Builder tableSchemaBuilder = TableSchema.builder();
        if (readonlyConfig.getOptional(ConnectorCommonOptions.FIELDS).isPresent()) {
            // we use readonlyConfig here to avoid flatten, this is used to solve the t.x.x as field
            // key
            tableSchemaBuilder.columns(fieldParser.parse(readonlyConfig));
        }
        if (schemaConfig.getOptional(ConnectorCommonOptions.COLUMNS).isPresent()) {
            tableSchemaBuilder.columns(columnParser.parse(schemaConfig));
        }
        if (schemaConfig.getOptional(ConnectorCommonOptions.PRIMARY_KEY).isPresent()) {
            tableSchemaBuilder.primaryKey(primaryKeyParser.parse(schemaConfig));
        }
        if (schemaConfig.getOptional(ConnectorCommonOptions.CONSTRAINT_KEYS).isPresent()) {
            tableSchemaBuilder.constraintKey(constraintKeyParser.parse(schemaConfig));
        }
        // todo: validate schema
        return tableSchemaBuilder.build();
    }

    /**
     * Parses the {@code fields} option: a mapping of field name to type string, one column per
     * entry.
     */
    private static class FieldParser implements TableSchemaParser.FieldParser {

        /**
         * @param schemaConfig config from which the {@code fields} option is read
         * @return one {@link PhysicalColumn} per entry of the fields map
         */
        @Override
        public List parse(ReadonlyConfig schemaConfig) {
            JsonNode jsonNode =
                    JsonUtils.toJsonNode(schemaConfig.get(ConnectorCommonOptions.FIELDS));
            Map fieldsMap = JsonUtils.toStringMap(jsonNode);
            int fieldsNum = fieldsMap.size();
            List columns = new ArrayList<>(fieldsNum);
            for (Map.Entry entry : fieldsMap.entrySet()) {
                String key = entry.getKey();
                String value = entry.getValue();
                SeaTunnelDataType dataType =
                        SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType(key, value);
                // Only name and type are known here; the remaining arguments are null except
                // for the fifth, which is true (nullable, going by the argument order used in
                // ColumnParser; confirm against PhysicalColumn.of).
                PhysicalColumn column =
                        PhysicalColumn.of(key, dataType, null, null, true, null, null);
                columns.add(column);
            }
            return columns;
        }
    }

    /** Parses the {@code columns} option: a list of per-column config maps. */
    private static class ColumnParser implements TableSchemaParser.ColumnParser {

        /**
         * @param schemaConfig schema config that contains the {@code columns} option
         * @return one {@link PhysicalColumn} per configured column, in configuration order
         * @throws IllegalArgumentException if a column lacks its name or type option
         */
        @Override
        public List parse(ReadonlyConfig schemaConfig) {
            return schemaConfig.get(ConnectorCommonOptions.COLUMNS).stream()
                    .map(ReadonlyConfig::fromMap)
                    .map(
                            columnConfig -> {
                                // name and type are mandatory; all other attributes are read
                                // with get(), i.e. whatever the option yields when unset.
                                String name =
                                        columnConfig
                                                .getOptional(ConnectorCommonOptions.COLUMN_NAME)
                                                .orElseThrow(
                                                        () ->
                                                                new IllegalArgumentException(
                                                                        "schema.columns.* config need option [name], please correct your config first"));
                                SeaTunnelDataType seaTunnelDataType =
                                        columnConfig
                                                .getOptional(ConnectorCommonOptions.TYPE)
                                                .map(
                                                        column ->
                                                                SeaTunnelDataTypeConvertorUtil
                                                                        .deserializeSeaTunnelDataType(
                                                                                name, column))
                                                .orElseThrow(
                                                        () ->
                                                                new IllegalArgumentException(
                                                                        "schema.columns.* config need option [type], please correct your config first"));

                                Long columnLength =
                                        columnConfig.get(ConnectorCommonOptions.COLUMN_LENGTH);
                                Integer columnScale =
                                        columnConfig.get(ConnectorCommonOptions.COLUMN_SCALE);
                                Boolean nullable =
                                        columnConfig.get(ConnectorCommonOptions.NULLABLE);
                                Object defaultValue =
                                        columnConfig.get(ConnectorCommonOptions.DEFAULT_VALUE);
                                String comment =
                                        columnConfig.get(ConnectorCommonOptions.COLUMN_COMMENT);
                                return PhysicalColumn.of(
                                        name,
                                        seaTunnelDataType,
                                        columnLength,
                                        columnScale,
                                        nullable,
                                        defaultValue,
                                        comment);
                            })
                    .collect(Collectors.toList());
        }
    }

    /** Parses the {@code constraintKeys} option: a list of constraint key config maps. */
    private static class ConstraintKeyParser implements TableSchemaParser.ConstraintKeyParser {

        /**
         * @param schemaConfig schema config that contains the constraint keys option
         * @return one {@link ConstraintKey} per configured entry, in configuration order
         * @throws IllegalArgumentException if an entry lacks its name, type or columns option, or
         *     if one of its columns lacks the column name option
         */
        @Override
        public List parse(ReadonlyConfig schemaConfig) {
            return schemaConfig.get(ConnectorCommonOptions.CONSTRAINT_KEYS).stream()
                    .map(ReadonlyConfig::fromMap)
                    .map(
                            constraintKeyConfig -> {
                                String constraintName =
                                        constraintKeyConfig
                                                .getOptional(
                                                        ConnectorCommonOptions.CONSTRAINT_KEY_NAME)
                                                .orElseThrow(
                                                        () ->
                                                                new IllegalArgumentException(
                                                                        "schema.constraintKeys.* config need option [constraintName], please correct your config first"));
                                ConstraintKey.ConstraintType constraintType =
                                        constraintKeyConfig
                                                .getOptional(
                                                        ConnectorCommonOptions.CONSTRAINT_KEY_TYPE)
                                                .orElseThrow(
                                                        () ->
                                                                new IllegalArgumentException(
                                                                        "schema.constraintKeys.* config need option [constraintType], please correct your config first"));
                                // Each constraint column is itself a config map: the column
                                // name is mandatory, the sort type is read with get().
                                List columns =
                                        constraintKeyConfig
                                                .getOptional(
                                                        ConnectorCommonOptions
                                                                .CONSTRAINT_KEY_COLUMNS)
                                                .map(
                                                        constraintColumnMapList ->
                                                                constraintColumnMapList.stream()
                                                                        .map(
                                                                                ReadonlyConfig
                                                                                        ::fromMap)
                                                                        .map(
                                                                                constraintColumnConfig -> {
                                                                                    String columnName =
                                                                                            constraintColumnConfig
                                                                                                    .getOptional(
                                                                                                            ConnectorCommonOptions
                                                                                                                    .CONSTRAINT_KEY_COLUMN_NAME)
                                                                                                    .orElseThrow(
                                                                                                            () ->
                                                                                                                    new IllegalArgumentException(
                                                                                                                            "schema.constraintKeys.constraintColumns.* config need option [columnName], please correct your config first"));
                                                                                    ConstraintKey
                                                                                                    .ColumnSortType
                                                                                            columnSortType =
                                                                                                    constraintColumnConfig
                                                                                                            .get(
                                                                                                                    ConnectorCommonOptions
                                                                                                                            .CONSTRAINT_KEY_COLUMN_SORT_TYPE);
                                                                                    return ConstraintKey
                                                                                            .ConstraintKeyColumn
                                                                                            .of(
                                                                                                    columnName,
                                                                                                    columnSortType);
                                                                                })
                                                                        .collect(
                                                                                Collectors
                                                                                        .toList()))
                                                .orElseThrow(
                                                        () ->
                                                                new IllegalArgumentException(
                                                                        "schema.constraintKeys.* config need option [columns], please correct your config first"));
                                return ConstraintKey.of(constraintType, constraintName, columns);
                            })
                    .collect(Collectors.toList());
        }
    }

    /** Parses the {@code primaryKey} option into a {@link PrimaryKey}. */
    private static class PrimaryKeyParser implements TableSchemaParser.PrimaryKeyParser {

        /**
         * @param schemaConfig schema config that contains the primary key option
         * @return the primary key built from the configured name and column names
         * @throws IllegalArgumentException if the primary key name or its column names are absent
         */
        @Override
        public PrimaryKey parse(ReadonlyConfig schemaConfig) {
            ReadonlyConfig primaryKeyConfig =
                    ReadonlyConfig.fromMap(schemaConfig.get(ConnectorCommonOptions.PRIMARY_KEY));
            String primaryKeyName =
                    primaryKeyConfig
                            .getOptional(ConnectorCommonOptions.PRIMARY_KEY_NAME)
                            .orElseThrow(
                                    () ->
                                            new IllegalArgumentException(
                                                    "Schema config need option [primaryKey.name], please correct your config first"));
            List columns =
                    primaryKeyConfig
                            .getOptional(ConnectorCommonOptions.PRIMARY_KEY_COLUMNS)
                            .orElseThrow(
                                    () ->
                                            new IllegalArgumentException(
                                                    "Schema config need option [primaryKey.columnNames], please correct your config first"));
            return new PrimaryKey(primaryKeyName, columns);
        }
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/schema/TableSchemaParser.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package org.apache.seatunnel.api.table.catalog.schema;

import org.apache.seatunnel.api.table.catalog.Column;
import org.apache.seatunnel.api.table.catalog.ConstraintKey;
import org.apache.seatunnel.api.table.catalog.PrimaryKey;
import org.apache.seatunnel.api.table.catalog.TableSchema;

import java.util.List;

/**
 * Parses a schema configuration of type {@code T} into a {@link TableSchema}.
 *
 * <p>The nested interfaces describe parsers for the individual parts of a schema: fields,
 * columns, constraint keys and the primary key.
 */
public interface TableSchemaParser {

    /**
     * Parses the schema config into a {@link TableSchema}.
     *
     * @param schemaConfig schema config
     * @return the parsed table schema
     */
    TableSchema parse(T schemaConfig);

    /** Parser for the field section of a schema config; marked deprecated. */
    @Deprecated
    interface FieldParser {

        /**
         * Parses the field config into a list of columns.
         *
         * @param schemaConfig schema config
         * @return column list
         */
        List parse(T schemaConfig);
    }

    /** Parser for the column section of a schema config. */
    interface ColumnParser {

        /**
         * Parses the column config into a list of columns.
         *
         * @param schemaConfig schema config
         * @return column list
         */
        List parse(T schemaConfig);
    }

    /** Parser for the constraint key section of a schema config. */
    interface ConstraintKeyParser {

        /**
         * Parses the constraint key config into a list of constraint keys.
         *
         * @param schemaConfig schema config
         * @return constraint key list
         */
        List parse(T schemaConfig);
    }

    /** Parser for the primary key section of a schema config. */
    interface PrimaryKeyParser {

        /**
         * Parses the primary key config into a {@link PrimaryKey}.
         *
         * @param schemaConfig schema config
         * @return the parsed primary key
         */
        PrimaryKey parse(T schemaConfig);
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/connector/DeserializationFormat.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package org.apache.seatunnel.api.table.connector;

import org.apache.seatunnel.api.serialization.DeserializationSchema;
import org.apache.seatunnel.api.table.type.SeaTunnelDataType;

import java.util.Collections;
import java.util.List;
import java.util.Map;

/**
 * A format that creates a {@link DeserializationSchema} and can optionally expose readable
 * metadata.
 */
public interface DeserializationFormat {

    /**
     * Creates the deserialization schema of this format.
     *
     * @return the deserialization schema
     */
    DeserializationSchema createDeserializationSchema();

    /**
     * Lists the metadata this format can read.
     *
     * @return the readable metadata; the default implementation returns an empty map
     */
    default Map> listReadableMetadata() {
        return Collections.emptyMap();
    }

    /**
     * Applies the selected metadata keys to this format.
     *
     * <p>The default implementation always throws, so an implementation that lists readable
     * metadata has to override it.
     *
     * @param metadataKeys the metadata keys to apply
     * @param dataType the data type passed along with the keys
     * @throws UnsupportedOperationException always, unless overridden
     */
    default void applyReadableMetadata(List metadataKeys, SeaTunnelDataType dataType) {
        throw new UnsupportedOperationException(
                "A decoding format must override this method to apply metadata keys.");
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/connector/SerializationFormat.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package org.apache.seatunnel.api.table.connector;

import org.apache.seatunnel.api.serialization.SerializationSchema;

/** A format that creates a {@link SerializationSchema}. */
public interface SerializationFormat {

    /**
     * Creates the serialization schema of this format.
     *
     * @return the serialization schema
     */
    SerializationSchema createSerializationSchema();
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/connector/SupportReadingMetadata.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.connector; import org.apache.seatunnel.api.table.catalog.CatalogTable; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import java.util.List; import java.util.Map; /** Used for {@link TableSource} to support metadata columns.
*/
public interface SupportReadingMetadata {

    /**
     * Lists the metadata that can be read for the given table.
     *
     * @param catalogTable the table whose readable metadata is requested
     * @return the readable metadata for the table
     */
    Map> listReadableMetadata(CatalogTable catalogTable);

    /**
     * Applies the selected metadata keys for the given table.
     *
     * @param catalogTable the table the metadata belongs to
     * @param metadataKeys the metadata keys to apply
     * @param dataType the data type passed along with the keys
     */
    void applyReadableMetadata(
            CatalogTable catalogTable, List metadataKeys, SeaTunnelDataType dataType);
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/connector/TableSink.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package org.apache.seatunnel.api.table.connector;

import org.apache.seatunnel.api.sink.SeaTunnelSink;

/** Creates a {@link SeaTunnelSink}. */
public interface TableSink {

    /**
     * Creates the sink.
     *
     * @return the created sink
     */
    SeaTunnelSink createSink();
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/connector/TableSource.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package org.apache.seatunnel.api.table.connector;

import org.apache.seatunnel.api.source.SeaTunnelSource;
import org.apache.seatunnel.api.source.SourceSplit;

import java.io.Serializable;

/**
 * Creates a {@link SeaTunnelSource}.
 *
 * <p>Used to support authentication and processing of {@link SupportReadingMetadata}.
 */
public interface TableSource {

    /**
     * Creates the source.
     *
     * @return the created source
     */
    SeaTunnelSource createSource();
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/connector/TableTransform.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/
package org.apache.seatunnel.api.table.connector;

import org.apache.seatunnel.api.transform.SeaTunnelTransform;

/** Creates a {@link SeaTunnelTransform}. */
public interface TableTransform {

    /**
     * Creates the transform.
     *
     * @return the created transform
     */
    SeaTunnelTransform createTransform();
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/converter/BasicDataConverter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/ package org.apache.seatunnel.api.table.converter; import org.apache.seatunnel.api.table.catalog.Column; import org.apache.seatunnel.api.table.type.ArrayType; import org.apache.seatunnel.api.table.type.MapType; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import org.apache.seatunnel.api.table.type.SeaTunnelRow; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; import java.math.BigDecimal; import java.nio.ByteBuffer; import java.sql.Time; import java.time.Duration; import java.time.Instant; import java.time.LocalDate; import java.time.LocalDateTime; import java.time.LocalTime; import java.time.OffsetDateTime; import java.time.ZoneId; import java.util.Collection; import java.util.Date; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.TimeUnit; public interface BasicDataConverter extends DataConverter { @Override default Object convert(SeaTunnelDataType typeDefine, Object value) { if (value == null) { return null; } switch (typeDefine.getSqlType()) { case NULL: return null; case BOOLEAN: return convertBoolean(value); case TINYINT: return convertByte(value); case SMALLINT: return convertShort(value); case INT: return convertInt(value); case BIGINT: return convertLong(value); case FLOAT: return convertFloat(value); case DOUBLE: return convertDouble(value); case DECIMAL: return convertDecimal(value); case DATE: return convertLocalDate(value); case TIME: return convertTime(value); case TIMESTAMP: return convertLocalDateTime(value); case TIMESTAMP_TZ: return convertOffsetDateTime(value); case BYTES: return convertBytes(value); case STRING: return convertString(value); case ROW: return convertRow((SeaTunnelRowType) typeDefine, value); case ARRAY: return convertArray((ArrayType) typeDefine, value); case MAP: return convertMap((MapType) typeDefine, value); default: throw new UnsupportedOperationException( "Unsupported convert " + value.getClass() + " to " + 
typeDefine.getSqlType()); } } @Override default Object convert(T typeDefine, Column columnDefine, Object value) { if (value == null) { return null; } switch (columnDefine.getDataType().getSqlType()) { case NULL: return null; case BOOLEAN: return convertBoolean(typeDefine, value); case TINYINT: return convertByte(typeDefine, value); case SMALLINT: return convertShort(typeDefine, value); case INT: return convertInt(typeDefine, value); case BIGINT: return convertLong(typeDefine, value); case FLOAT: return convertFloat(typeDefine, value); case DOUBLE: return convertDouble(typeDefine, value); case DECIMAL: return convertDecimal(typeDefine, value); case DATE: return convertLocalDate(typeDefine, value); case TIME: return convertTime(typeDefine, value); case TIMESTAMP: return convertLocalDateTime(typeDefine, value); case TIMESTAMP_TZ: return convertOffsetDateTime(typeDefine, value); case BYTES: return convertBytes(typeDefine, value); case STRING: return convertString(typeDefine, value); case ROW: return convertRow(typeDefine, columnDefine, value); case ARRAY: return convertArray(typeDefine, columnDefine, value); case MAP: return convertMap(typeDefine, columnDefine, value); default: throw new UnsupportedOperationException( "Unsupported convert " + value.getClass() + " to " + columnDefine.getDataType().getSqlType()); } } default Map convertMap(T typeDefine, Column columnDefine, Object value) throws UnsupportedOperationException { return convertMap((MapType) columnDefine.getDataType(), value); } default Map convertMap(MapType typeDefine, Object value) throws UnsupportedOperationException { if (value instanceof Map) { return (Map) value; } throw new UnsupportedOperationException( "Unsupported convert " + value.getClass() + " to Map, typeDefine: " + typeDefine); } default Object[] convertArray(T typeDefine, Column columnDefine, Object value) throws UnsupportedOperationException { return convertArray((ArrayType) columnDefine.getDataType(), value); } default Object[] 
convertArray(ArrayType typeDefine, Object value) throws UnsupportedOperationException { if (value.getClass().isArray()) { SeaTunnelDataType elementType = typeDefine.getElementType(); Object[] array = (Object[]) value; for (int i = 0; i < array.length; i++) { array[i] = convert(elementType, array[i]); } return array; } if (value instanceof List) { SeaTunnelDataType elementType = typeDefine.getElementType(); List list = (List) value; int elements = list.size(); for (int i = 0; i < elements; i++) { list.set(i, convert(elementType, list.get(i))); } return list.toArray(); } if (value instanceof Set) { SeaTunnelDataType elementType = typeDefine.getElementType(); return ((Set) value).stream().map(e -> convert(elementType, e)).toArray(); } throw new UnsupportedOperationException( "Unsupported convert " + value.getClass() + " to Array, typeDefine: " + typeDefine); } default SeaTunnelRow convertRow(T typeDefine, Column columnDefine, Object value) throws UnsupportedOperationException { return convertRow((SeaTunnelRowType) columnDefine.getDataType(), value); } default SeaTunnelRow convertRow(SeaTunnelRowType typeDefine, Object value) throws UnsupportedOperationException { if (value instanceof SeaTunnelRow) { return (SeaTunnelRow) value; } if (value instanceof Collection) { Collection collection = (Collection) value; if (collection.size() != typeDefine.getTotalFields()) { throw new IllegalArgumentException( "The size of collection is not equal to the size of row type"); } Object[] array = new Object[collection.size()]; int i = 0; for (Iterator iterator = collection.iterator(); iterator.hasNext(); i++) { Object object = iterator.next(); SeaTunnelDataType type = typeDefine.getFieldType(i); array[i] = convert(type, object); } return new SeaTunnelRow(array); } if (value instanceof Map) { Map map = (Map) value; Object[] array = new Object[typeDefine.getTotalFields()]; for (int i = 0; i < typeDefine.getTotalFields(); i++) { String key = typeDefine.getFieldName(i); SeaTunnelDataType 
type = typeDefine.getFieldType(i);
                // A missing key yields null from map.get, which is passed on to convert.
                Object object = map.get(key);
                array[i] = convert(type, object);
            }
            return new SeaTunnelRow(array);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Row, typeDefine: " + typeDefine);
    }

    /**
     * Converts {@code value} to a string, dispatching on its runtime class to the typed
     * {@code convertString(typeDefine, ...)} overloads; any class without a dedicated branch
     * falls back to {@code value.toString()}.
     */
    default String convertString(T typeDefine, Object value) throws UnsupportedOperationException {
        if (value instanceof String) {
            return (String) value;
        }
        if (value instanceof Number) {
            return convertString(typeDefine, (Number) value);
        }
        if (value instanceof byte[]) {
            return convertString(typeDefine, (byte[]) value);
        }
        if (value instanceof Boolean) {
            return convertString(typeDefine, (boolean) value);
        }
        if (value instanceof Date) {
            return convertString(typeDefine, (Date) value);
        }
        if (value instanceof LocalDate) {
            return convertString(typeDefine, (LocalDate) value);
        }
        if (value instanceof LocalTime) {
            return convertString(typeDefine, (LocalTime) value);
        }
        if (value instanceof LocalDateTime) {
            return convertString(typeDefine, (LocalDateTime) value);
        }
        return value.toString();
    }

    // Typed overloads: each one ignores typeDefine and forwards to the matching
    // single-argument convertString overload.

    default String convertString(T typeDefine, Number value) {
        return convertString(value);
    }

    default String convertString(T typeDefine, byte[] value) {
        return convertString(value);
    }

    default String convertString(T typeDefine, boolean value) {
        return convertString(value);
    }

    default String convertString(T typeDefine, Date value) {
        return convertString(value);
    }

    default String convertString(T typeDefine, LocalDate value) {
        return convertString(value);
    }

    default String convertString(T typeDefine, Time value) {
        return convertString(value);
    }

    default String convertString(T typeDefine, LocalTime value) {
        return convertString(value);
    }

    default String convertString(T typeDefine, LocalDateTime value) {
        return convertString(value);
    }

    /**
     * Converts {@code value} to a string without a type definition, dispatching on its runtime
     * class; unknown classes fall back to {@code value.toString()}.
     */
    default String convertString(Object value) throws UnsupportedOperationException {
        if (value instanceof String) {
            return (String) value;
        }
        if (value instanceof Number) {
            return convertString((Number) value);
        }
        if (value
instanceof byte[]) {
            return convertString((byte[]) value);
        }
        if (value instanceof Boolean) {
            return convertString((boolean) value);
        }
        if (value instanceof Date) {
            return convertString((Date) value);
        }
        if (value instanceof LocalDate) {
            return convertString((LocalDate) value);
        }
        if (value instanceof LocalTime) {
            return convertString((LocalTime) value);
        }
        if (value instanceof LocalDateTime) {
            return convertString((LocalDateTime) value);
        }
        return value.toString();
    }

    /** Formats a number with {@link String#valueOf(Object)}. */
    default String convertString(Number value) {
        return String.valueOf(value);
    }

    /**
     * Decodes bytes with {@code new String(byte[])}, i.e. the platform default charset.
     * NOTE(review): consider an explicit charset if the data must be portable across JVMs.
     */
    default String convertString(byte[] value) {
        return new String(value);
    }

    /** Returns the literal {@code "true"} or {@code "false"}. */
    default String convertString(boolean value) {
        return value ? "true" : "false";
    }

    // The remaining overloads return the value's own toString() representation.

    default String convertString(Date value) {
        return value.toString();
    }

    default String convertString(LocalDate value) {
        return value.toString();
    }

    default String convertString(Time value) {
        return value.toString();
    }

    default String convertString(LocalTime value) {
        return value.toString();
    }

    default String convertString(LocalDateTime value) {
        return value.toString();
    }

    /**
     * Converts {@code value} to a byte array: a {@code byte[]} is returned as-is, a
     * {@link ByteBuffer} and a {@link String} go through the dedicated overloads.
     *
     * @throws UnsupportedOperationException for any other value class
     */
    default byte[] convertBytes(T typeDefine, Object value) throws UnsupportedOperationException {
        if (value instanceof byte[]) {
            return (byte[]) value;
        }
        if (value instanceof ByteBuffer) {
            return convertBytes((ByteBuffer) value);
        }
        if (value instanceof String) {
            return convertBytes(typeDefine, (String) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to byte[], typeDefine: " + typeDefine);
    }

    /** Typed overload; ignores {@code typeDefine} and forwards to {@code convertBytes(String)}. */
    default byte[] convertBytes(T typeDefine, String value) {
        return convertBytes(value);
    }

    /**
     * Same dispatch as the typed variant, without a type definition.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default byte[] convertBytes(Object value) throws UnsupportedOperationException {
        if (value instanceof byte[]) {
            return (byte[]) value;
        }
        if (value instanceof ByteBuffer) {
            return convertBytes((ByteBuffer) value);
        }
        if (value instanceof String) {
            return convertBytes((String) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to byte[]");
    }

    default byte[]
convertBytes(ByteBuffer value) {
        // Copies the remaining bytes; the relative get() advances the buffer's position.
        byte[] bytes = new byte[value.remaining()];
        value.get(bytes);
        return bytes;
    }

    /** Encodes the string with {@code String.getBytes()}, i.e. the platform default charset. */
    default byte[] convertBytes(String value) {
        return value.getBytes();
    }

    /**
     * Converts {@code value} to a {@link LocalDateTime}, dispatching on its runtime class.
     *
     * <p>{@code java.sql.Date} and {@code java.sql.Timestamp} are tested before {@code Date}
     * (assumed to be {@code java.util.Date}; the import lies outside this chunk). Both are
     * subclasses of it, so the earlier order made their branches unreachable and sent
     * {@code java.sql.Date} into {@code Date.toInstant()}, which that class does not support.
     * The order now matches {@code convertOffsetDateTime}.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default LocalDateTime convertLocalDateTime(T typeDefine, Object value)
            throws UnsupportedOperationException {
        if (value instanceof LocalDateTime) {
            return (LocalDateTime) value;
        }
        if (value instanceof OffsetDateTime) {
            return ((OffsetDateTime) value).toLocalDateTime();
        }
        if (value instanceof Instant) {
            return convertLocalDateTime(typeDefine, (Instant) value);
        }
        if (value instanceof java.sql.Date) {
            return convertLocalDateTime((java.sql.Date) value);
        }
        if (value instanceof java.sql.Timestamp) {
            return convertLocalDateTime((java.sql.Timestamp) value);
        }
        if (value instanceof Date) {
            return convertLocalDateTime(typeDefine, (Date) value);
        }
        if (value instanceof LocalDate) {
            return convertLocalDateTime((LocalDate) value);
        }
        if (value instanceof String) {
            return convertLocalDateTime(typeDefine, (String) value);
        }
        if (value instanceof Number) {
            return convertLocalDateTime(typeDefine, (Number) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert "
                        + value.getClass()
                        + " to LocalDateTime, typeDefine: "
                        + typeDefine);
    }

    /**
     * Converts {@code value} to an {@link OffsetDateTime}; zone-less inputs are interpreted in
     * {@link ZoneId#systemDefault()} and date-only inputs start at midnight.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default OffsetDateTime convertOffsetDateTime(T typeDefine, Object value)
            throws UnsupportedOperationException {
        if (value instanceof OffsetDateTime) {
            return (OffsetDateTime) value;
        }
        if (value instanceof LocalDateTime) {
            return ((LocalDateTime) value).atZone(ZoneId.systemDefault()).toOffsetDateTime();
        }
        if (value instanceof Instant) {
            return ((Instant) value).atZone(ZoneId.systemDefault()).toOffsetDateTime();
        }
        if (value instanceof java.sql.Date) {
            return ((java.sql.Date) value)
                    .toLocalDate()
                    .atTime(LocalTime.MIDNIGHT)
                    .atZone(ZoneId.systemDefault())
                    .toOffsetDateTime();
        }
        if (value instanceof java.sql.Timestamp) {
            return ((java.sql.Timestamp) value)
                    .toInstant()
                    .atZone(ZoneId.systemDefault())
                    .toOffsetDateTime();
        }
        if (value instanceof Date) {
            return ((Date)
value).toInstant().atZone(ZoneId.systemDefault()).toOffsetDateTime();
        }
        if (value instanceof LocalDate) {
            return ((LocalDate) value)
                    .atTime(LocalTime.MIDNIGHT)
                    .atZone(ZoneId.systemDefault())
                    .toOffsetDateTime();
        }
        if (value instanceof String) {
            return OffsetDateTime.parse((String) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert "
                        + value.getClass()
                        + " to OffsetDateTime, typeDefine: "
                        + typeDefine);
    }

    // Typed overloads: each one ignores typeDefine and forwards to the matching
    // single-argument convertLocalDateTime overload.

    default LocalDateTime convertLocalDateTime(T typeDefine, Instant value) {
        return convertLocalDateTime(value);
    }

    default LocalDateTime convertLocalDateTime(T typeDefine, Date value) {
        return convertLocalDateTime(value);
    }

    default LocalDateTime convertLocalDateTime(T typeDefine, String value) {
        return convertLocalDateTime(value);
    }

    default LocalDateTime convertLocalDateTime(T typeDefine, Number value) {
        return convertLocalDateTime(value);
    }

    /**
     * Converts {@code value} to a {@link LocalDateTime} without a type definition.
     *
     * <p>{@code java.sql.Date} and {@code java.sql.Timestamp} are tested before {@code Date}
     * (assumed to be {@code java.util.Date}; the import lies outside this chunk). Both are
     * subclasses of it, so the earlier order made their branches unreachable and sent
     * {@code java.sql.Date} into {@code Date.toInstant()}, which that class does not support.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default LocalDateTime convertLocalDateTime(Object value) throws UnsupportedOperationException {
        if (value instanceof LocalDateTime) {
            return (LocalDateTime) value;
        }
        if (value instanceof OffsetDateTime) {
            return ((OffsetDateTime) value).toLocalDateTime();
        }
        if (value instanceof Instant) {
            return convertLocalDateTime((Instant) value);
        }
        if (value instanceof java.sql.Date) {
            return convertLocalDateTime((java.sql.Date) value);
        }
        if (value instanceof java.sql.Timestamp) {
            return convertLocalDateTime((java.sql.Timestamp) value);
        }
        if (value instanceof Date) {
            return convertLocalDateTime((Date) value);
        }
        if (value instanceof LocalDate) {
            return convertLocalDateTime((LocalDate) value);
        }
        if (value instanceof String) {
            return convertLocalDateTime((String) value);
        }
        if (value instanceof Number) {
            return convertLocalDateTime((Number) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to LocalDateTime");
    }

    /**
     * Converts {@code value} to an {@link OffsetDateTime} without a type definition; zone-less
     * inputs are interpreted in {@link ZoneId#systemDefault()}.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default OffsetDateTime convertOffsetDateTime(Object value)
            throws UnsupportedOperationException {
        if (value instanceof OffsetDateTime) {
            return (OffsetDateTime) value;
        }
if (value instanceof LocalDateTime) { return ((LocalDateTime) value).atZone(ZoneId.systemDefault()).toOffsetDateTime(); } if (value instanceof Instant) { return ((Instant) value).atZone(ZoneId.systemDefault()).toOffsetDateTime(); } if (value instanceof java.sql.Date) { return ((java.sql.Date) value) .toLocalDate() .atTime(LocalTime.MIDNIGHT) .atZone(ZoneId.systemDefault()) .toOffsetDateTime(); } if (value instanceof java.sql.Timestamp) { return ((java.sql.Timestamp) value) .toInstant() .atZone(ZoneId.systemDefault()) .toOffsetDateTime(); } if (value instanceof Date) { return ((Date) value).toInstant().atZone(ZoneId.systemDefault()).toOffsetDateTime(); } if (value instanceof LocalDate) { return ((LocalDate) value) .atTime(LocalTime.MIDNIGHT) .atZone(ZoneId.systemDefault()) .toOffsetDateTime(); } if (value instanceof String) { return OffsetDateTime.parse((String) value); } throw new UnsupportedOperationException( "Unsupported convert " + value.getClass() + " to LocalDateTime"); } default LocalDateTime convertLocalDateTime(Instant value) { return value.atZone(ZoneId.systemDefault()).toLocalDateTime(); } default LocalDateTime convertLocalDateTime(Date value) { return value.toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime(); } default LocalDateTime convertLocalDateTime(LocalDate value) { return LocalDateTime.of(value, LocalTime.MIDNIGHT); } default LocalDateTime convertLocalDateTime(java.sql.Date value) { LocalDate date = value.toLocalDate(); return LocalDateTime.of(date, LocalTime.MIDNIGHT); } default LocalDateTime convertLocalDateTime(java.sql.Timestamp value) { return LocalDateTime.of( value.getYear() + 1900, value.getMonth() + 1, value.getDate(), value.getHours(), value.getMinutes(), value.getSeconds(), value.getNanos()); } default LocalDateTime convertLocalDateTime(String value) { return LocalDateTime.parse(value); } default LocalDateTime convertLocalDateTime(Number value) { if (value.longValue() < 999999999) { return LocalDateTime.ofEpochSecond( 
value.longValue(), 0, ZoneId.systemDefault().getRules().getOffset(LocalDateTime.now())); } return new Date(value.longValue()) .toInstant() .atZone(ZoneId.systemDefault()) .toLocalDateTime(); } default LocalTime convertTime(T typeDefine, Object value) throws UnsupportedOperationException { if (value instanceof LocalTime) { return (LocalTime) value; } if (value instanceof Date) { return convertLocalTime((Date) value); } if (value instanceof Time) { return convertLocalTime(typeDefine, (Time) value); } if (value instanceof LocalDateTime) { return convertLocalTime((LocalDateTime) value); } if (value instanceof java.sql.Timestamp) { return convertLocalTime((java.sql.Timestamp) value); } if (value instanceof String) { return convertLocalTime(typeDefine, (String) value); } if (value instanceof Number) { return convertLocalTime(typeDefine, (Number) value); } if (value instanceof Duration) { return convertLocalTime((Duration) value); } throw new UnsupportedOperationException( "Unsupported convert " + value.getClass() + " to LocalTime, typeDefine: " + typeDefine); } default LocalTime convertLocalTime(T typeDefine, Time value) { return convertLocalTime(value); } default LocalTime convertLocalTime(T typeDefine, String value) { return convertLocalTime(value); } default LocalTime convertLocalTime(T typeDefine, Number value) { return convertLocalTime(value); } default LocalTime convertTime(Object value) throws UnsupportedOperationException { if (value instanceof LocalTime) { return (LocalTime) value; } if (value instanceof Date) { return convertLocalTime((Date) value); } if (value instanceof Time) { return convertLocalTime((Time) value); } if (value instanceof LocalDateTime) { return convertLocalTime((LocalDateTime) value); } if (value instanceof java.sql.Timestamp) { return convertLocalTime((java.sql.Timestamp) value); } if (value instanceof String) { return convertLocalTime((String) value); } if (value instanceof Number) { return convertLocalTime((Number) value); } if (value 
instanceof Duration) { return convertLocalTime((Duration) value); } throw new UnsupportedOperationException( "Unsupported convert " + value.getClass() + " to LocalTime"); } default LocalTime convertLocalTime(LocalDateTime value) { return value.toLocalTime(); } default LocalTime convertLocalTime(Time value) { return value.toLocalTime(); } default LocalTime convertLocalTime(java.sql.Timestamp value) { return LocalTime.of( value.getHours(), value.getMinutes(), value.getSeconds(), value.getNanos()); } default LocalTime convertLocalTime(Date value) { long millis = (int) (value.getTime() % TimeUnit.SECONDS.toMillis(1)); int nanosOfSecond = (int) (millis * TimeUnit.MILLISECONDS.toNanos(1)); return LocalTime.of( value.getHours(), value.getMinutes(), value.getSeconds(), nanosOfSecond); } default LocalTime convertLocalTime(Duration value) { Long nanos = value.toNanos(); if (nanos >= 0 && nanos <= TimeUnit.DAYS.toNanos(1)) { return LocalTime.ofNanoOfDay(nanos); } else { throw new IllegalArgumentException( "Time values must use number of milliseconds greater than 0 and less than 86400000000000"); } } default LocalTime convertLocalTime(String value) { return LocalTime.parse(value); } default LocalTime convertLocalTime(Number value) { return LocalTime.ofSecondOfDay(value.longValue()); } default LocalDate convertLocalDate(T typeDefine, Object value) throws UnsupportedOperationException { if (value instanceof LocalDate) { return (LocalDate) value; } if (value instanceof Date) { return convertLocalDate(typeDefine, (Date) value); } if (value instanceof LocalDateTime) { return ((LocalDateTime) value).toLocalDate(); } if (value instanceof java.sql.Date) { return ((java.sql.Date) value).toLocalDate(); } if (value instanceof String) { return convertLocalDate(typeDefine, (String) value); } if (value instanceof Number) { return convertLocalDate(typeDefine, (Number) value); } throw new UnsupportedOperationException( "Unsupported convert " + value.getClass() + " to LocalDate, typeDefine: " 
+ typeDefine); } default LocalDate convertLocalDate(T typeDefine, Date value) { return convertLocalDate(value); } default LocalDate convertLocalDate(T typeDefine, String value) { return convertLocalDate(value); } default LocalDate convertLocalDate(T typeDefine, Number value) { return convertLocalDate(value); } default LocalDate convertLocalDate(Object value) throws UnsupportedOperationException { if (value instanceof LocalDate) { return (LocalDate) value; } if (value instanceof Date) { return convertLocalDate((Date) value); } if (value instanceof LocalDateTime) { return ((LocalDateTime) value).toLocalDate(); } if (value instanceof java.sql.Date) { return ((java.sql.Date) value).toLocalDate(); } if (value instanceof String) { return convertLocalDate((String) value); } if (value instanceof Number) { return convertLocalDate((Number) value); } throw new UnsupportedOperationException( "Unsupported convert " + value.getClass() + " to LocalDate"); } default LocalDate convertLocalDate(Date value) { return value.toInstant().atZone(ZoneId.systemDefault()).toLocalDate(); } default LocalDate convertLocalDate(String value) { return LocalDate.parse(value); } default LocalDate convertLocalDate(Number value) { if (value.longValue() < 999999999) { return LocalDateTime.ofEpochSecond( value.longValue(), 0, ZoneId.systemDefault().getRules().getOffset(LocalDateTime.now())) .toLocalDate(); } return new Date(value.longValue()).toInstant().atZone(ZoneId.systemDefault()).toLocalDate(); } default BigDecimal convertDecimal(T typeDefine, Object value) throws UnsupportedOperationException { if (value instanceof BigDecimal) { return (BigDecimal) value; } if (value instanceof Number) { return convertDecimal(typeDefine, (Number) value); } if (value instanceof String) { return convertDecimal(typeDefine, (String) value); } throw new UnsupportedOperationException( "Unsupported convert " + value.getClass() + " to BigDecimal, typeDefine: " + typeDefine); } default BigDecimal convertDecimal(T 
typeDefine, Number value) {
        return convertDecimal(value);
    }

    /** Typed overload; ignores {@code typeDefine} and forwards to {@code convertDecimal(String)}. */
    default BigDecimal convertDecimal(T typeDefine, String value) {
        return convertDecimal(value);
    }

    /**
     * Converts {@code value} to a {@link BigDecimal} without a type definition.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default BigDecimal convertDecimal(Object value) throws UnsupportedOperationException {
        if (value instanceof BigDecimal) {
            return (BigDecimal) value;
        }
        if (value instanceof Number) {
            return convertDecimal((Number) value);
        }
        if (value instanceof String) {
            return convertDecimal((String) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to BigDecimal");
    }

    /**
     * Converts a number to a {@link BigDecimal}. Exact types are converted exactly; only the
     * remaining (floating-point and unknown) types go through {@code doubleValue()}, which
     * used to round {@code BigInteger} and large {@code long} values as well.
     */
    default BigDecimal convertDecimal(Number value) {
        if (value instanceof BigDecimal) {
            return (BigDecimal) value;
        }
        if (value instanceof java.math.BigInteger) {
            return new BigDecimal((java.math.BigInteger) value);
        }
        if (value instanceof Long
                || value instanceof Integer
                || value instanceof Short
                || value instanceof Byte) {
            return BigDecimal.valueOf(value.longValue());
        }
        return new BigDecimal(value.doubleValue());
    }

    /** Parses text accepted by the {@code BigDecimal(String)} constructor. */
    default BigDecimal convertDecimal(String value) {
        return new BigDecimal(value);
    }

    /**
     * Converts {@code value} to a {@code double}; numbers and strings go through the typed
     * overloads.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default double convertDouble(T typeDefine, Object value) throws UnsupportedOperationException {
        if (value instanceof Double) {
            return (double) value;
        }
        if (value instanceof Number) {
            return convertDouble(typeDefine, (Number) value);
        }
        if (value instanceof String) {
            return convertDouble(typeDefine, (String) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Double, typeDefine: " + typeDefine);
    }

    default double convertDouble(T typeDefine, Number value) {
        return convertDouble(value);
    }

    default double convertDouble(T typeDefine, String value) {
        return convertDouble(value);
    }

    /**
     * Converts {@code value} to a {@code double} without a type definition.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default double convertDouble(Object value) throws UnsupportedOperationException {
        if (value instanceof Double) {
            return (double) value;
        }
        if (value instanceof Number) {
            return convertDouble((Number) value);
        }
        if (value instanceof String) {
            return convertDouble((String) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Double");
    }

    default double convertDouble(Number value) {
        return value.doubleValue();
    }

    default double convertDouble(String value) {
        return Double.parseDouble(value);
    }

    /**
     * Converts {@code value} to a {@code float}; numbers and strings go through the typed
     * overloads.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default float convertFloat(T typeDefine, Object value) throws UnsupportedOperationException {
        if
(value instanceof Float) {
            return (float) value;
        }
        if (value instanceof Number) {
            return convertFloat(typeDefine, (Number) value);
        }
        if (value instanceof String) {
            return convertFloat(typeDefine, (String) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Float, typeDefine: " + typeDefine);
    }

    // Typed overloads: each one ignores typeDefine and forwards to the matching
    // single-argument convertFloat overload.

    default float convertFloat(T typeDefine, Number value) {
        return convertFloat(value);
    }

    default float convertFloat(T typeDefine, String value) {
        return convertFloat(value);
    }

    /**
     * Converts {@code value} to a {@code float} without a type definition.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default float convertFloat(Object value) throws UnsupportedOperationException {
        if (value instanceof Float) {
            return (float) value;
        }
        if (value instanceof Number) {
            return convertFloat((Number) value);
        }
        if (value instanceof String) {
            return convertFloat((String) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Float");
    }

    /** Narrows with {@link Number#floatValue()}. */
    default float convertFloat(Number value) {
        return value.floatValue();
    }

    /** Parses with {@link Float#parseFloat(String)}. */
    default float convertFloat(String value) {
        return Float.parseFloat(value);
    }

    /**
     * Converts {@code value} to a {@code long}, dispatching on its runtime class to the typed
     * {@code convertLong(typeDefine, ...)} overloads. {@code Time} is tested before
     * {@code Date}.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default long convertLong(T typeDefine, Object value) throws UnsupportedOperationException {
        if (value instanceof Long) {
            return (long) value;
        }
        if (value instanceof Number) {
            return convertLong(typeDefine, (Number) value);
        }
        if (value instanceof String) {
            return convertLong(typeDefine, (String) value);
        }
        if (value instanceof Time) {
            return convertLong(typeDefine, (Time) value);
        }
        if (value instanceof LocalTime) {
            return convertLong(typeDefine, (LocalTime) value);
        }
        if (value instanceof Date) {
            return convertLong(typeDefine, (Date) value);
        }
        if (value instanceof LocalDate) {
            return convertLong(typeDefine, (LocalDate) value);
        }
        if (value instanceof LocalDateTime) {
            return convertLong(typeDefine, (LocalDateTime) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Long, typeDefine: " + typeDefine);
    }

    /** Typed overload; ignores {@code typeDefine} and forwards to {@code convertLong(Number)}. */
    default long convertLong(T typeDefine, Number value) {
        return convertLong(value);
    }

    default long
convertLong(T typeDefine, String value) {
        return convertLong(value);
    }

    // Typed overloads: each one ignores typeDefine and forwards to the matching
    // single-argument convertLong overload.

    default long convertLong(T typeDefine, Time value) {
        return convertLong(value);
    }

    default long convertLong(T typeDefine, LocalTime value) {
        return convertLong(value);
    }

    default long convertLong(T typeDefine, Date value) {
        return convertLong(value);
    }

    default long convertLong(T typeDefine, LocalDate value) {
        return convertLong(value);
    }

    default long convertLong(T typeDefine, LocalDateTime value) {
        return convertLong(value);
    }

    /**
     * Converts {@code value} to a {@code long} without a type definition, dispatching on its
     * runtime class.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default long convertLong(Object value) throws UnsupportedOperationException {
        if (value instanceof Long) {
            return (long) value;
        }
        if (value instanceof Number) {
            return convertLong((Number) value);
        }
        if (value instanceof String) {
            return convertLong((String) value);
        }
        if (value instanceof Time) {
            return convertLong((Time) value);
        }
        if (value instanceof LocalTime) {
            return convertLong((LocalTime) value);
        }
        if (value instanceof Date) {
            return convertLong((Date) value);
        }
        if (value instanceof LocalDate) {
            return convertLong((LocalDate) value);
        }
        if (value instanceof LocalDateTime) {
            return convertLong((LocalDateTime) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Long");
    }

    /** Narrows or widens with {@link Number#longValue()}. */
    default long convertLong(Number value) {
        return value.longValue();
    }

    /** Parses with {@link Long#parseLong(String)}. */
    default long convertLong(String value) {
        return Long.parseLong(value);
    }

    /** Returns the second of day of the SQL time. */
    default long convertLong(Time value) {
        return value.toLocalTime().toSecondOfDay();
    }

    /** Returns the second of day. */
    default long convertLong(LocalTime value) {
        return value.toSecondOfDay();
    }

    /** Returns the epoch milliseconds reported by {@code Date.getTime()}. */
    default long convertLong(Date value) {
        return value.getTime();
    }

    /** Returns the epoch milliseconds of the start of the day in the system default zone. */
    default long convertLong(LocalDate value) {
        return value.atStartOfDay(ZoneId.systemDefault()).toInstant().toEpochMilli();
    }

    /** Returns the epoch milliseconds of the date-time in the system default zone. */
    default long convertLong(LocalDateTime value) {
        return value.atZone(ZoneId.systemDefault()).toInstant().toEpochMilli();
    }

    /**
     * Converts {@code value} to an {@code int}, dispatching on its runtime class to the typed
     * {@code convertInt(typeDefine, ...)} overloads.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default int convertInt(T typeDefine, Object value) throws UnsupportedOperationException {
        if (value instanceof Integer) {
            return (int) value;
        }
        if (value
instanceof Number) {
            return convertInt(typeDefine, (Number) value);
        }
        if (value instanceof String) {
            return convertInt(typeDefine, (String) value);
        }
        if (value instanceof Time) {
            return convertInt(typeDefine, (Time) value);
        }
        if (value instanceof LocalTime) {
            return convertInt(typeDefine, (LocalTime) value);
        }
        if (value instanceof Date) {
            return convertInt(typeDefine, (Date) value);
        }
        if (value instanceof LocalDate) {
            return convertInt(typeDefine, (LocalDate) value);
        }
        if (value instanceof LocalDateTime) {
            return convertInt(typeDefine, (LocalDateTime) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Integer, typeDefine: " + typeDefine);
    }

    // Typed overloads: each one ignores typeDefine and forwards to the matching
    // single-argument convertInt overload.

    default int convertInt(T typeDefine, Number value) {
        return convertInt(value);
    }

    default int convertInt(T typeDefine, String value) {
        return convertInt(value);
    }

    default int convertInt(T typeDefine, Time value) {
        return convertInt(value);
    }

    default int convertInt(T typeDefine, LocalTime value) {
        return convertInt(value);
    }

    default int convertInt(T typeDefine, Date value) {
        return convertInt(value);
    }

    default int convertInt(T typeDefine, LocalDate value) {
        return convertInt(value);
    }

    default int convertInt(T typeDefine, LocalDateTime value) {
        return convertInt(value);
    }

    /**
     * Converts {@code value} to an {@code int} without a type definition, dispatching on its
     * runtime class.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default int convertInt(Object value) throws UnsupportedOperationException {
        if (value instanceof Integer) {
            return (int) value;
        }
        if (value instanceof Number) {
            return convertInt((Number) value);
        }
        if (value instanceof String) {
            return convertInt((String) value);
        }
        if (value instanceof Time) {
            return convertInt((Time) value);
        }
        if (value instanceof LocalTime) {
            return convertInt((LocalTime) value);
        }
        if (value instanceof Date) {
            return convertInt((Date) value);
        }
        if (value instanceof LocalDate) {
            return convertInt((LocalDate) value);
        }
        if (value instanceof LocalDateTime) {
            return convertInt((LocalDateTime) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Integer");
    }

    /** Narrows with {@link Number#intValue()}. */
    default int convertInt(Number value) {
        return value.intValue();
    }

    /** Parses with {@link Integer#parseInt(String)}. */
    default int convertInt(String value) {
        return Integer.parseInt(value);
    }

    /** Returns the second of day of the SQL time. */
    default int convertInt(Time value) {
        return value.toLocalTime().toSecondOfDay();
    }

    /** Returns the second of day. */
    default int convertInt(LocalTime value) {
        return value.toSecondOfDay();
    }

    /** Returns epoch seconds; the {@code long} quotient is narrowed to {@code int} by a cast. */
    default int convertInt(Date value) {
        return (int) (value.getTime() / 1000);
    }

    /** Returns epoch seconds in the system default zone, narrowed to {@code int} by a cast. */
    default int convertInt(LocalDateTime value) {
        return (int) (value.atZone(ZoneId.systemDefault()).toInstant().toEpochMilli() / 1000);
    }

    /** Returns epoch seconds of the start of the day, narrowed to {@code int} by a cast. */
    default int convertInt(LocalDate value) {
        return (int)
                (value.atStartOfDay(ZoneId.systemDefault()).toInstant().toEpochMilli() / 1000);
    }

    /**
     * Converts {@code value} to a {@code short}; numbers and strings go through the typed
     * overloads.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default short convertShort(T typeDefine, Object value) throws UnsupportedOperationException {
        if (value instanceof Short) {
            return (short) value;
        }
        if (value instanceof Number) {
            return convertShort(typeDefine, (Number) value);
        }
        if (value instanceof String) {
            return convertShort(typeDefine, (String) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Short, typeDefine: " + typeDefine);
    }

    default short convertShort(T typeDefine, Number value) {
        return convertShort(value);
    }

    default short convertShort(T typeDefine, String value) {
        return convertShort(value);
    }

    /**
     * Converts {@code value} to a {@code short} without a type definition.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default short convertShort(Object value) throws UnsupportedOperationException {
        if (value instanceof Short) {
            return (short) value;
        }
        if (value instanceof Number) {
            return convertShort((Number) value);
        }
        if (value instanceof String) {
            return convertShort((String) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Short");
    }

    /** Narrows with {@link Number#shortValue()}. */
    default short convertShort(Number value) {
        return value.shortValue();
    }

    /** Parses with {@link Short#parseShort(String)}. */
    default short convertShort(String value) {
        return Short.parseShort(value);
    }

    /**
     * Converts {@code value} to a {@code byte}; numbers, strings and booleans go through the
     * typed overloads.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default byte convertByte(T typeDefine, Object value) throws UnsupportedOperationException {
        if (value instanceof Byte) {
            return (byte) value;
        }
        if (value instanceof Number) {
            return convertByte(typeDefine,
                    (Number) value);
        }
        if (value instanceof String) {
            return convertByte(typeDefine, (String) value);
        }
        if (value instanceof Boolean) {
            return convertByte(typeDefine, ((boolean) value));
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Byte, typeDefine: " + typeDefine);
    }

    // Typed overloads: each one ignores typeDefine and forwards to the matching
    // single-argument convertByte overload.

    default byte convertByte(T typeDefine, Number value) {
        return convertByte(value);
    }

    default byte convertByte(T typeDefine, String value) {
        return convertByte(value);
    }

    default byte convertByte(T typeDefine, boolean value) {
        return convertByte(value);
    }

    /**
     * Converts {@code value} to a {@code byte} without a type definition.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default byte convertByte(Object value) throws UnsupportedOperationException {
        if (value instanceof Byte) {
            return (byte) value;
        }
        if (value instanceof Number) {
            return convertByte((Number) value);
        }
        if (value instanceof String) {
            return convertByte((String) value);
        }
        if (value instanceof Boolean) {
            return convertByte(((boolean) value));
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Byte");
    }

    /** Narrows with {@link Number#byteValue()}. */
    default byte convertByte(Number value) {
        return value.byteValue();
    }

    /** Parses with {@link Byte#parseByte(String)}. */
    default byte convertByte(String value) {
        return Byte.parseByte(value);
    }

    /** Maps {@code true} to 1 and {@code false} to 0. */
    default byte convertByte(boolean value) {
        return value ?
(byte) 1 : (byte) 0;
    }

    /**
     * Converts {@code value} to a {@code boolean}; numbers and strings go through the typed
     * overloads.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default boolean convertBoolean(T typeDefine, Object value)
            throws UnsupportedOperationException {
        if (value instanceof Boolean) {
            return (Boolean) value;
        }
        if (value instanceof Number) {
            return convertBoolean(typeDefine, (Number) value);
        }
        if (value instanceof String) {
            return convertBoolean(typeDefine, (String) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Boolean, typeDefine: " + typeDefine);
    }

    // Typed overloads: each one ignores typeDefine and forwards to the matching
    // single-argument convertBoolean overload.

    default boolean convertBoolean(T typeDefine, Number value) {
        return convertBoolean(value);
    }

    default boolean convertBoolean(T typeDefine, String value) {
        return convertBoolean(value);
    }

    /**
     * Converts {@code value} to a {@code boolean} without a type definition.
     *
     * @throws UnsupportedOperationException for any unsupported value class
     */
    default boolean convertBoolean(Object value) throws UnsupportedOperationException {
        if (value instanceof Boolean) {
            return (Boolean) value;
        }
        if (value instanceof Number) {
            return convertBoolean((Number) value);
        }
        if (value instanceof String) {
            return convertBoolean((String) value);
        }
        throw new UnsupportedOperationException(
                "Unsupported convert " + value.getClass() + " to Boolean");
    }

    /** Returns {@code true} when the number's {@code intValue()} is not zero. */
    default boolean convertBoolean(Number value) {
        return value.intValue() != 0;
    }

    /** Parses with {@link Boolean#parseBoolean(String)}. */
    default boolean convertBoolean(String value) {
        return Boolean.parseBoolean(value);
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/converter/BasicDataTypeConverter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package org.apache.seatunnel.api.table.converter;

/**
 * Combines {@link BasicTypeConverter} and {@link BasicDataConverter} in a single interface; it
 * declares no members of its own.
 */
public interface BasicDataTypeConverter extends BasicTypeConverter, BasicDataConverter {}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/converter/BasicTypeConverter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package org.apache.seatunnel.api.table.converter;

import org.apache.seatunnel.api.table.catalog.CatalogTable;
import org.apache.seatunnel.api.table.catalog.Column;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/** A {@link TypeConverter} that adds a default conversion of all columns of a catalog table. */
public interface BasicTypeConverter extends TypeConverter {

    /**
     * Convert {@link CatalogTable} columns definition to external system's type definition.
*
     * <p>Each column is converted with {@code reconvert(column)}. The resulting column type is
     * replaced by the column's source type when the table's catalog name equals
     * {@code identifier()}, or when one of {@code identifiers} equals the converted
     * definition's name.
     *
     * @param table catalog table whose schema columns are converted
     * @param identifiers optional values compared with each converted definition's name; may be
     *     {@code null}. NOTE(review): confirm whether these are meant to be column names or
     *     catalog identifiers.
     * @return one type definition per column, in schema order
     */
    default List reconvert(CatalogTable table, String... identifiers) {
        List typeDefines = new ArrayList<>();
        for (Column column : table.getTableSchema().getColumns()) {
            T t = reconvert(column);
            // Same catalog as this converter: keep the original source type text.
            if (table.getCatalogName().equals(identifier())) {
                t.setColumnType(column.getSourceType());
            }
            if (identifiers != null) {
                Arrays.asList(identifiers)
                        .forEach(
                                id -> {
                                    if (id.equals(t.getName())) {
                                        t.setColumnType(column.getSourceType());
                                    }
                                });
            }
            typeDefines.add(t);
        }
        return typeDefines;
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/converter/BasicTypeDefine.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */
package org.apache.seatunnel.api.table.converter;

import lombok.Builder;
import lombok.Data;
import lombok.experimental.Tolerate;

import java.io.Serializable;

/**
 * Serializable description of a column type in an external system: name, type text, JDBC type
 * code, size attributes, nullability, default value and comment.
 */
@Data
@Builder
public class BasicTypeDefine implements Serializable {
    protected String name;
    // e.g. `varchar(10)` for MySQL
    protected String columnType;
    // e.g. `varchar` for MySQL
    protected String dataType;
    // It's jdbc sql type(java.sql.Types) not SeaTunnel SqlType
    protected int sqlType;
    protected T nativeType;
    // e.g. `varchar` length is 10
    protected Long length;
    // e.g. `decimal(10, 2)` precision is 10
    protected Long precision;
    // e.g. `decimal(10, 2)` scale is 2 or timestamp(6) scale is 6
    protected Integer scale;
    // e.g. `tinyint unsigned` is true
    protected boolean unsigned;
    // Columns are nullable unless the builder is told otherwise.
    @Builder.Default protected boolean nullable = true;
    protected Object defaultValue;
    protected String comment;

    /** No-argument constructor kept alongside the Lombok builder. */
    @Tolerate
    public BasicTypeDefine() {}
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/converter/ConverterLoader.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
*/ package org.apache.seatunnel.api.table.converter; import java.util.ArrayList; import java.util.List; import java.util.ServiceLoader; public class ConverterLoader { public static DataTypeConverter loadDataTypeConverter(String identifier) { return loadDataTypeConverter(identifier, Thread.currentThread().getContextClassLoader()); } public static DataTypeConverter loadDataTypeConverter( String identifier, ClassLoader classLoader) { List converters = discoverConverters(DataTypeConverter.class, classLoader); for (DataTypeConverter dataTypeConverter : converters) { if (dataTypeConverter.identifier().equals(identifier)) { return dataTypeConverter; } } throw new IllegalArgumentException( "No data type converter found for identifier: " + identifier); } public static DataConverter loadDataConverter(String identifier) { return loadDataConverter(identifier, Thread.currentThread().getContextClassLoader()); } public static DataConverter loadDataConverter(String identifier, ClassLoader classLoader) { List converters = discoverConverters(DataConverter.class, classLoader); for (DataConverter dataConverter : converters) { if (dataConverter.identifier().equals(identifier)) { return dataConverter; } } throw new IllegalArgumentException("No data converter found for identifier: " + identifier); } public static TypeConverter loadTypeConverter(String identifier) { return loadTypeConverter(identifier, Thread.currentThread().getContextClassLoader()); } public static TypeConverter loadTypeConverter(String identifier, ClassLoader classLoader) { List converters = discoverConverters(TypeConverter.class, classLoader); for (TypeConverter typeConverter : converters) { if (typeConverter.identifier().equals(identifier)) { return typeConverter; } } throw new IllegalArgumentException("No type converter found for identifier: " + identifier); } private static List discoverConverters(Class clazz, ClassLoader classLoader) { List converters = new ArrayList<>(); ServiceLoader.load(clazz, 
classLoader).forEach(t -> converters.add(t)); return converters; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/converter/DataConverter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.converter; import org.apache.seatunnel.api.table.catalog.Column; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import java.io.Serializable; import java.util.function.BiFunction; import java.util.function.Function; /** * Data converter to transfer to/from external system data type. * * @param */ public interface DataConverter extends Serializable { String identifier(); /** * Convert an external system's data type to {@link SeaTunnelDataType#getTypeClass()}. 
* * @param typeDefine * @param value * @return */ Object convert(SeaTunnelDataType typeDefine, Object value); default Object convert(Column columnDefine, Object value) { return convert(columnDefine.getDataType(), value); } default Object convert(T typeDefine, Column columnDefine, Object value) { return convert(columnDefine, value); } default Object[] convert(T[] typeDefine, Column[] columnDefine, Object[] value) { for (int i = 0; i < value.length; i++) { value[i] = convert(typeDefine != null ? typeDefine[i] : null, columnDefine[i], value[i]); } return value; } default Object[] convert(Column[] columnDefine, Function valueApply) { Object[] fields = valueApply.apply(columnDefine); if (fields.length != columnDefine.length) { throw new IllegalStateException("columnDefine size not match"); } for (int i = 0; i < fields.length; i++) { fields[i] = convert(columnDefine[i], fields[i]); } return fields; } default Object[] convert( T[] typeDefine, Column[] columnDefine, BiFunction valueApply) { boolean hasTypeDefine = typeDefine != null; if (hasTypeDefine && typeDefine.length != columnDefine.length) { throw new IllegalStateException("typeDefine size not match"); } Object[] fields = valueApply.apply(typeDefine, columnDefine); if (fields.length != columnDefine.length) { throw new IllegalStateException("columnDefine size not match"); } for (int i = 0; i < fields.length; i++) { fields[i] = convert(hasTypeDefine ? typeDefine[i] : null, columnDefine[i], fields[i]); } return fields; } default Object reconvert(T typeDefine, Column columnDefine, Object value) { return reconvert(typeDefine, value); } /** * Convert object to an external system's data type. 
* * @param typeDefine * @param value * @return */ default Object reconvert(T typeDefine, Object value) { throw new UnsupportedOperationException("reconvert not support"); } default Object reconvert(Column columnDefine, Object value) { return reconvert(columnDefine.getDataType(), value); } /** * Convert {@link SeaTunnelDataType#getTypeClass()} to an external system's data type. * * @param typeDefine * @param value * @return */ default Object reconvert(SeaTunnelDataType typeDefine, Object value) { throw new UnsupportedOperationException("reconvert not support"); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/converter/DataTypeConverter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.converter; public interface DataTypeConverter extends TypeConverter, DataConverter {} ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/converter/TypeConverter.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.converter; import org.apache.seatunnel.api.table.catalog.Column; import java.io.Serializable; import java.util.List; import java.util.stream.Collectors; /** * Type converter to transfer to/from external system types. * * @param */ public interface TypeConverter extends Serializable { String identifier(); /** * Convert an external system's type definition to {@link Column}. * * @param typeDefine type define * @return column */ Column convert(T typeDefine); default List convert(List typeDefines) { return typeDefines.stream().map(this::convert).collect(Collectors.toList()); } /** * Convert {@link Column} to an external system's type definition. * * @param column * @return */ T reconvert(Column column); default List reconvert(List columns) { return columns.stream().map(this::reconvert).collect(Collectors.toList()); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/CatalogFactory.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.table.factory;

import org.apache.seatunnel.api.configuration.ReadonlyConfig;
import org.apache.seatunnel.api.table.catalog.Catalog;

/** A {@link Factory} that creates {@link Catalog} instances. */
public interface CatalogFactory extends Factory {

    /**
     * Creates a {@link Catalog} using the options.
     *
     * @param catalogName name given to the created catalog
     * @param options options the catalog is created from
     * @return the created catalog
     */
    Catalog createCatalog(String catalogName, ReadonlyConfig options);
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/ChangeStreamTableSourceCheckpoint.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.table.factory; import lombok.AllArgsConstructor; import lombok.Data; import java.io.Serializable; import java.util.List; @Data @AllArgsConstructor public class ChangeStreamTableSourceCheckpoint implements Serializable { // The state of the enumerator, from checkpoint data private byte[] enumeratorState; // The splits of the enumerator, from checkpoint data public List> splits; } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/ChangeStreamTableSourceFactory.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.factory; import org.apache.seatunnel.api.serialization.DefaultSerializer; import org.apache.seatunnel.api.serialization.Serializer; import org.apache.seatunnel.api.source.SeaTunnelSource; import org.apache.seatunnel.api.source.SourceSplit; import org.apache.seatunnel.api.table.connector.TableSource; import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.Collections; import java.util.List; /** * A factory to create a {@link TableSource} for a {@link SeaTunnelSource} that supports change * stream. e.g. 
CDC/MQ Source The factory can be used to restore the source from the checkpoint * state. The factory can also be used to serialize and deserialize the checkpoint state. */ public interface ChangeStreamTableSourceFactory extends TableSourceFactory { /** * see {@link SeaTunnelSource#getSplitSerializer()}. * * @return * @param */ default Serializer getSplitSerializer() { return new DefaultSerializer<>(); } /** * see {@link SeaTunnelSource#getEnumeratorStateSerializer()}. * * @return * @param */ default Serializer getEnumeratorStateSerializer() { return new DefaultSerializer<>(); } /** * Create a {@link ChangeStreamTableSourceState} from the given {@link * ChangeStreamTableSourceCheckpoint}. The default implementation uses the {@link * #getSplitSerializer()} and {@link #getEnumeratorStateSerializer()} to deserialize the splits * and enumerator state. * *

    If the splits or enumerator state is null, the corresponding field in the returned state * will be null. * * @param checkpoint * @return * @param * @param * @throws IOException */ default ChangeStreamTableSourceState deserializeTableSourceState( ChangeStreamTableSourceCheckpoint checkpoint) throws IOException { StateT enumeratorState = null; if (checkpoint.getEnumeratorState() != null) { Serializer enumeratorStateSerializer = getEnumeratorStateSerializer(); enumeratorState = enumeratorStateSerializer.deserialize(checkpoint.getEnumeratorState()); } List> deserializedSplits = new ArrayList<>(); if (checkpoint.getSplits() != null && !checkpoint.getSplits().isEmpty()) { Serializer splitSerializer = getSplitSerializer(); List> splits = checkpoint.getSplits(); for (int i = 0; i < splits.size(); i++) { List subTaskSplits = splits.get(i); if (subTaskSplits == null || subTaskSplits.isEmpty()) { deserializedSplits.add(Collections.emptyList()); } else { List deserializedSubTaskSplits = new ArrayList<>(subTaskSplits.size()); for (byte[] split : subTaskSplits) { if (split != null) { deserializedSubTaskSplits.add(splitSerializer.deserialize(split)); } } deserializedSplits.add(deserializedSubTaskSplits); } } } return new ChangeStreamTableSourceState<>(enumeratorState, deserializedSplits); } /** * Restore the source from the checkpoint state. * * @param context * @param state checkpoint state * @return * @param * @param * @param */ TableSource restoreSource( TableSourceFactoryContext context, ChangeStreamTableSourceState state); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/ChangeStreamTableSourceState.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.factory; import org.apache.seatunnel.api.source.SourceSplit; import lombok.AllArgsConstructor; import lombok.Data; import java.io.Serializable; import java.util.List; /** * The state of the enumerator and splits of the enumerator, which is used to resume the enumerator * and reader. * * @param * @param */ @Data @AllArgsConstructor public class ChangeStreamTableSourceState { // The state of the enumerator, which is used to resume the enumerator. private StateT enumeratorState; // The splits of the enumerator, which is used to resume the reader. public List> splits; } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/DataTypeConvertorFactory.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.factory; import org.apache.seatunnel.api.table.catalog.DataTypeConvertor; import java.util.HashMap; import java.util.Map; import java.util.ServiceLoader; import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull; public class DataTypeConvertorFactory { private final Map> dataTypeConvertorMap = new HashMap<>(); public DataTypeConvertorFactory() { this(Thread.currentThread().getContextClassLoader()); } public DataTypeConvertorFactory(ClassLoader classLoader) { ServiceLoader.load(DataTypeConvertor.class, classLoader) .forEach( dataTypeConvertor -> { dataTypeConvertorMap.put( dataTypeConvertor.getIdentity().toUpperCase(), dataTypeConvertor); }); } public DataTypeConvertor getDataTypeConvertor(String convertorIdentify) { checkNotNull(convertorIdentify, "connectorIdentify can not be null"); if (dataTypeConvertorMap.containsKey(convertorIdentify.toUpperCase())) { return dataTypeConvertorMap.get(convertorIdentify.toUpperCase()); } throw new IllegalArgumentException( "connectorIdentify " + convertorIdentify + " is not supported"); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/Factory.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.table.factory;

import org.apache.seatunnel.api.configuration.util.OptionRule;

/** todo: use PluginIdentifier. This is the SPI interface. */
public interface Factory {

    /**
     * Returns a unique identifier among same factory interfaces.
     *
     * <p>For consistency, an identifier should be declared as one lower case word (e.g. {@code
     * kafka}). If multiple factories exist for different versions, a version should be appended
     * using "-" (e.g. {@code elasticsearch-7}).
     */
    String factoryIdentifier();

    /**
     * Returns the rule for options.
     *
     * <p>1. Used to verify whether the parameters configured by the user conform to the rules of
     * the options;
     *
     * <p>2. Used for Web-UI to prompt user to configure option value;
     */
    OptionRule optionRule();
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/FactoryException.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.table.factory;

import org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode;
import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException;

/**
 * Exception raised for factory failures. Every instance carries the error code {@link
 * SeaTunnelAPIErrorCode#FACTORY_INITIALIZE_FAILED}.
 */
public class FactoryException extends SeaTunnelRuntimeException {

    /**
     * @param message description of the failure
     * @param cause underlying failure
     */
    public FactoryException(String message, Throwable cause) {
        super(SeaTunnelAPIErrorCode.FACTORY_INITIALIZE_FAILED, message, cause);
    }

    /**
     * @param message description of the failure
     */
    public FactoryException(String message) {
        super(SeaTunnelAPIErrorCode.FACTORY_INITIALIZE_FAILED, message);
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/FactoryUtil.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.factory; import org.apache.seatunnel.api.common.JobContext; import org.apache.seatunnel.api.common.PluginIdentifier; import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.api.configuration.util.ConfigValidator; import org.apache.seatunnel.api.configuration.util.OptionRule; import org.apache.seatunnel.api.env.ParsingMode; import org.apache.seatunnel.api.options.ConnectorCommonOptions; import org.apache.seatunnel.api.options.EnvCommonOptions; import org.apache.seatunnel.api.options.SourceConnectorCommonOptions; import org.apache.seatunnel.api.sink.SeaTunnelSink; import org.apache.seatunnel.api.sink.multitablesink.MultiTableSinkFactory; import org.apache.seatunnel.api.source.SeaTunnelSource; import org.apache.seatunnel.api.source.SourceSplit; import org.apache.seatunnel.api.source.SupportParallelism; import org.apache.seatunnel.api.table.catalog.Catalog; import org.apache.seatunnel.api.table.catalog.CatalogTable; import org.apache.seatunnel.api.table.catalog.CatalogTableUtil; import org.apache.seatunnel.api.table.catalog.TableIdentifier; import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.api.table.connector.TableSource; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import org.apache.seatunnel.api.transform.SeaTunnelTransform; import org.apache.seatunnel.common.constants.EngineType; import 
org.apache.seatunnel.common.constants.JobMode; import org.apache.seatunnel.common.constants.PluginType; import org.apache.seatunnel.common.utils.ExceptionUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import lombok.NonNull; import lombok.extern.slf4j.Slf4j; import scala.Tuple2; import java.io.Serializable; import java.net.URL; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.ServiceConfigurationError; import java.util.ServiceLoader; import java.util.function.Consumer; import java.util.function.Function; import java.util.stream.Collectors; import static org.apache.seatunnel.api.options.ConnectorCommonOptions.PLUGIN_NAME; /** * Use SPI to create {@link TableSourceFactory}, {@link TableSinkFactory} and {@link * CatalogFactory}. */ @Slf4j public final class FactoryUtil { private static final Logger LOG = LoggerFactory.getLogger(FactoryUtil.class); public static final String DEFAULT_ID = "default-identifier"; public static Tuple2, List> createAndPrepareSource( ReadonlyConfig options, ClassLoader classLoader, String factoryIdentifier, Function fallbackCreateSource, TableSourceFactory factory, ReadonlyConfig envOptions) { return restoreAndPrepareSource( options, classLoader, factoryIdentifier, null, fallbackCreateSource, factory, envOptions); } public static Tuple2, List> restoreAndPrepareSource( ReadonlyConfig options, ClassLoader classLoader, String factoryIdentifier, ChangeStreamTableSourceCheckpoint checkpoint, Function fallbackCreateSource, TableSourceFactory factory, ReadonlyConfig envOptions) { try { SeaTunnelSource source; final String factoryId = options.get(PLUGIN_NAME); boolean fallback = isFallback( classLoader, TableSourceFactory.class, factoryId, (sourceFactory) -> sourceFactory.createSource(null)); if (fallback) { source = fallbackCreateSource.apply( PluginIdentifier.of( EngineType.SEATUNNEL.getEngine(), PluginType.SOURCE.getType(), factoryId)); 
source.prepare(options.toConfig());
            } else {
                if (factory == null) {
                    factory =
                            discoverFactory(
                                    classLoader, TableSourceFactory.class, factoryIdentifier);
                }

                if (factory instanceof ChangeStreamTableSourceFactory && checkpoint != null) {
                    ChangeStreamTableSourceFactory changeStreamTableSourceFactory =
                            (ChangeStreamTableSourceFactory) factory;
                    ChangeStreamTableSourceState state =
                            changeStreamTableSourceFactory.deserializeTableSourceState(checkpoint);
                    source =
                            restoreAndPrepareSource(
                                    changeStreamTableSourceFactory, options, classLoader, state);
                } else {
                    source = createAndPrepareSource(factory, options, classLoader, envOptions);
                }
            }
            List catalogTables;
            try {
                catalogTables = source.getProducedCatalogTables();
            } catch (UnsupportedOperationException e) {
                // TODO remove it when all connector use `getProducedCatalogTables`
                SeaTunnelDataType seaTunnelDataType = source.getProducedType();
                final String tableId =
                        options.getOptional(ConnectorCommonOptions.PLUGIN_OUTPUT)
                                .orElse(DEFAULT_ID);
                catalogTables =
                        CatalogTableUtil.convertDataTypeToCatalogTables(seaTunnelDataType, tableId);
            }
            LOG.info(
                    "get the CatalogTable from source {}: {}",
                    source.getPluginName(),
                    catalogTables.stream()
                            .map(CatalogTable::getTableId)
                            .map(TableIdentifier::toString)
                            .collect(Collectors.joining(",")));
            if (options.get(SourceConnectorCommonOptions.DAG_PARSING_MODE)
                    == ParsingMode.SHARDING) {
                CatalogTable catalogTable = catalogTables.get(0);
                catalogTables.clear();
                catalogTables.add(catalogTable);
            }
            return new Tuple2<>(source, catalogTables);
        } catch (Throwable t) {
            throw new FactoryException(
                    String.format(
                            "Unable to create a source for identifier '%s'.", factoryIdentifier),
                    t);
        }
    }

    /**
     * Creates a source through the given factory without restoring any saved state.
     *
     * <p>Wraps {@code options}, {@code classLoader} and {@code envOptions} in a {@link
     * TableSourceFactoryContext}, validates the context options against the factory's option
     * rule, then asks the factory for a table source and returns the source that it creates.
     *
     * @param factory factory used to build the table source
     * @param options connector options; validated against {@code factory.optionRule()}
     * @param classLoader class loader stored in the factory context
     * @param envOptions environment options stored in the factory context
     * @return the source created by the factory's table source
     */
    private static SeaTunnelSource createAndPrepareSource(
            TableSourceFactory factory,
            ReadonlyConfig options,
            ClassLoader classLoader,
            ReadonlyConfig envOptions) {
        TableSourceFactoryContext context =
                new TableSourceFactoryContext(options, classLoader, envOptions);
        // Validation happens before the factory is asked to build anything.
        ConfigValidator.of(context.getOptions()).validate(factory.optionRule());
        TableSource tableSource = factory.createSource(context);
        return tableSource.createSource();
    }

    /**
     * Creates a source through a change-stream factory, restoring it from the given state.
     *
     * <p>The context is built with the two-argument constructor, so no env options are handed to
     * the factory on this path. The options are validated against the factory's option rule
     * before {@code restoreSource} is called.
     *
     * @param factory change-stream factory used to restore the table source
     * @param options connector options; validated against {@code factory.optionRule()}
     * @param classLoader class loader stored in the factory context
     * @param state previously saved table source state (logged at info level)
     * @return the source created by the restored table source
     */
    private static SeaTunnelSource restoreAndPrepareSource(
            ChangeStreamTableSourceFactory factory,
            ReadonlyConfig options,
            ClassLoader classLoader,
            ChangeStreamTableSourceState state) {
        TableSourceFactoryContext context = new TableSourceFactoryContext(options, classLoader);
        ConfigValidator.of(context.getOptions()).validate(factory.optionRule());
        LOG.info("Restore create source from checkpoint state: {}", state);
        TableSource tableSource = factory.restoreSource(context, state);
        return tableSource.createSource();
    }

    /**
     * Creates a sink for {@code catalogTable}, either through the deprecated plugin path or
     * through a {@link TableSinkFactory}.
     *
     * <p>The plugin name read from {@code config} is first probed with {@code isFallback}. When
     * the probe reports a fallback, the sink comes from {@code fallbackCreateSink}, is prepared
     * with the raw config and receives the catalog table's row type. Otherwise a factory is used:
     * the supplied {@code tableSinkFactory}, or one discovered by {@code factoryIdentifier} when
     * none was supplied. Table placeholders in the config are replaced before validation.
     *
     * @param catalogTable upstream table the sink will receive
     * @param config sink configuration
     * @param classLoader class loader used for factory discovery and stored in the context
     * @param factoryIdentifier identifier used for factory discovery, logging and error messages
     * @param fallbackCreateSink creator of the deprecated plugin sink, keyed by plugin identifier
     * @param tableSinkFactory factory to use, or {@code null} to discover one
     * @return the created sink
     * @throws FactoryException wrapping any {@link Throwable} raised while creating the sink
     */
    public static SeaTunnelSink createAndPrepareSink(
            CatalogTable catalogTable,
            ReadonlyConfig config,
            ClassLoader classLoader,
            String factoryIdentifier,
            Function fallbackCreateSink,
            TableSinkFactory tableSinkFactory) {
        try {
            // NOTE(review): the fallback probe is keyed by PLUGIN_NAME from the config, while
            // discovery below uses the factoryIdentifier argument; presumably they are the same
            // value - confirm against callers.
            final String factoryId = config.get(PLUGIN_NAME);

            // The probe calls createSink(null): a factory that still has the default,
            // unimplemented createSink answers with the well-known exception that isFallback
            // recognises.
            boolean fallback =
                    isFallback(
                            classLoader,
                            TableSinkFactory.class,
                            factoryId,
                            (sinkFactory) -> sinkFactory.createSink(null));

            if (fallback) {
                SeaTunnelSink sink =
                        fallbackCreateSink.apply(
                                PluginIdentifier.of(
                                        EngineType.SEATUNNEL.getEngine(),
                                        PluginType.SINK.getType(),
                                        factoryId));
                sink.prepare(config.toConfig());
                sink.setTypeInfo(catalogTable.getSeaTunnelRowType());

                return sink;
            }

            if (tableSinkFactory == null) {
                tableSinkFactory =
                        discoverFactory(classLoader, TableSinkFactory.class, factoryIdentifier);
            }

            TableSinkFactoryContext context =
                    TableSinkFactoryContext.replacePlaceholderAndCreate(
                            catalogTable,
                            config,
                            classLoader,
                            tableSinkFactory.excludeTablePlaceholderReplaceKeys());
            // Validation runs on the placeholder-replaced options held by the context.
            ConfigValidator.of(context.getOptions()).validate(tableSinkFactory.optionRule());

            LOG.info(
                    "Create sink '{}' with upstream input catalog-table[database: {}, schema: {}, table: {}]",
                    factoryIdentifier,
                    catalogTable.getTablePath().getDatabaseName(),
                    catalogTable.getTablePath().getSchemaName(),
                    catalogTable.getTablePath().getTableName());
            return tableSinkFactory.createSink(context).createSink();
        } catch (Throwable t) {
            throw new FactoryException(
                    String.format(
                            "Unable to create a sink for identifier '%s'.", factoryIdentifier),
                    t);
        }
    }

    /**
     * Creates the multi-table sink that wraps the given sinks.
     *
     * <p>The {@code MultiTableSinkFactory} is instantiated directly rather than discovered. The
     * options are validated against that factory's option rule before the sink is created.
     *
     * @param sinks sinks to wrap; passed unchanged into the {@link MultiTableFactoryContext}
     * @param options sink options
     * @param classLoader class loader stored in the factory context
     * @return the created multi-table sink
     * @throws FactoryException wrapping any {@link Throwable} raised while creating the sink
     */
    public static SeaTunnelSink createMultiTableSink(
            Map sinks, ReadonlyConfig options, ClassLoader classLoader) {
        try {
            TableSinkFactory factory = new MultiTableSinkFactory();
            MultiTableFactoryContext context =
                    new MultiTableFactoryContext(options, classLoader, sinks);
            ConfigValidator.of(context.getOptions()).validate(factory.optionRule());
            return factory.createSink(context).createSink();
        } catch (Throwable t) {
            throw new FactoryException(
                    "Unable to create a sink for identifier 'MultiTableSink'.", t);
        }
    }

    /**
     * Creates a catalog when a {@link CatalogFactory} with the given identifier can be found.
     *
     * @param catalogName name handed to the factory for the new catalog
     * @param options options handed to the factory for the new catalog
     * @param classLoader class loader used to discover factories
     * @param factoryIdentifier identifier of the catalog factory to look up
     * @return the created catalog, or an empty optional when no factory matches
     * @throws FactoryException if more than one factory matches the identifier
     */
    public static Optional createOptionalCatalog(
            String catalogName,
            ReadonlyConfig options,
            ClassLoader classLoader,
            String factoryIdentifier) {
        Optional optionalFactory =
                discoverOptionalFactory(classLoader, CatalogFactory.class, factoryIdentifier);
        return optionalFactory.map(
                catalogFactory -> catalogFactory.createCatalog(catalogName, options));
    }

    /**
     * Returns the location of the code source from which the factory's class was loaded.
     *
     * <p>NOTE(review): {@code ProtectionDomain#getCodeSource()} is documented as possibly
     * returning {@code null}; in that case this method throws a {@link NullPointerException}.
     *
     * @param factory factory whose class location is requested
     * @return the code source location of {@code factory.getClass()}
     */
    public static URL getFactoryUrl(T factory) {
        return factory.getClass().getProtectionDomain().getCodeSource().getLocation();
    }

    /**
     * Looks up a factory, preferring a caller-supplied lookup function over discovery.
     *
     * <p>When {@code discoverOptionalFactoryFunction} is non-null its answer is final: a {@code
     * null} result yields an empty optional and discovery through the class loader is not
     * attempted. Discovery is used only when no function is supplied.
     *
     * @param classLoader class loader used for discovery when no function is supplied
     * @param factoryClass factory type to look for
     * @param factoryIdentifier identifier of the wanted factory
     * @param discoverOptionalFactoryFunction optional custom lookup keyed by identifier
     * @return the factory, or an empty optional when none is found
     */
    public static Optional discoverOptionalFactory(
            ClassLoader classLoader,
            Class factoryClass,
            String factoryIdentifier,
            Function discoverOptionalFactoryFunction) {

        if (discoverOptionalFactoryFunction != null) {
            T apply = discoverOptionalFactoryFunction.apply(factoryIdentifier);
            if (apply != null) {
                return Optional.of(apply);
            } else {
                return Optional.empty();
            }
        }
        return discoverOptionalFactory(classLoader, factoryClass, factoryIdentifier);
    }

    /**
     * Discovers the single factory of the given type whose identifier matches, if any.
     *
     * <p>Identifiers are compared case-insensitively.
     *
     * @param classLoader class loader used to discover factories
     * @param factoryClass factory type to look for
     * @param factoryIdentifier identifier of the wanted factory
     * @return the matching factory, or an empty optional when no factory of that type exists or
     *     none matches the identifier
     * @throws FactoryException if more than one factory matches the identifier
     */
    public static Optional discoverOptionalFactory(
            ClassLoader classLoader, Class factoryClass, String factoryIdentifier) {
        final List foundFactories = discoverFactories(classLoader, factoryClass);
        if (foundFactories.isEmpty()) {
            return Optional.empty();
        }
        final List matchingFactories =
                foundFactories.stream()
                        .filter(f -> f.factoryIdentifier().equalsIgnoreCase(factoryIdentifier))
                        .collect(Collectors.toList());
        if (matchingFactories.isEmpty()) {
            return Optional.empty();
        }
        checkMultipleMatchingFactories(factoryIdentifier, factoryClass, matchingFactories);
        return Optional.of(matchingFactories.get(0));
    }

    /**
     * Discovers the single factory of the given type whose identifier matches.
     *
     * <p>Identifiers are compared case-insensitively. When factories of the type exist but none
     * matches, the error message lists the available identifiers, de-duplicated and sorted.
     *
     * @param classLoader class loader used to discover factories
     * @param factoryClass factory type to look for
     * @param factoryIdentifier identifier of the wanted factory
     * @return the matching factory
     * @throws FactoryException if no factory of the type exists, none matches the identifier, or
     *     more than one matches
     */
    public static T discoverFactory(
            ClassLoader classLoader, Class factoryClass, String factoryIdentifier) {
        final List foundFactories = discoverFactories(classLoader, factoryClass);

        if (foundFactories.isEmpty()) {
            throw new FactoryException(
                    String.format(
                            "Could not find any factories that implement '%s' in the classpath.",
                            factoryClass.getName()));
        }

        final List matchingFactories =
                foundFactories.stream()
                        .filter(f -> f.factoryIdentifier().equalsIgnoreCase(factoryIdentifier))
                        .collect(Collectors.toList());

        if (matchingFactories.isEmpty()) {
            throw new FactoryException(
                    String.format(
                            "Could not find any factory for identifier '%s' that implements '%s' in the classpath.\n\n"
                                    + "Available factory identifiers are:\n\n"
                                    + "%s",
                            factoryIdentifier,
                            factoryClass.getName(),
                            foundFactories.stream()
                                    .map(Factory::factoryIdentifier)
                                    .distinct()
                                    .sorted()
                                    .collect(Collectors.joining("\n"))));
        }

        checkMultipleMatchingFactories(factoryIdentifier, factoryClass, matchingFactories);

        return matchingFactories.get(0);
    }

    /**
     * Rejects an ambiguous lookup result.
     *
     * @param factoryIdentifier identifier that was looked up (used in the error message)
     * @param factoryClass factory type that was looked up (used in the error message)
     * @param matchingFactories factories that matched the identifier
     * @throws FactoryException if more than one factory matched; the message lists the class
     *     names of the ambiguous factories in sorted order
     */
    private static void checkMultipleMatchingFactories(
            String factoryIdentifier, Class factoryClass, List matchingFactories) {
        if (matchingFactories.size() > 1) {
            throw new FactoryException(
                    String.format(
                            "Multiple factories for identifier '%s' that implement '%s' found in the classpath.\n\n"
                                    + "Ambiguous factory classes are:\n\n"
                                    + "%s",
                            factoryIdentifier,
                            factoryClass.getName(),
                            matchingFactories.stream()
                                    .map(f -> f.getClass().getName())
                                    .sorted()
                                    .collect(Collectors.joining("\n"))));
        }
    }

    /**
     * Discovers all factories that are instances of {@code factoryClass}.
     *
     * @param classLoader class loader used to discover factories
     * @param factoryClass factory type to keep
     * @return a new list with every discovered factory assignable to {@code factoryClass}
     * @throws FactoryException if the service providers cannot be loaded
     */
    @SuppressWarnings("unchecked")
    public static List discoverFactories(
            ClassLoader classLoader, Class factoryClass) {
        // The cast is guarded by the isAssignableFrom filter that precedes it.
        return discoverFactories(classLoader).stream()
                .filter(f -> factoryClass.isAssignableFrom(f.getClass()))
                .map(f -> (T) f)
                .collect(Collectors.toList());
    }

    /**
     * Loads every {@link Factory} provider visible to the given class loader through {@link
     * ServiceLoader}.
     *
     * @param classLoader class loader handed to {@code ServiceLoader.load}
     * @return a new list containing all loaded factories, in iteration order
     * @throws FactoryException if a provider cannot be loaded; the underlying {@link
     *     ServiceConfigurationError} is logged and kept as the cause
     */
    public static List discoverFactories(ClassLoader classLoader) {
        try {
            final List result = new LinkedList<>();
            ServiceLoader.load(Factory.class, classLoader).iterator().forEachRemaining(result::add);
            return result;
        } catch (ServiceConfigurationError e) {
            LOG.error("Could not load service provider for factories.", e);
            throw new FactoryException("Could not load service provider for factories.", e);
        }
    }

    /**
     * This method is called by SeaTunnel Web to get the full option rule of a source.
     *
     * <p>Starts from the factory's own option rule. When the factory itself, or the source class
     * it reports, supports parallelism, the common {@code PARALLELISM} option is appended to the
     * rule's optional options.
     *
     * <p>NOTE(review): the option is added to the list returned by {@code getOptionalOptions()}
     * of the rule obtained from the factory. If a factory hands out a shared rule instance,
     * repeated calls would presumably append the option repeatedly - confirm.
     *
     * @param factory source factory whose rule is requested
     * @return Option rule
     * @throws FactoryException if the factory returns a {@code null} option rule
     */
    public static OptionRule sourceFullOptionRule(@NonNull TableSourceFactory factory) {
        OptionRule sourceOptionRule = factory.optionRule();
        if (sourceOptionRule == null) {
            throw new FactoryException("sourceOptionRule can not be null");
        }

        Class sourceClass = factory.getSourceClass();
        if (factory instanceof SupportParallelism
                // TODO: Implement SupportParallelism in the TableSourceFactory instead of the
                // SeaTunnelSource
                || SupportParallelism.class.isAssignableFrom(sourceClass)) {
            OptionRule sourceCommonOptionRule =
                    OptionRule.builder().optional(EnvCommonOptions.PARALLELISM).build();
            sourceOptionRule
                    .getOptionalOptions()
                    .addAll(sourceCommonOptionRule.getOptionalOptions());
        }

        return sourceOptionRule;
    }

    /**
     * This method is called by SeaTunnel Web to get the full option rule of a sink.
* * @return Option rule */
    public static OptionRule sinkFullOptionRule(@NonNull TableSinkFactory factory) {
        // The rule is returned exactly as the factory supplies it; only a null rule is rejected.
        OptionRule sinkOptionRule = factory.optionRule();
        if (sinkOptionRule == null) {
            throw new FactoryException("sinkOptionRule can not be null");
        }
        return sinkOptionRule;
    }

    /**
     * Creates a transform for the given upstream tables through a discovered {@link
     * TableTransformFactory}.
     *
     * <p>The options are validated against the factory's option rule before the transform is
     * created. Unlike the sink creation helpers, nothing is caught here, so exceptions from
     * discovery, validation or the factory propagate to the caller as they are.
     *
     * @param catalogTables upstream tables handed to the transform factory context
     * @param options transform options
     * @param classLoader class loader used for discovery and stored in the context
     * @param factoryIdentifier identifier of the transform factory to discover
     * @return the created transform
     */
    public static SeaTunnelTransform createAndPrepareMultiTableTransform(
            List catalogTables,
            ReadonlyConfig options,
            ClassLoader classLoader,
            String factoryIdentifier) {
        final TableTransformFactory factory =
                discoverFactory(classLoader, TableTransformFactory.class, factoryIdentifier);
        TableTransformFactoryContext context =
                new TableTransformFactoryContext(catalogTables, options, classLoader);
        ConfigValidator.of(context.getOptions()).validate(factory.optionRule());
        return factory.createTransform(context).createTransform();
    }

    /**
     * Decides whether the deprecated plugin has to be used instead of a factory.
     *
     * <p>Returns {@code true} when no factory is registered for {@code factoryId}, or when the
     * probe {@code virtualCreator} fails with an {@link UnsupportedOperationException} carrying
     * the exact "not implemented" message. Any other exception thrown by the probe is logged at
     * debug level and treated as "the factory is implemented".
     *
     * @param classLoader class loader used to discover the factory
     * @param factoryClass factory type to look for
     * @param factoryId identifier of the factory to probe
     * @param virtualCreator probe that invokes the factory's create method
     * @return {@code true} if the deprecated plugin must be used, {@code false} otherwise
     */
    private static boolean isFallback(
            ClassLoader classLoader,
            Class factoryClass,
            String factoryId,
            Consumer virtualCreator) {
        Optional factory = discoverOptionalFactory(classLoader, factoryClass, factoryId);
        if (!factory.isPresent()) {
            return true;
        }
        try {
            virtualCreator.accept(factory.get());
        } catch (Exception e) {
            // Only the exact message counts as "not implemented"; an UnsupportedOperationException
            // with any other message falls through to the debug log below.
            if (e instanceof UnsupportedOperationException
                    && "The Factory has not been implemented and the deprecated Plugin will be used."
                            .equals(e.getMessage())) {
                return true;
            }
            log.debug(ExceptionUtils.getMessage(e));
        }
        return false;
    }

    /**
     * Verifies that the source can run in the job's mode.
     *
     * @param jobContext context providing the job mode
     * @param source source whose boundedness is checked
     * @throws UnsupportedOperationException if the job mode is {@code BATCH} and the source is
     *     {@code UNBOUNDED}
     */
    public static void ensureJobModeMatch(JobContext jobContext, SeaTunnelSource source) {
        if (jobContext.getJobMode() == JobMode.BATCH
                && source.getBoundedness()
                        == org.apache.seatunnel.api.source.Boundedness.UNBOUNDED) {
            throw new UnsupportedOperationException(
                    String.format(
                            "'%s' source don't support off-line job.", source.getPluginName()));
        }
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/MultiTableFactoryContext.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/
package org.apache.seatunnel.api.table.factory;

import org.apache.seatunnel.api.configuration.ReadonlyConfig;
import org.apache.seatunnel.api.sink.SeaTunnelSink;
import org.apache.seatunnel.api.table.catalog.TablePath;

import lombok.Getter;

import java.util.Map;

/**
 * Sink factory context that carries a map of sinks instead of a single catalog table.
 *
 * <p>The parent {@link TableSinkFactoryContext} is constructed with a {@code null} catalog table.
 */
@Getter
public class MultiTableFactoryContext extends TableSinkFactoryContext {

    // NOTE(review): the map is untyped in this view; the TablePath and SeaTunnelSink imports,
    // which are otherwise unused, suggest the key and value types - confirm.
    private final Map sinks;

    /**
     * @param options sink options passed to the parent context
     * @param classLoader class loader passed to the parent context
     * @param sinks sinks to expose through this context; stored by reference, not copied
     */
    public MultiTableFactoryContext(
            ReadonlyConfig options, ClassLoader classLoader, Map sinks) {
        // No single catalog table applies here, so null is passed for it.
        super(null, options, classLoader);
        this.sinks = sinks;
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/SerializationFormatFactory.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/
package org.apache.seatunnel.api.table.factory;

import org.apache.seatunnel.api.table.connector.SerializationFormat;

/** A {@link Factory} that creates a {@link SerializationFormat} from a factory context. */
public interface SerializationFormatFactory extends Factory {

    /**
     * Creates the serialization format described by the given context.
     *
     * @param context factory context to create the format from
     * @return the created serialization format
     */
    SerializationFormat createSerializationFormat(TableFactoryContext context);
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/TableFactoryContext.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/
package org.apache.seatunnel.api.table.factory;

import org.apache.seatunnel.shade.org.apache.commons.lang3.StringUtils;

import org.apache.seatunnel.api.configuration.ReadonlyConfig;
import org.apache.seatunnel.api.table.catalog.CatalogTable;
import org.apache.seatunnel.common.utils.SeaTunnelException;

import lombok.Getter;

import java.util.ArrayList;
import java.util.List;

/**
 * Base class of the factory contexts: holds the options and the class loader, and offers a shared
 * sanity check for catalog table field names.
 */
@Getter
public abstract class TableFactoryContext {

    private final ReadonlyConfig options;
    private final ClassLoader classLoader;

    /**
     * @param options options exposed through {@code getOptions()}
     * @param classLoader class loader exposed through {@code getClassLoader()}
     */
    public TableFactoryContext(ReadonlyConfig options, ClassLoader classLoader) {
        this.options = options;
        this.classLoader = classLoader;
    }

    /**
     * Rejects catalog tables whose schema contains a blank or a repeated field name.
     *
     * <p>Each table is checked on its own, so the same field name may appear in different tables.
     * Names are compared with exact string equality.
     *
     * @param catalogTables tables to check; must not be {@code null}
     * @throws SeaTunnelException on the first blank or duplicated field name; the message names
     *     the table and, for duplicates, the field
     */
    protected static void checkCatalogTableIllegal(List catalogTables) {
        for (CatalogTable catalogTable : catalogTables) {
            // Field names seen so far in the current table only.
            List alreadyChecked = new ArrayList<>();
            for (String fieldName : catalogTable.getTableSchema().getFieldNames()) {
                if (StringUtils.isBlank(fieldName)) {
                    throw new SeaTunnelException(
                            String.format(
                                    "Table %s field name cannot be empty",
                                    catalogTable.getTablePath().getFullName()));
                }
                if (alreadyChecked.contains(fieldName)) {
                    throw new SeaTunnelException(
                            String.format(
                                    "Table %s field %s duplicate",
                                    catalogTable.getTablePath().getFullName(), fieldName));
                }
                alreadyChecked.add(fieldName);
            }
        }
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/TableSinkFactory.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.seatunnel.api.table.factory;

import org.apache.seatunnel.api.table.connector.TableSink;

import java.util.Collections;
import java.util.List;

/**
 * This is an SPI interface, used to create {@link TableSink}. Each plugin need to have it own
 * implementation.
 *
 * @param row type
 * @param state type
 * @param commit info type
 * @param aggregated commit info type
 */
public interface TableSinkFactory extends Factory {

    /**
     * Creates the {@link TableSink} for the given context.
     *
     * <p>This default implementation creates nothing. It always throws an {@link
     * UnsupportedOperationException} (it does not return {@code null}), which marks the factory
     * as not implemented so that the deprecated plugin is used instead.
     *
     * @param context TableFactoryContext
     * @return return the sink created by this factory
     * @throws UnsupportedOperationException always, unless an implementation overrides this
     *     method
     */
    default TableSink createSink(
            TableSinkFactoryContext context) {
        // The fallback detection for sinks (isFallback) compares this message text verbatim, so
        // the two must be kept in sync.
        throw new UnsupportedOperationException(
                "The Factory has not been implemented and the deprecated Plugin will be used.");
    }

    /**
     * Returns the option keys that table-placeholder replacement must leave untouched when the
     * sink context is created.
     *
     * @return the keys to exclude; empty by default
     */
    @Deprecated
    default List excludeTablePlaceholderReplaceKeys() {
        return Collections.emptyList();
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/TableSinkFactoryContext.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.seatunnel.api.table.factory;

import org.apache.seatunnel.shade.com.google.common.annotations.VisibleForTesting;

import org.apache.seatunnel.api.configuration.ReadonlyConfig;
import org.apache.seatunnel.api.sink.TablePlaceholderProcessor;
import org.apache.seatunnel.api.table.catalog.CatalogTable;

import lombok.Getter;

import java.util.Collection;
import java.util.Collections;

/** Factory context for sinks: the base options and class loader plus the upstream table. */
@Getter
public class TableSinkFactoryContext extends TableFactoryContext {

    private final CatalogTable catalogTable;

    /**
     * Creates the context without touching the options.
     *
     * <p>A non-null catalog table is checked for blank and duplicated field names; a {@code null}
     * table is accepted and skips that check.
     *
     * @param catalogTable upstream table, or {@code null}
     * @param options sink options, stored as given
     * @param classLoader class loader stored in the context
     */
    @VisibleForTesting
    public TableSinkFactoryContext(
            CatalogTable catalogTable, ReadonlyConfig options, ClassLoader classLoader) {
        super(options, classLoader);
        if (catalogTable != null) {
            checkCatalogTableIllegal(Collections.singletonList(catalogTable));
        }
        this.catalogTable = catalogTable;
    }

    /**
     * Creates the context after replacing table placeholders in the options.
     *
     * @param catalogTable upstream table used for the replacement and stored in the context
     * @param options original sink options
     * @param classLoader class loader stored in the context
     * @param excludeTablePlaceholderReplaceKeys option keys passed to the placeholder processor as
     *     exclusions
     * @return a context holding the rewritten options
     */
    public static TableSinkFactoryContext replacePlaceholderAndCreate(
            CatalogTable catalogTable,
            ReadonlyConfig options,
            ClassLoader classLoader,
            Collection excludeTablePlaceholderReplaceKeys) {
        ReadonlyConfig rewriteConfig =
                TablePlaceholderProcessor.replaceTablePlaceholder(
                        options, catalogTable, excludeTablePlaceholderReplaceKeys);
        return new TableSinkFactoryContext(catalogTable, rewriteConfig, classLoader);
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/TableSourceFactory.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.
See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.seatunnel.api.table.factory;

import org.apache.seatunnel.api.metalake.TableSchemaDiscoverer;
import org.apache.seatunnel.api.source.SeaTunnelSource;
import org.apache.seatunnel.api.source.SourceSplit;
import org.apache.seatunnel.api.table.catalog.CatalogTable;
import org.apache.seatunnel.api.table.connector.TableSource;

import java.io.Serializable;
import java.util.List;

/**
 * This is an SPI interface, used to create {@link TableSource}. Each plugin need to have it own
 * implementation.
 */
public interface TableSourceFactory extends Factory {

    /**
     * Creates the {@link TableSource} for the given context.
     *
     * <p>This default implementation creates nothing. It always throws an {@link
     * UnsupportedOperationException} (it does not return {@code null}), which marks the factory
     * as not implemented.
     *
     * @param context TableFactoryContext
     * @return the table source created by this factory
     * @throws UnsupportedOperationException always, unless an implementation overrides this
     *     method
     */
    default TableSource createSource(TableSourceFactoryContext context) {
        // NOTE(review): fallback detection for sink factories compares this exact message text;
        // confirm whether source factories are probed the same way before changing it.
        throw new UnsupportedOperationException(
                "The Factory has not been implemented and the deprecated Plugin will be used.");
    }

    /**
     * We can get the catalogTable list in the source configuration through this method
     *
     * <p>The default implementation delegates to a {@link TableSchemaDiscoverer} created for this
     * factory's identifier and closes it before returning.
     *
     * @param context TableFactoryContext
     * @return the tables reported by the discoverer
     */
    default List discoverTableSchemas(TableSourceFactoryContext context) {
        try (TableSchemaDiscoverer metaLakeSchemaDiscoverer =
                new TableSchemaDiscoverer(context, factoryIdentifier())) {
            return metaLakeSchemaDiscoverer.discoverTableSchemas();
        }
    }

    /**
     * TODO: Implement SupportParallelism in the TableSourceFactory instead of the SeaTunnelSource,
     * Then deprecated the method
     */
    Class getSourceClass();
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/TableSourceFactoryContext.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/
package org.apache.seatunnel.api.table.factory;

import org.apache.seatunnel.api.configuration.ReadonlyConfig;

import lombok.Getter;

/** Factory context for sources: the base options and class loader plus optional env options. */
@Getter
public class TableSourceFactoryContext extends TableFactoryContext {

    // Not final: it is left unassigned (null) by the two-argument constructor.
    private ReadonlyConfig envOptions;

    /**
     * Creates a context without env options; {@code getEnvOptions()} then returns {@code null}.
     *
     * @param options source options
     * @param classLoader class loader stored in the context
     */
    public TableSourceFactoryContext(ReadonlyConfig options, ClassLoader classLoader) {
        super(options, classLoader);
    }

    /**
     * Creates a context that also carries env options.
     *
     * @param options source options
     * @param classLoader class loader stored in the context
     * @param envOptions environment options exposed through {@code getEnvOptions()}
     */
    public TableSourceFactoryContext(
            ReadonlyConfig options, ClassLoader classLoader, ReadonlyConfig envOptions) {
        super(options, classLoader);
        this.envOptions = envOptions;
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/TableTransformFactory.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.seatunnel.api.table.factory;

import org.apache.seatunnel.api.table.connector.TableTransform;

/**
 * This is an SPI interface, used to create {@link
 * org.apache.seatunnel.api.table.connector.TableTransform}. Each plugin need to have it own
 * implementation.
 */
public interface TableTransformFactory extends Factory {

    /**
     * Creates the {@link TableTransform} for the given context.
     *
     * <p>This default implementation creates nothing. It always throws an {@link
     * UnsupportedOperationException} (it does not return {@code null}), which marks the factory
     * as not implemented.
     *
     * @param context TableFactoryContext
     * @return the table transform created by this factory
     * @throws UnsupportedOperationException always, unless an implementation overrides this
     *     method
     */
    default TableTransform createTransform(TableTransformFactoryContext context) {
        throw new UnsupportedOperationException(
                "The Factory has not been implemented and the deprecated Plugin will be used.");
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/factory/TableTransformFactoryContext.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/
package org.apache.seatunnel.api.table.factory;

import org.apache.seatunnel.api.configuration.ReadonlyConfig;
import org.apache.seatunnel.api.table.catalog.CatalogTable;

import lombok.Getter;

import java.util.List;

/** Factory context for transforms: the base options and class loader plus the upstream tables. */
@Getter
public class TableTransformFactoryContext extends TableFactoryContext {

    private final List catalogTables;

    /**
     * Creates the context after checking every table for blank and duplicated field names.
     *
     * @param catalogTables upstream tables; must not be {@code null}; stored by reference
     * @param options transform options
     * @param classLoader class loader stored in the context
     */
    public TableTransformFactoryContext(
            List catalogTables, ReadonlyConfig options, ClassLoader classLoader) {
        super(options, classLoader);
        checkCatalogTableIllegal(catalogTables);
        this.catalogTables = catalogTables;
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/SchemaChangeType.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.seatunnel.api.table.schema;

/** The kinds of column-level schema change. */
public enum SchemaChangeType {
    /** Add column to table. */
    ADD_COLUMN,
    /** Drop column from table. */
    DROP_COLUMN,
    /** Update column in table. */
    UPDATE_COLUMN,
    /** Rename column in table. */
    RENAME_COLUMN;
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/event/AlterTableAddColumnEvent.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/
package org.apache.seatunnel.api.table.schema.event;

import org.apache.seatunnel.api.event.EventType;
import org.apache.seatunnel.api.table.catalog.Column;
import org.apache.seatunnel.api.table.catalog.TableIdentifier;

import lombok.Getter;
import lombok.ToString;

/**
 * Schema-change event describing a column added to a table, together with its requested
 * position: first, after a named column, or unspecified.
 */
@Getter
@ToString(callSuper = true)
public class AlterTableAddColumnEvent extends AlterTableColumnEvent {
    private final Column column;
    private final boolean first;
    private final String afterColumn;

    /**
     * @param tableIdentifier table the column is added to
     * @param column the added column
     * @param first whether the column is placed first
     * @param afterColumn name of the column it is placed after, or {@code null}
     */
    public AlterTableAddColumnEvent(
            TableIdentifier tableIdentifier, Column column, boolean first, String afterColumn) {
        super(tableIdentifier);
        this.column = column;
        this.first = first;
        this.afterColumn = afterColumn;
    }

    /** Creates an event with {@code first = true} and no {@code afterColumn}. */
    public static AlterTableAddColumnEvent addFirst(
            TableIdentifier tableIdentifier, Column column) {
        return new AlterTableAddColumnEvent(tableIdentifier, column, true, null);
    }

    /** Creates an event with no position information: {@code first = false}, no {@code afterColumn}. */
    public static AlterTableAddColumnEvent add(TableIdentifier tableIdentifier, Column column) {
        return new AlterTableAddColumnEvent(tableIdentifier, column, false, null);
    }

    /** Creates an event with {@code first = false} and the given {@code afterColumn}. */
    public static AlterTableAddColumnEvent addAfter(
            TableIdentifier tableIdentifier, Column column, String afterColumn) {
        return new AlterTableAddColumnEvent(tableIdentifier, column, false, afterColumn);
    }

    /** @return {@link EventType#SCHEMA_CHANGE_ADD_COLUMN} */
    @Override
    public EventType getEventType() {
        return EventType.SCHEMA_CHANGE_ADD_COLUMN;
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/event/AlterTableChangeColumnEvent.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.seatunnel.api.table.schema.event;

import org.apache.seatunnel.api.event.EventType;
import org.apache.seatunnel.api.table.catalog.Column;
import org.apache.seatunnel.api.table.catalog.TableIdentifier;

import lombok.Getter;
import lombok.ToString;

/**
 * Schema-change event describing an existing column, identified by its old name, that is replaced
 * by a new column definition, together with the requested position of the new column.
 */
@Getter
@ToString(callSuper = true)
public class AlterTableChangeColumnEvent extends AlterTableColumnEvent {
    private final Column column;
    private final boolean first;
    private final String afterColumn;
    private final String oldColumn;

    /**
     * @param tableIdentifier table the column belongs to
     * @param oldColumn name of the column before the change
     * @param column the column definition after the change
     * @param first whether the column is placed first
     * @param afterColumn name of the column it is placed after, or {@code null}
     */
    public AlterTableChangeColumnEvent(
            TableIdentifier tableIdentifier,
            String oldColumn,
            Column column,
            boolean first,
            String afterColumn) {
        super(tableIdentifier);
        this.oldColumn = oldColumn;
        this.column = column;
        this.first = first;
        this.afterColumn = afterColumn;
    }

    /** Creates an event with {@code first = true} and no {@code afterColumn}. */
    public static AlterTableChangeColumnEvent changeFirst(
            TableIdentifier tableIdentifier, String oldColumn, Column column) {
        return new AlterTableChangeColumnEvent(tableIdentifier, oldColumn, column, true, null);
    }

    /** Creates an event with no position information: {@code first = false}, no {@code afterColumn}. */
    public static AlterTableChangeColumnEvent change(
            TableIdentifier tableIdentifier, String oldColumn, Column column) {
        return new AlterTableChangeColumnEvent(tableIdentifier, oldColumn, column, false, null);
    }

    /** Creates an event with {@code first = false} and the given {@code afterColumn}. */
    public static AlterTableChangeColumnEvent changeAfter(
            TableIdentifier tableIdentifier, String oldColumn, Column column, String afterColumn) {
        return new AlterTableChangeColumnEvent(
                tableIdentifier, oldColumn, column, false, afterColumn);
    }

    /** @return {@link EventType#SCHEMA_CHANGE_CHANGE_COLUMN} */
    @Override
    public EventType getEventType() {
        return EventType.SCHEMA_CHANGE_CHANGE_COLUMN;
    }
}
================================================ FILE:
seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/event/AlterTableColumnEvent.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.seatunnel.api.table.schema.event;

import org.apache.seatunnel.api.table.catalog.TableIdentifier;

import lombok.ToString;

/** Base class of the alter-table events that concern a single column. */
@ToString(callSuper = true)
public abstract class AlterTableColumnEvent extends AlterTableEvent {
    /** @param tableIdentifier table the event refers to */
    public AlterTableColumnEvent(TableIdentifier tableIdentifier) {
        super(tableIdentifier);
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/event/AlterTableColumnsEvent.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.seatunnel.api.table.schema.event;

import org.apache.seatunnel.api.event.EventType;
import org.apache.seatunnel.api.table.catalog.TableIdentifier;

import lombok.Getter;
import lombok.ToString;

import java.util.ArrayList;
import java.util.List;

/** Alter-table event that groups several single-column events for one table. */
@Getter
@ToString(callSuper = true)
public class AlterTableColumnsEvent extends AlterTableEvent {
    private final List events;

    /**
     * Creates an event with an empty, growable list of column events.
     *
     * @param tableIdentifier table the event refers to
     */
    public AlterTableColumnsEvent(TableIdentifier tableIdentifier) {
        this(tableIdentifier, new ArrayList<>());
    }

    /**
     * Creates an event around the given list of column events.
     *
     * <p>The list is stored by reference, not copied: {@link #addEvent} appends to the caller's
     * list, and therefore fails if that list does not support {@code add}.
     *
     * @param tableIdentifier table the event refers to
     * @param events column events to hold
     */
    public AlterTableColumnsEvent(
            TableIdentifier tableIdentifier, List events) {
        super(tableIdentifier);
        this.events = events;
    }

    /**
     * Appends a column event.
     *
     * @param event column event to append
     * @return this event, to allow chained calls
     */
    public AlterTableColumnsEvent addEvent(AlterTableColumnEvent event) {
        events.add(event);
        return this;
    }

    /** @return {@link EventType#SCHEMA_CHANGE_UPDATE_COLUMNS} */
    @Override
    public EventType getEventType() {
        return EventType.SCHEMA_CHANGE_UPDATE_COLUMNS;
    }
}
================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/event/AlterTableDropColumnEvent.java ================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.table.schema.event;

import org.apache.seatunnel.api.event.EventType;
import org.apache.seatunnel.api.table.catalog.TableIdentifier;

import lombok.Getter;
import lombok.ToString;

/**
 * Column-level alter-table event that identifies a column by name.
 *
 * <p>Reports {@link EventType#SCHEMA_CHANGE_DROP_COLUMN} as its event type.
 */
@Getter
@ToString(callSuper = true)
public class AlterTableDropColumnEvent extends AlterTableColumnEvent {
    /** Name of the column this event refers to. */
    private final String column;

    /**
     * @param tableIdentifier identifier of the altered table
     * @param column name of the column this event refers to
     */
    public AlterTableDropColumnEvent(TableIdentifier tableIdentifier, String column) {
        super(tableIdentifier);
        this.column = column;
    }

    @Override
    public EventType getEventType() {
        return EventType.SCHEMA_CHANGE_DROP_COLUMN;
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/event/AlterTableEvent.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.seatunnel.api.table.schema.event;

import org.apache.seatunnel.api.table.catalog.TableIdentifier;

import lombok.ToString;

/**
 * Abstract base type for events describing an alteration of an existing table.
 *
 * <p>The class adds no state of its own; it only forwards the table identifier to {@link
 * TableEvent}.
 */
@ToString(callSuper = true)
public abstract class AlterTableEvent extends TableEvent {
    /**
     * @param tableIdentifier identifier of the altered table; passed unchanged to the superclass
     *     constructor
     */
    public AlterTableEvent(TableIdentifier tableIdentifier) {
        super(tableIdentifier);
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/event/AlterTableModifyColumnEvent.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.seatunnel.api.table.schema.event;

import org.apache.seatunnel.api.event.EventType;
import org.apache.seatunnel.api.table.catalog.Column;
import org.apache.seatunnel.api.table.catalog.TableIdentifier;

import lombok.Getter;
import lombok.ToString;

/**
 * Column-level alter-table event carrying a new definition for a column plus an optional
 * positioning hint ({@code first} or {@code afterColumn}).
 *
 * <p>Reports {@link EventType#SCHEMA_CHANGE_MODIFY_COLUMN} as its event type.
 */
@Getter
@ToString(callSuper = true)
public class AlterTableModifyColumnEvent extends AlterTableColumnEvent {
    /** The column definition carried by this event. */
    private final Column column;
    /** Positioning hint: {@code true} when created through {@link #modifyFirst}. */
    private final boolean first;
    /** Boxed flag that stays {@code null} until {@link #setTypeChanged(boolean)} is called. */
    private Boolean typeChanged;
    /** Positioning hint: name of a reference column, or {@code null} when none was given. */
    private final String afterColumn;

    /**
     * @param tableIdentifier identifier of the altered table
     * @param column the column definition carried by this event
     * @param first positioning hint, see {@link #modifyFirst}
     * @param afterColumn positioning hint, see {@link #modifyAfter}; may be {@code null}
     */
    public AlterTableModifyColumnEvent(
            TableIdentifier tableIdentifier, Column column, boolean first, String afterColumn) {
        super(tableIdentifier);
        this.column = column;
        this.first = first;
        this.afterColumn = afterColumn;
    }

    /**
     * Records whether the column type changed.
     *
     * @param typeChanged value to store in the {@code typeChanged} flag
     */
    public void setTypeChanged(boolean typeChanged) {
        this.typeChanged = typeChanged;
    }

    /** Creates an event with {@code first = true} and no {@code afterColumn}. */
    public static AlterTableModifyColumnEvent modifyFirst(
            TableIdentifier tableIdentifier, Column column) {
        return new AlterTableModifyColumnEvent(tableIdentifier, column, true, null);
    }

    /** Creates an event with {@code first = false} and no {@code afterColumn}. */
    public static AlterTableModifyColumnEvent modify(
            TableIdentifier tableIdentifier, Column column) {
        return new AlterTableModifyColumnEvent(tableIdentifier, column, false, null);
    }

    /** Creates an event with {@code first = false} and the given {@code afterColumn}. */
    public static AlterTableModifyColumnEvent modifyAfter(
            TableIdentifier tableIdentifier, Column column, String afterColumn) {
        return new AlterTableModifyColumnEvent(tableIdentifier, column, false, afterColumn);
    }

    @Override
    public EventType getEventType() {
        return EventType.SCHEMA_CHANGE_MODIFY_COLUMN;
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/event/AlterTableNameEvent.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.table.schema.event;

import org.apache.seatunnel.api.event.EventType;
import org.apache.seatunnel.api.table.catalog.TableIdentifier;
import org.apache.seatunnel.api.table.catalog.TablePath;

import lombok.Getter;
import lombok.ToString;

/**
 * Alter-table event carrying a second table identifier in addition to the one held by the base
 * class.
 *
 * <p>Reports {@link EventType#SCHEMA_CHANGE_RENAME_TABLE} as its event type.
 */
@Getter
@ToString(callSuper = true)
public class AlterTableNameEvent extends AlterTableEvent {
    /** The new identifier of the table. */
    private final TableIdentifier newTableIdentifier;

    /**
     * @param tableIdentifier identifier forwarded to the superclass
     * @param newTableIdentifier the new identifier of the table
     */
    public AlterTableNameEvent(
            TableIdentifier tableIdentifier, TableIdentifier newTableIdentifier) {
        super(tableIdentifier);
        this.newTableIdentifier = newTableIdentifier;
    }

    /**
     * @return the new identifier converted with {@code toTablePath()}
     */
    public TablePath getNewTablePath() {
        return newTableIdentifier.toTablePath();
    }

    @Override
    public EventType getEventType() {
        return EventType.SCHEMA_CHANGE_RENAME_TABLE;
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/event/SchemaChangeEvent.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.table.schema.event;

import org.apache.seatunnel.api.event.Event;
import org.apache.seatunnel.api.table.catalog.CatalogTable;
import org.apache.seatunnel.api.table.catalog.TableIdentifier;
import org.apache.seatunnel.api.table.catalog.TablePath;

/** Represents a structural change to a table schema. */
public interface SchemaChangeEvent extends Event {
    /**
     * Returns the path of the table affected by this change.
     *
     * <p>The default implementation derives it from {@link #tableIdentifier()}.
     *
     * @return the table path obtained via {@code tableIdentifier().toTablePath()}
     */
    default TablePath tablePath() {
        return tableIdentifier().toTablePath();
    }

    /**
     * Returns the identifier of the table affected by this change.
     *
     * @return the table identifier
     */
    TableIdentifier tableIdentifier();

    /**
     * Returns the table structure after the change.
     *
     * @return the table structure after the change
     */
    CatalogTable getChangeAfter();

    /**
     * Sets the table structure after the change.
     *
     * @param table the table structure after the change
     */
    void setChangeAfter(CatalogTable table);
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/event/TableEvent.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.table.schema.event;

import org.apache.seatunnel.api.table.catalog.CatalogTable;
import org.apache.seatunnel.api.table.catalog.TableIdentifier;
import org.apache.seatunnel.api.table.catalog.TablePath;

import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.Setter;
import lombok.ToString;

/**
 * Abstract base implementation of {@link SchemaChangeEvent} that is bound to one table.
 *
 * <p>It holds the table identifier together with mutable metadata (job id, statement, source
 * dialect name and the table structure after the change). The constructor is generated by Lombok;
 * subclasses invoke it as {@code super(tableIdentifier)}.
 */
@Getter
@ToString
@RequiredArgsConstructor
public abstract class TableEvent implements SchemaChangeEvent {
    /** Initialised from {@link System#currentTimeMillis()} when the instance is created. */
    private long createdTime = System.currentTimeMillis();
    /** Identifier of the table this event refers to. */
    protected final TableIdentifier tableIdentifier;
    /** Job id associated with this event; unset until the setter is called. */
    @Getter @Setter private String jobId;
    /** Statement associated with this event; unset until the setter is called. */
    @Getter @Setter private String statement;
    /** Name of the source dialect; unset until the setter is called. */
    @Getter @Setter protected String sourceDialectName;
    /** Table structure after the change; unset until the setter is called. */
    @Getter @Setter private CatalogTable changeAfter;

    @Override
    public TableIdentifier tableIdentifier() {
        return tableIdentifier;
    }

    /**
     * Bean-style accessor for the table path.
     *
     * @return the result of {@link #tablePath()}
     */
    public TablePath getTablePath() {
        return tablePath();
    }

    @Override
    public long getCreatedTime() {
        return createdTime;
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/exception/SchemaCoordinationException.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.schema.exception; import org.apache.seatunnel.api.table.catalog.TableIdentifier; /** * Exception thrown when schema coordination operations fail. This includes timeout issues, * coordination conflicts, and coordinator state problems. */ public class SchemaCoordinationException extends SchemaEvolutionException { public SchemaCoordinationException( SchemaEvolutionErrorCode errorCode, String errorMessage, TableIdentifier tableIdentifier, String jobId) { super(errorCode, errorMessage, tableIdentifier, jobId); } public SchemaCoordinationException( SchemaEvolutionErrorCode errorCode, String errorMessage, TableIdentifier tableIdentifier, String jobId, Throwable cause) { super(errorCode, errorMessage, tableIdentifier, jobId, cause); } /** Create a timeout exception for schema changes */ public static SchemaCoordinationException timeout( TableIdentifier tableIdentifier, String jobId, long timeoutSeconds, Throwable cause) { String message = String.format("Schema change operation timed out after %d seconds", timeoutSeconds); return new SchemaCoordinationException( SchemaEvolutionErrorCode.SCHEMA_CHANGE_TIMEOUT, message, tableIdentifier, jobId, cause); } /** Create an exception for schema change conflicts */ public static SchemaCoordinationException conflict( TableIdentifier tableIdentifier, String currentJobId, String conflictingJobId) { String message = String.format( "Schema change already in progress for table. 
Current job: %s, conflicting job: %s", currentJobId, conflictingJobId); return new SchemaCoordinationException( SchemaEvolutionErrorCode.SCHEMA_CHANGE_ALREADY_IN_PROGRESS, message, tableIdentifier, currentJobId); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/exception/SchemaEvolutionErrorCode.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/

package org.apache.seatunnel.api.table.schema.exception;

import org.apache.seatunnel.common.exception.SeaTunnelErrorCode;

/**
 * Error codes used by schema-evolution exceptions.
 *
 * <p>Each constant pairs a code of the form {@code SE-nn} with a fixed description.
 */
public enum SchemaEvolutionErrorCode implements SeaTunnelErrorCode {
    // schema coordination errors
    SCHEMA_COORDINATOR_NOT_INITIALIZED("SE-01", "Schema coordinator is not initialized"),
    SCHEMA_CHANGE_ALREADY_IN_PROGRESS(
            "SE-02", "Schema change is already in progress for the table"),
    SCHEMA_CHANGE_TIMEOUT("SE-03", "Schema change operation timed out"),
    SCHEMA_CHANGE_COORDINATION_FAILED("SE-04", "Schema change coordination failed"),

    // schema validation errors
    INVALID_SCHEMA_STRUCTURE("SE-05", "Invalid schema structure provided"),
    OUTDATED_SCHEMA_EVENT("SE-06", "Schema change event is outdated"),
    UNSUPPORTED_SCHEMA_CHANGE_TYPE("SE-07", "Schema change type is not supported"),

    // sink writer errors
    SCHEMA_CHANGE_APPLICATION_FAILED("SE-08", "Failed to apply schema change to sink writer"),
    FLUSH_OPERATION_FAILED("SE-09", "Flush operation failed during schema evolution"),

    // event processing errors
    SCHEMA_EVENT_PROCESSING_FAILED("SE-10", "Failed to process schema change event"),

    // meta lake schema errors
    GET_META_LAKE_TABLE_SCHEMA_FAILED("SE-11", "Get meta lake table schema failed"),
    ERROR_INVALID_TABLE_URL(
            "SE-12",
            "Invalid table URL format, expected: /catalogs/{catalog}/schemas/{schema}/tables/{table}"),
    CATALOG_TABLE_SIZE_IS_ERROR("SE-13", "Catalog table size is error");

    /** Short code of the error, e.g. {@code SE-01}. */
    private final String code;
    /** Human-readable description of the error. */
    private final String description;

    SchemaEvolutionErrorCode(String code, String description) {
        this.code = code;
        this.description = description;
    }

    @Override
    public String getCode() {
        return this.code;
    }

    @Override
    public String getDescription() {
        return this.description;
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/exception/SchemaEvolutionException.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 *
contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.schema.exception; import org.apache.seatunnel.api.table.catalog.TableIdentifier; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; import lombok.Getter; /** Base exception class for schema evolution related errors. */ @Getter public class SchemaEvolutionException extends SeaTunnelRuntimeException { private final TableIdentifier tableIdentifier; private final String jobId; public SchemaEvolutionException(SchemaEvolutionErrorCode errorCode, String errorMessage) { super(errorCode, errorMessage); this.tableIdentifier = null; this.jobId = null; } public SchemaEvolutionException( SchemaEvolutionErrorCode errorCode, String errorMessage, Throwable cause) { super(errorCode, errorMessage, cause); this.tableIdentifier = null; this.jobId = null; } public SchemaEvolutionException( SchemaEvolutionErrorCode errorCode, String errorMessage, TableIdentifier tableIdentifier, String jobId) { super(errorCode, enrichErrorMessage(errorMessage, tableIdentifier, jobId)); this.tableIdentifier = tableIdentifier; this.jobId = jobId; } public SchemaEvolutionException( SchemaEvolutionErrorCode errorCode, String errorMessage, TableIdentifier tableIdentifier, String jobId, Throwable cause) { super(errorCode, enrichErrorMessage(errorMessage, 
tableIdentifier, jobId), cause); this.tableIdentifier = tableIdentifier; this.jobId = jobId; } private static String enrichErrorMessage( String originalMessage, TableIdentifier tableIdentifier, String jobId) { StringBuilder message = new StringBuilder(originalMessage); if (tableIdentifier != null) { message.append(" [Table: ").append(tableIdentifier).append("]"); } if (jobId != null) { message.append(" [Job: ").append(jobId).append("]"); } return message.toString(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/exception/SchemaValidationException.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.schema.exception; import org.apache.seatunnel.api.table.catalog.TableIdentifier; /** * Exception thrown when schema validation fails. This includes invalid schema structures, outdated * events. 
*/
public class SchemaValidationException extends SchemaEvolutionException {

    /**
     * @param errorCode error code describing the validation failure
     * @param errorMessage base message; table and job context are appended by the superclass
     * @param tableIdentifier table the failure relates to
     * @param jobId job the failure relates to
     */
    public SchemaValidationException(
            SchemaEvolutionErrorCode errorCode,
            String errorMessage,
            TableIdentifier tableIdentifier,
            String jobId) {
        super(errorCode, errorMessage, tableIdentifier, jobId);
    }

    /**
     * Create an exception for unsupported schema change types when the offending type is not
     * known.
     *
     * <p>The message carries no type name. Earlier the raw format string, including a literal
     * {@code '%s'} placeholder, was used as the message.
     */
    public static SchemaValidationException unsupportedChangeType(
            TableIdentifier tableIdentifier, String jobId) {
        return unsupportedChangeType(tableIdentifier, jobId, null);
    }

    /**
     * Create an exception for unsupported schema change types.
     *
     * @param tableIdentifier table the change was aimed at
     * @param jobId job the change belongs to
     * @param changeType name of the unsupported change type, or {@code null} when unknown
     * @return the populated exception
     */
    public static SchemaValidationException unsupportedChangeType(
            TableIdentifier tableIdentifier, String jobId, String changeType) {
        String message =
                changeType == null
                        ? "Schema change type is not supported"
                        : String.format("Schema change type '%s' is not supported", changeType);
        return new SchemaValidationException(
                SchemaEvolutionErrorCode.UNSUPPORTED_SCHEMA_CHANGE_TYPE,
                message,
                tableIdentifier,
                jobId);
    }

    /** Create an exception for outdated schema events */
    public static SchemaValidationException outdatedEvent(
            TableIdentifier tableIdentifier, String jobId, long eventTime, long lastProcessedTime) {
        String message =
                String.format(
                        "Schema change event is outdated. Event time: %d, last processed: %d",
                        eventTime, lastProcessedTime);
        return new SchemaValidationException(
                SchemaEvolutionErrorCode.OUTDATED_SCHEMA_EVENT, message, tableIdentifier, jobId);
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/exception/SinkWriterSchemaException.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.table.schema.exception;

import org.apache.seatunnel.api.table.catalog.TableIdentifier;

/**
 * Signals that a sink writer could not carry out a schema operation, for example when applying a
 * schema change fails.
 */
public class SinkWriterSchemaException extends SchemaEvolutionException {

    /**
     * @param errorCode error code describing the failure
     * @param errorMessage base message; table and job context are appended by the superclass
     * @param tableIdentifier table the failure relates to
     * @param jobId job the failure relates to
     * @param cause underlying failure
     */
    public SinkWriterSchemaException(
            SchemaEvolutionErrorCode errorCode,
            String errorMessage,
            TableIdentifier tableIdentifier,
            String jobId,
            Throwable cause) {
        super(errorCode, errorMessage, tableIdentifier, jobId, cause);
    }

    /**
     * Builds a {@link SchemaEvolutionErrorCode#SCHEMA_CHANGE_APPLICATION_FAILED} exception.
     *
     * @param tableIdentifier table the schema change was applied to
     * @param jobId job the change belongs to
     * @param reason short explanation, embedded in the message
     * @param cause underlying failure
     * @return the populated exception
     */
    public static SinkWriterSchemaException applicationFailed(
            TableIdentifier tableIdentifier, String jobId, String reason, Throwable cause) {
        return new SinkWriterSchemaException(
                SchemaEvolutionErrorCode.SCHEMA_CHANGE_APPLICATION_FAILED,
                String.format("Failed to apply schema change: %s", reason),
                tableIdentifier,
                jobId,
                cause);
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/handler/AlterTableEventHandler.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.schema.handler; import org.apache.seatunnel.api.table.catalog.Column; import org.apache.seatunnel.api.table.schema.event.AlterTableAddColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableChangeColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableColumnsEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableDropColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableModifyColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableNameEvent; import org.apache.seatunnel.api.table.schema.event.SchemaChangeEvent; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; import java.util.ArrayList; import java.util.Arrays; import java.util.LinkedList; import java.util.List; /** @deprecated instead by {@link AlterTableSchemaEventHandler} */ @Deprecated public class AlterTableEventHandler implements DataTypeChangeEventHandler { private SeaTunnelRowType dataType; @Override public SeaTunnelRowType get() { return dataType; } @Override public DataTypeChangeEventHandler reset(SeaTunnelRowType dataType) { this.dataType = dataType; return this; } @Override public SeaTunnelRowType apply(SchemaChangeEvent event) { AlterTableEvent alterTableEvent = (AlterTableEvent) event; return apply(dataType, alterTableEvent); } private SeaTunnelRowType apply(SeaTunnelRowType dataType, AlterTableEvent alterTableEvent) { if (alterTableEvent instanceof AlterTableNameEvent) { return dataType; } if (alterTableEvent instanceof AlterTableDropColumnEvent) { return applyDropColumn(dataType, (AlterTableDropColumnEvent) alterTableEvent); } if (alterTableEvent instanceof 
AlterTableModifyColumnEvent) { return applyModifyColumn(dataType, (AlterTableModifyColumnEvent) alterTableEvent); } if (alterTableEvent instanceof AlterTableChangeColumnEvent) { return applyChangeColumn(dataType, (AlterTableChangeColumnEvent) alterTableEvent); } if (alterTableEvent instanceof AlterTableAddColumnEvent) { return applyAddColumn(dataType, (AlterTableAddColumnEvent) alterTableEvent); } if (alterTableEvent instanceof AlterTableColumnsEvent) { SeaTunnelRowType newType = dataType; for (AlterTableColumnEvent columnEvent : ((AlterTableColumnsEvent) alterTableEvent).getEvents()) { newType = apply(newType, columnEvent); } return newType; } throw new UnsupportedOperationException( "Unsupported alter table event: " + alterTableEvent); } private SeaTunnelRowType applyAddColumn( SeaTunnelRowType dataType, AlterTableAddColumnEvent addColumnEvent) { LinkedList originFields = new LinkedList<>(Arrays.asList(dataType.getFieldNames())); LinkedList> originFieldTypes = new LinkedList<>(Arrays.asList(dataType.getFieldTypes())); Column column = addColumnEvent.getColumn(); if (originFields.contains(column.getName())) { return applyModifyColumn( dataType, new AlterTableModifyColumnEvent( addColumnEvent.tableIdentifier(), addColumnEvent.getColumn(), addColumnEvent.isFirst(), addColumnEvent.getAfterColumn())); } if (addColumnEvent.isFirst()) { originFields.addFirst(column.getName()); originFieldTypes.addFirst(column.getDataType()); } else if (addColumnEvent.getAfterColumn() != null) { int index = originFields.indexOf(addColumnEvent.getAfterColumn()); originFields.add(index + 1, column.getName()); originFieldTypes.add(index + 1, column.getDataType()); } else { originFields.addLast(column.getName()); originFieldTypes.addLast(column.getDataType()); } return new SeaTunnelRowType( originFields.toArray(new String[0]), originFieldTypes.toArray(new SeaTunnelDataType[0])); } private SeaTunnelRowType applyDropColumn( SeaTunnelRowType dataType, AlterTableDropColumnEvent dropColumnEvent) { 
List fieldNames = new ArrayList<>(); List fieldTypes = new ArrayList<>(); for (int i = 0; i < dataType.getTotalFields(); i++) { if (dataType.getFieldName(i).equals(dropColumnEvent.getColumn())) { continue; } fieldNames.add(dataType.getFieldName(i)); fieldTypes.add(dataType.getFieldType(i)); } return new SeaTunnelRowType( fieldNames.toArray(new String[0]), fieldTypes.toArray(new SeaTunnelDataType[0])); } private SeaTunnelRowType applyModifyColumn( SeaTunnelRowType dataType, AlterTableModifyColumnEvent modifyColumnEvent) { List fieldNames = Arrays.asList(dataType.getFieldNames()); if (!fieldNames.contains(modifyColumnEvent.getColumn().getName())) { return dataType; } String modifyColumnName = modifyColumnEvent.getColumn().getName(); int modifyColumnIndex = dataType.indexOf(modifyColumnName); return applyModifyColumn( dataType, modifyColumnIndex, modifyColumnEvent.getColumn(), modifyColumnEvent.isFirst(), modifyColumnEvent.getAfterColumn()); } private SeaTunnelRowType applyChangeColumn( SeaTunnelRowType dataType, AlterTableChangeColumnEvent changeColumnEvent) { String oldColumn = changeColumnEvent.getOldColumn(); int oldColumnIndex = dataType.indexOf(oldColumn); // The operation of rename column which only has the name of old column and the name of new // column, // so we need to fill the data type which is the same as the old column. 
SeaTunnelDataType fieldType = dataType.getFieldType(oldColumnIndex); Column column = changeColumnEvent.getColumn(); if (column.getDataType() == null) { column = column.copy(fieldType); } return applyModifyColumn( dataType, oldColumnIndex, column, changeColumnEvent.isFirst(), changeColumnEvent.getAfterColumn()); } private SeaTunnelRowType applyModifyColumn( SeaTunnelRowType dataType, int columnIndex, Column column, boolean first, String afterColumn) { LinkedList originFields = new LinkedList<>(Arrays.asList(dataType.getFieldNames())); LinkedList> originFieldTypes = new LinkedList<>(Arrays.asList(dataType.getFieldTypes())); if (first) { originFields.remove(columnIndex); originFieldTypes.remove(columnIndex); originFields.addFirst(column.getName()); originFieldTypes.addFirst(column.getDataType()); } else if (afterColumn != null) { originFields.remove(columnIndex); originFieldTypes.remove(columnIndex); int index = originFields.indexOf(afterColumn); originFields.add(index + 1, column.getName()); originFieldTypes.add(index + 1, column.getDataType()); } else { originFields.set(columnIndex, column.getName()); originFieldTypes.set(columnIndex, column.getDataType()); } return new SeaTunnelRowType( originFields.toArray(new String[0]), originFieldTypes.toArray(new SeaTunnelDataType[0])); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/handler/AlterTableSchemaEventHandler.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.schema.handler; import org.apache.seatunnel.shade.org.apache.commons.lang3.StringUtils; import org.apache.seatunnel.api.table.catalog.Column; import org.apache.seatunnel.api.table.catalog.TableSchema; import org.apache.seatunnel.api.table.schema.event.AlterTableAddColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableChangeColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableColumnsEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableDropColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableModifyColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableNameEvent; import org.apache.seatunnel.api.table.schema.event.SchemaChangeEvent; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import java.util.Arrays; import java.util.LinkedList; import java.util.List; import java.util.stream.Collectors; public class AlterTableSchemaEventHandler implements TableSchemaChangeEventHandler { private TableSchema schema; @Override public TableSchema get() { return schema; } @Override public TableSchemaChangeEventHandler reset(TableSchema schema) { this.schema = schema; return this; } @Override public TableSchema apply(SchemaChangeEvent event) { AlterTableEvent alterTableEvent = (AlterTableEvent) event; return apply(schema, alterTableEvent); } private TableSchema apply(TableSchema schema, AlterTableEvent 
alterTableEvent) { if (alterTableEvent instanceof AlterTableNameEvent) { return schema; } if (alterTableEvent instanceof AlterTableDropColumnEvent) { return applyDropColumn(schema, (AlterTableDropColumnEvent) alterTableEvent); } if (alterTableEvent instanceof AlterTableModifyColumnEvent) { return applyModifyColumn(schema, (AlterTableModifyColumnEvent) alterTableEvent); } if (alterTableEvent instanceof AlterTableChangeColumnEvent) { return applyChangeColumn(schema, (AlterTableChangeColumnEvent) alterTableEvent); } if (alterTableEvent instanceof AlterTableAddColumnEvent) { return applyAddColumn(schema, (AlterTableAddColumnEvent) alterTableEvent); } if (alterTableEvent instanceof AlterTableColumnsEvent) { TableSchema newSchema = schema; for (AlterTableColumnEvent columnEvent : ((AlterTableColumnsEvent) alterTableEvent).getEvents()) { newSchema = apply(newSchema, columnEvent); } return newSchema; } throw new UnsupportedOperationException( "Unsupported alter table event: " + alterTableEvent); } private TableSchema applyAddColumn( TableSchema schema, AlterTableAddColumnEvent addColumnEvent) { LinkedList originFields = new LinkedList<>(Arrays.asList(schema.getFieldNames())); Column column = addColumnEvent.getColumn(); if (originFields.contains(column.getName())) { return applyModifyColumn( schema, new AlterTableModifyColumnEvent( addColumnEvent.tableIdentifier(), addColumnEvent.getColumn(), addColumnEvent.isFirst(), addColumnEvent.getAfterColumn())); } LinkedList newColumns = new LinkedList<>(schema.getColumns()); if (addColumnEvent.isFirst()) { newColumns.addFirst(column); } else if (addColumnEvent.getAfterColumn() != null) { int index = originFields.indexOf(addColumnEvent.getAfterColumn()); newColumns.add(index + 1, column); } else { newColumns.addLast(column); } return TableSchema.builder() .columns(newColumns) .primaryKey(schema.getPrimaryKey()) .constraintKey(schema.getConstraintKeys()) .build(); } private TableSchema applyDropColumn( TableSchema schema, 
AlterTableDropColumnEvent dropColumnEvent) { List newColumns = schema.getColumns().stream() .filter(c -> !c.getName().equals(dropColumnEvent.getColumn())) .collect(Collectors.toList()); return TableSchema.builder() .columns(newColumns) .primaryKey(schema.getPrimaryKey()) .constraintKey(schema.getConstraintKeys()) .build(); } private TableSchema applyModifyColumn( TableSchema schema, AlterTableModifyColumnEvent modifyColumnEvent) { List fieldNames = Arrays.asList(schema.getFieldNames()); Column modifyColumn = modifyColumnEvent.getColumn(); if (!fieldNames.contains(modifyColumn.getName())) { return schema; } String modifyColumnName = modifyColumn.getName(); int modifyColumnIndex = fieldNames.indexOf(modifyColumnName); Column oldColumn = schema.getColumns().get(modifyColumnIndex); String oldColumnSourceType = oldColumn.getSourceType(); String modifyColumnSourceType = modifyColumn.getSourceType(); if (StringUtils.isNoneEmpty(oldColumnSourceType) && StringUtils.isNoneEmpty(modifyColumnSourceType) && !oldColumnSourceType.split("\\(")[0].equals( modifyColumnSourceType.split("\\(")[0])) { modifyColumnEvent.setTypeChanged(true); } return applyModifyColumn( schema, modifyColumnIndex, modifyColumn, modifyColumnEvent.isFirst(), modifyColumnEvent.getAfterColumn()); } private TableSchema applyChangeColumn( TableSchema schema, AlterTableChangeColumnEvent changeColumnEvent) { String oldColumn = changeColumnEvent.getOldColumn(); int oldColumnIndex = schema.indexOf(oldColumn); // The operation of rename column which only has the name of old column and the name of new // column, // so we need to fill the data type which is the same as the old column. 
Column column = changeColumnEvent.getColumn(); if (column.getDataType() == null) { SeaTunnelDataType fieldType = schema.getColumn(oldColumn).getDataType(); column = column.copy(fieldType); } return applyModifyColumn( schema, oldColumnIndex, column, changeColumnEvent.isFirst(), changeColumnEvent.getAfterColumn()); } private TableSchema applyModifyColumn( TableSchema schema, int columnIndex, Column column, boolean first, String afterColumn) { LinkedList originColumns = new LinkedList<>(schema.getColumns()); if (first) { originColumns.remove(columnIndex); originColumns.addFirst(column); } else if (afterColumn != null) { originColumns.remove(columnIndex); int index = originColumns.stream() .filter(c -> c.getName().equals(afterColumn)) .findFirst() .map(originColumns::indexOf) .get(); originColumns.add(index + 1, column); } else { originColumns.set(columnIndex, column); } return TableSchema.builder() .columns(originColumns) .primaryKey(schema.getPrimaryKey()) .constraintKey(schema.getConstraintKeys()) .build(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/handler/DataTypeChangeEventDispatcher.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.schema.handler; import org.apache.seatunnel.api.table.schema.event.AlterTableAddColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableChangeColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableColumnsEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableDropColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableModifyColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableNameEvent; import org.apache.seatunnel.api.table.schema.event.SchemaChangeEvent; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; import lombok.extern.slf4j.Slf4j; import java.util.HashMap; import java.util.Map; /** @deprecated instead by {@link TableSchemaChangeEventDispatcher} */ @Deprecated @Slf4j public class DataTypeChangeEventDispatcher implements DataTypeChangeEventHandler { private final Map handlers; private SeaTunnelRowType dataType; public DataTypeChangeEventDispatcher() { this.handlers = createHandlers(); } @Override public SeaTunnelRowType get() { return dataType; } @Override public DataTypeChangeEventHandler reset(SeaTunnelRowType dataType) { this.dataType = dataType; return this; } @Override public SeaTunnelRowType apply(SchemaChangeEvent event) { DataTypeChangeEventHandler handler = handlers.get(event.getClass()); if (handler == null) { log.warn("No DataTypeChangeEventHandler for event: {}", event.getClass()); return dataType; } return handler.reset(dataType).apply(event); } private static Map createHandlers() { Map handlers = new HashMap<>(); AlterTableEventHandler alterTableEventHandler = new AlterTableEventHandler(); handlers.put(AlterTableEvent.class, alterTableEventHandler); handlers.put(AlterTableNameEvent.class, alterTableEventHandler); 
handlers.put(AlterTableColumnsEvent.class, alterTableEventHandler); handlers.put(AlterTableAddColumnEvent.class, alterTableEventHandler); handlers.put(AlterTableModifyColumnEvent.class, alterTableEventHandler); handlers.put(AlterTableDropColumnEvent.class, alterTableEventHandler); handlers.put(AlterTableChangeColumnEvent.class, alterTableEventHandler); return handlers; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/handler/DataTypeChangeEventHandler.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.table.schema.handler; import org.apache.seatunnel.api.table.schema.event.SchemaChangeEvent; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; /** @deprecated instead by {@link TableSchemaChangeEventHandler} */ @Deprecated public interface DataTypeChangeEventHandler extends SchemaChangeEventHandler { SeaTunnelRowType get(); DataTypeChangeEventHandler reset(SeaTunnelRowType dataType); default SeaTunnelRowType handle(SchemaChangeEvent event) { if (get() == null) { throw new IllegalStateException("DataTypeChanger not reset"); } try { return apply(event); } finally { reset(null); if (get() != null) { throw new IllegalStateException("DataTypeChanger not reset"); } } } SeaTunnelRowType apply(SchemaChangeEvent event); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/handler/SchemaChangeEventHandler.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/

package org.apache.seatunnel.api.table.schema.handler;

import org.apache.seatunnel.api.table.schema.event.SchemaChangeEvent;

import java.io.Serializable;

/**
 * Serializable callback that processes a {@link SchemaChangeEvent}.
 *
 * @param <T> type of the value produced by handling an event
 */
public interface SchemaChangeEventHandler<T> extends Serializable {

    /**
     * Handles the given schema change event.
     *
     * @param event the event to process
     * @return the result of processing the event
     */
    T handle(SchemaChangeEvent event);
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/handler/TableSchemaChangeEventDispatcher.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.table.schema.handler; import org.apache.seatunnel.api.table.catalog.TableSchema; import org.apache.seatunnel.api.table.schema.event.AlterTableAddColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableChangeColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableColumnsEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableDropColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableModifyColumnEvent; import org.apache.seatunnel.api.table.schema.event.AlterTableNameEvent; import org.apache.seatunnel.api.table.schema.event.SchemaChangeEvent; import lombok.extern.slf4j.Slf4j; import java.util.HashMap; import java.util.Map; @Slf4j public class TableSchemaChangeEventDispatcher implements TableSchemaChangeEventHandler { private final Map handlers; private TableSchema schema; public TableSchemaChangeEventDispatcher() { this.handlers = createHandlers(); } @Override public TableSchema get() { return schema; } @Override public TableSchemaChangeEventHandler reset(TableSchema schema) { this.schema = schema; return this; } @Override public TableSchema apply(SchemaChangeEvent event) { TableSchemaChangeEventHandler handler = handlers.get(event.getClass()); if (handler == null) { log.warn("Not found handler for event: {}", event.getClass()); return schema; } return handler.reset(schema).apply(event); } private static Map createHandlers() { Map handlers = new HashMap<>(); AlterTableSchemaEventHandler alterTableEventHandler = new AlterTableSchemaEventHandler(); handlers.put(AlterTableEvent.class, alterTableEventHandler); handlers.put(AlterTableNameEvent.class, alterTableEventHandler); handlers.put(AlterTableColumnsEvent.class, alterTableEventHandler); handlers.put(AlterTableAddColumnEvent.class, alterTableEventHandler); handlers.put(AlterTableModifyColumnEvent.class, alterTableEventHandler); 
handlers.put(AlterTableDropColumnEvent.class, alterTableEventHandler); handlers.put(AlterTableChangeColumnEvent.class, alterTableEventHandler); return handlers; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/schema/handler/TableSchemaChangeEventHandler.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/

package org.apache.seatunnel.api.table.schema.handler;

import org.apache.seatunnel.api.table.catalog.TableSchema;
import org.apache.seatunnel.api.table.schema.event.SchemaChangeEvent;

/** Handler that turns a {@link SchemaChangeEvent} into an updated {@link TableSchema}. */
public interface TableSchemaChangeEventHandler extends SchemaChangeEventHandler<TableSchema> {

    /** Returns the schema currently held by the handler, or {@code null} if none is set. */
    TableSchema get();

    /** Sets the schema that the next {@link #apply(SchemaChangeEvent)} works on. */
    TableSchemaChangeEventHandler reset(TableSchema schema);

    /**
     * Applies the event to the schema set through {@link #reset(TableSchema)} and clears the held
     * schema afterwards.
     *
     * @throws IllegalStateException if no schema is held before the call, or if one is still held
     *     after {@code reset(null)}
     */
    default TableSchema handle(SchemaChangeEvent event) {
        if (get() == null) {
            throw new IllegalStateException("Handler not reset");
        }

        try {
            return apply(event);
        } finally {
            // Always drop the held schema, even when apply(...) failed.
            reset(null);
            if (get() != null) {
                throw new IllegalStateException("Handler not reset");
            }
        }
    }

    /** Computes the schema that results from applying the event to the held schema. */
    TableSchema apply(SchemaChangeEvent event);
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/ArrayType.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.table.type; import org.apache.seatunnel.common.exception.CommonError; import java.lang.reflect.Array; import java.util.Objects; public class ArrayType implements SeaTunnelDataType { private static final long serialVersionUID = 2L; public static final ArrayType STRING_ARRAY_TYPE = new ArrayType<>(String[].class, BasicType.STRING_TYPE); public static final ArrayType BOOLEAN_ARRAY_TYPE = new ArrayType<>(Boolean[].class, BasicType.BOOLEAN_TYPE); public static final ArrayType BYTE_ARRAY_TYPE = new ArrayType<>(Byte[].class, BasicType.BYTE_TYPE); public static final ArrayType SHORT_ARRAY_TYPE = new ArrayType<>(Short[].class, BasicType.SHORT_TYPE); public static final ArrayType INT_ARRAY_TYPE = new ArrayType<>(Integer[].class, BasicType.INT_TYPE); public static final ArrayType LONG_ARRAY_TYPE = new ArrayType<>(Long[].class, BasicType.LONG_TYPE); public static final ArrayType FLOAT_ARRAY_TYPE = new ArrayType<>(Float[].class, BasicType.FLOAT_TYPE); public static final ArrayType DOUBLE_ARRAY_TYPE = new ArrayType<>(Double[].class, BasicType.DOUBLE_TYPE); public static final ArrayType LOCAL_DATE_ARRAY_TYPE = new ArrayType(LocalTimeType[].class, LocalTimeType.LOCAL_DATE_TYPE); public static final ArrayType LOCAL_TIME_ARRAY_TYPE = new ArrayType(LocalTimeType[].class, LocalTimeType.LOCAL_TIME_TYPE); public static final ArrayType LOCAL_DATE_TIME_ARRAY_TYPE = new ArrayType(LocalTimeType[].class, LocalTimeType.LOCAL_DATE_TIME_TYPE); public static final ArrayType OFFSET_DATE_TIME_ARRAY_TYPE = new ArrayType(LocalTimeType[].class, LocalTimeType.OFFSET_DATE_TIME_TYPE); // -------------------------------------------------------------------------------------------- private final Class arrayClass; private final SeaTunnelDataType elementType; public ArrayType(Class arrayClass, SeaTunnelDataType elementType) { this.arrayClass = arrayClass; this.elementType = elementType; } @SuppressWarnings("unchecked") public static ArrayType of(SeaTunnelDataType 
elementType) { if (elementType == null) { throw CommonError.illegalArgument("elementType is null", "create ArrayType"); } Class arrayClass = (Class) toArrayClass(elementType); return new ArrayType<>(arrayClass, elementType); } private static Class toArrayClass(SeaTunnelDataType elementType) { Class elementClass = elementType.getTypeClass(); return Array.newInstance(elementClass, 0).getClass(); } public SeaTunnelDataType getElementType() { return elementType; } @Override public Class getTypeClass() { return arrayClass; } @Override public SqlType getSqlType() { return SqlType.ARRAY; } @Override public int hashCode() { return Objects.hash(arrayClass, elementType); } @Override public boolean equals(Object obj) { if (obj == this) { return true; } if (!(obj instanceof ArrayType)) { return false; } ArrayType that = (ArrayType) obj; return Objects.equals(arrayClass, that.arrayClass) && Objects.equals(elementType, that.elementType); } @Override public String toString() { return String.format("ARRAY<%s>", elementType); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/BasicType.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.table.type; import java.time.OffsetDateTime; import java.util.Objects; public class BasicType implements SeaTunnelDataType { private static final long serialVersionUID = 2L; public static final BasicType STRING_TYPE = new BasicType<>(String.class, SqlType.STRING); public static final BasicType BOOLEAN_TYPE = new BasicType<>(Boolean.class, SqlType.BOOLEAN); public static final BasicType BYTE_TYPE = new BasicType<>(Byte.class, SqlType.TINYINT); public static final BasicType SHORT_TYPE = new BasicType<>(Short.class, SqlType.SMALLINT); public static final BasicType INT_TYPE = new BasicType<>(Integer.class, SqlType.INT); public static final BasicType LONG_TYPE = new BasicType<>(Long.class, SqlType.BIGINT); public static final BasicType FLOAT_TYPE = new BasicType<>(Float.class, SqlType.FLOAT); public static final BasicType DOUBLE_TYPE = new BasicType<>(Double.class, SqlType.DOUBLE); public static final BasicType VOID_TYPE = new BasicType<>(Void.class, SqlType.NULL); public static final LocalTimeType OFFSET_DATE_TIME_TYPE = LocalTimeType.OFFSET_DATE_TIME_TYPE; // -------------------------------------------------------------------------------------------- /** The physical type class. 
*/ private final Class typeClass; private final SqlType sqlType; protected BasicType(Class typeClass, SqlType sqlType) { this.typeClass = typeClass; this.sqlType = sqlType; } @Override public Class getTypeClass() { return this.typeClass; } @Override public SqlType getSqlType() { return this.sqlType; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (!(obj instanceof BasicType)) { return false; } BasicType that = (BasicType) obj; return Objects.equals(typeClass, that.typeClass) && Objects.equals(sqlType, that.sqlType); } @Override public int hashCode() { return Objects.hash(typeClass, sqlType); } @Override public String toString() { return sqlType.toString(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/CommonOptions.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.type; import org.apache.seatunnel.api.table.catalog.Column; import lombok.Getter; /** * Common option keys of SeaTunnel {@link Column#getOptions()} / {@link SeaTunnelRow#getOptions()}. * Used to store some extra information of the column value. 
*/ @Getter public enum CommonOptions { /** * The key of {@link Column#getOptions()} to specify the column value is a json format string. */ JSON("Json", false), /** The key of {@link Column#getOptions()} to specify the column value is a metadata field. */ METADATA("Metadata", false), /** * The key of {@link SeaTunnelRow#getOptions()} to store the partition value of the row value. */ PARTITION("Partition", true), /** * The key of {@link SeaTunnelRow#getOptions()} to store the DATABASE value of the row value. */ DATABASE("Database", true), /** The key of {@link SeaTunnelRow#getOptions()} to store the TABLE value of the row value. */ TABLE("Table", true), /** * The key of {@link SeaTunnelRow#getOptions()} to store the ROW_KIND value of the row value. */ ROW_KIND("RowKind", true), /** * The key of {@link SeaTunnelRow#getOptions()} to store the EVENT_TIME value of the row value. * And the data should be milliseconds. */ EVENT_TIME("EventTime", true), /** * The key of {@link SeaTunnelRow#getOptions()} to store the DELAY value of the row value. And * the data should be milliseconds. */ DELAY("Delay", true), /** * The key of {@link SeaTunnelRow#getOptions()} to indicate whether the row represents a * complete file. */ IS_COMPLETE("is_complete", true), /** * The key of {@link SeaTunnelRow#getOptions()} to indicate whether the row contains binary * format data. 
*/ IS_BINARY_FORMAT("is_binary_format", true); private final String name; private final boolean supportMetadataTrans; CommonOptions(String name, boolean supportMetadataTrans) { this.name = name; this.supportMetadataTrans = supportMetadataTrans; } public static CommonOptions fromName(String name) { for (CommonOptions option : CommonOptions.values()) { if (option.getName().equals(name)) { return option; } } throw new IllegalArgumentException("Unknown option name: " + name); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/CompositeType.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.type; import java.util.List; public interface CompositeType extends SeaTunnelDataType { List> getChildren(); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/DecimalArrayType.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.table.type;

/**
 * {@link ArrayType} whose element type is a {@link DecimalType}; the array class passed to the
 * parent constructor is always {@link #arrayClass}.
 */
public class DecimalArrayType extends ArrayType {
    private static final long serialVersionUID = 1L;

    // NOTE(review): this is the array class of the *data type* (DecimalType[]), not of the value
    // class; presumably intentional, but confirm against the code that reads getTypeClass().
    // NOTE(review): the parent type and this Class are used raw here - confirm the intended type
    // arguments.
    public static final Class arrayClass = DecimalType[].class;

    /** Creates a decimal array type with the given element type (carries precision and scale). */
    public DecimalArrayType(DecimalType elementType) {
        super(arrayClass, elementType);
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/DecimalType.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.table.type; import java.math.BigDecimal; import java.util.Objects; public final class DecimalType extends BasicType { private static final long serialVersionUID = 1L; private final int precision; private final int scale; public DecimalType(int precision, int scale) { super(BigDecimal.class, SqlType.DECIMAL); this.precision = precision; this.scale = scale; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (!(o instanceof DecimalType)) { return false; } DecimalType that = (DecimalType) o; return this.precision == that.precision && this.scale == that.scale; } @Override public int hashCode() { return Objects.hash(precision, scale); } @Override public String toString() { return String.format("Decimal(%d, %d)", precision, scale); } public int getPrecision() { return precision; } public int getScale() { return scale; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/LocalTimeType.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.table.type; import java.time.LocalDate; import java.time.LocalDateTime; import java.time.LocalTime; import java.time.OffsetDateTime; import java.time.temporal.Temporal; import java.util.Objects; public class LocalTimeType implements SeaTunnelDataType { private static final long serialVersionUID = 2L; public static final LocalTimeType LOCAL_DATE_TYPE = new LocalTimeType<>(LocalDate.class, SqlType.DATE); public static final LocalTimeType LOCAL_TIME_TYPE = new LocalTimeType<>(LocalTime.class, SqlType.TIME); public static final LocalTimeType LOCAL_DATE_TIME_TYPE = new LocalTimeType<>(LocalDateTime.class, SqlType.TIMESTAMP); public static final LocalTimeType OFFSET_DATE_TIME_TYPE = new LocalTimeType<>(OffsetDateTime.class, SqlType.TIMESTAMP_TZ); private final Class typeClass; private final SqlType sqlType; private LocalTimeType(Class typeClass, SqlType sqlType) { this.typeClass = typeClass; this.sqlType = sqlType; } @Override public Class getTypeClass() { return typeClass; } @Override public SqlType getSqlType() { return this.sqlType; } @Override public int hashCode() { return Objects.hash(typeClass); } @Override public boolean equals(Object obj) { if (obj == this) { return true; } if (!(obj instanceof LocalTimeType)) { return false; } LocalTimeType that = (LocalTimeType) obj; return Objects.equals(typeClass, that.typeClass); } @Override public String toString() { return sqlType.toString(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/MapType.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.type; import org.apache.seatunnel.shade.com.google.common.collect.Lists; import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.Objects; import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull; public class MapType implements CompositeType> { private static final List SUPPORTED_KEY_TYPES = Arrays.asList( SqlType.NULL, SqlType.BOOLEAN, SqlType.TINYINT, SqlType.SMALLINT, SqlType.INT, SqlType.BIGINT, SqlType.DATE, SqlType.TIME, SqlType.TIMESTAMP, SqlType.TIMESTAMP_TZ, SqlType.FLOAT, SqlType.DOUBLE, SqlType.STRING, SqlType.DECIMAL); private final SeaTunnelDataType keyType; private final SeaTunnelDataType valueType; public MapType(SeaTunnelDataType keyType, SeaTunnelDataType valueType) { checkNotNull(keyType, "The key type is required."); checkNotNull(valueType, "The value type is required."); checkArgument( SUPPORTED_KEY_TYPES.contains(keyType.getSqlType()), "Unsupported key types: %s", keyType); this.keyType = keyType; this.valueType = valueType; } public SeaTunnelDataType getKeyType() { return keyType; } public SeaTunnelDataType getValueType() { return valueType; } @SuppressWarnings("unchecked") @Override public Class> getTypeClass() { return (Class>) (Class) Map.class; } @Override public SqlType getSqlType() { 
return SqlType.MAP; } @Override public List> getChildren() { return Lists.newArrayList(this.keyType, this.valueType); } @Override public boolean equals(Object obj) { if (obj == this) { return true; } if (!(obj instanceof MapType)) { return false; } MapType that = (MapType) obj; return Objects.equals(keyType, that.keyType) && Objects.equals(valueType, that.valueType); } @Override public int hashCode() { return Objects.hash(keyType, valueType); } @Override public String toString() { return String.format("Map<%s, %s>", keyType, valueType); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/MetadataUtil.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.table.type; import org.apache.seatunnel.api.table.catalog.TablePath; import java.util.ArrayList; import java.util.List; import java.util.stream.Stream; import static org.apache.seatunnel.api.table.type.CommonOptions.DELAY; import static org.apache.seatunnel.api.table.type.CommonOptions.EVENT_TIME; import static org.apache.seatunnel.api.table.type.CommonOptions.IS_BINARY_FORMAT; import static org.apache.seatunnel.api.table.type.CommonOptions.IS_COMPLETE; import static org.apache.seatunnel.api.table.type.CommonOptions.PARTITION; public class MetadataUtil { public static final List METADATA_FIELDS; static { METADATA_FIELDS = new ArrayList<>(); Stream.of(CommonOptions.values()) .filter(CommonOptions::isSupportMetadataTrans) .map(CommonOptions::getName) .forEach(METADATA_FIELDS::add); } public static void setDelay(SeaTunnelRow row, Long delay) { row.getOptions().put(DELAY.getName(), delay); } public static void setPartition(SeaTunnelRow row, String[] partition) { row.getOptions().put(PARTITION.getName(), partition); } public static void setEventTime(SeaTunnelRow row, Long delay) { row.getOptions().put(EVENT_TIME.getName(), delay); } public static void setBinaryRowComplete(SeaTunnelRow row) { row.getOptions().put(IS_COMPLETE.getName(), true); } public static void setBinaryFormat(SeaTunnelRow row) { row.getOptions().put(IS_BINARY_FORMAT.getName(), true); } public static boolean isComplete(Object row) { return checkOption(row, IS_COMPLETE.getName(), false); } public static boolean isBinaryFormat(Object row) { return checkOption(row, IS_BINARY_FORMAT.getName(), false); } public static String getDatabase(SeaTunnelRowAccessor row) { if (row.getTableId() == null) { return null; } return TablePath.of(row.getTableId()).getDatabaseName(); } public static String getTable(SeaTunnelRowAccessor row) { if (row.getTableId() == null) { return null; } return TablePath.of(row.getTableId()).getTableName(); } public static String 
getRowKind(SeaTunnelRowAccessor row) { return row.getRowKind().shortString(); } public static String[] getPartition(SeaTunnelRowAccessor row) { return (String[]) row.getOptions().get(PARTITION.getName()); } public static boolean isMetadataField(String fieldName) { return METADATA_FIELDS.contains(fieldName); } public static boolean checkOption(T row, String optionKey, boolean defaultValue) { if (row instanceof SeaTunnelRow) { return ((SeaTunnelRow) row) .getOptions() .getOrDefault(optionKey, defaultValue) .equals(true); } else if (row instanceof SeaTunnelRowAccessor) { return ((SeaTunnelRowAccessor) row) .getOptions() .getOrDefault(optionKey, defaultValue) .equals(true); } throw new IllegalArgumentException("Unsupported row type: " + row.getClass().getName()); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/MultipleRowType.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.table.type; import lombok.Getter; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; public class MultipleRowType implements SeaTunnelDataType, Iterable> { private final Map rowTypeMap; @Getter private String[] tableIds; public MultipleRowType(String[] tableIds, SeaTunnelRowType[] rowTypes) { Map rowTypeMap = new LinkedHashMap<>(); for (int i = 0; i < tableIds.length; i++) { rowTypeMap.put(tableIds[i], rowTypes[i]); } this.tableIds = tableIds; this.rowTypeMap = rowTypeMap; } public MultipleRowType(Map rowTypeMap) { this.tableIds = rowTypeMap.keySet().toArray(new String[0]); this.rowTypeMap = rowTypeMap; } public SeaTunnelRowType getRowType(String tableId) { return rowTypeMap.get(tableId); } @Override public Class getTypeClass() { return SeaTunnelRow.class; } @Override public SqlType getSqlType() { return SqlType.MULTIPLE_ROW; } @Override public Iterator> iterator() { return rowTypeMap.entrySet().iterator(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/PrimitiveByteArrayType.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.table.type; public class PrimitiveByteArrayType implements SeaTunnelDataType { public static final PrimitiveByteArrayType INSTANCE = new PrimitiveByteArrayType(); private PrimitiveByteArrayType() {} @Override public Class getTypeClass() { return byte[].class; } @Override public SqlType getSqlType() { return SqlType.BYTES; } @Override public int hashCode() { return byte[].class.hashCode(); } @Override public boolean equals(Object obj) { if (obj == this) { return true; } return obj instanceof PrimitiveByteArrayType; } @Override public String toString() { return SqlType.BYTES.toString(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/Record.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
/** Lists all kinds of changes that a row can describe in a changelog. */
public enum RowKind {

    // Note: Enums have no stable hash code across different JVMs, use toByteValue() for
    // this purpose.

    /** Insertion operation. */
    INSERT("+I", (byte) 0),

    /**
     * Update operation with the previous content of the updated row.
     *
     * <p>This kind SHOULD occur together with {@link #UPDATE_AFTER} for modelling an update that
     * needs to retract the previous row first. It is useful in cases of a non-idempotent update,
     * i.e., an update of a row that is not uniquely identifiable by a key.
     */
    UPDATE_BEFORE("-U", (byte) 1),

    /**
     * Update operation with new content of the updated row.
     *
     * <p>This kind CAN occur together with {@link #UPDATE_BEFORE} for modelling an update that
     * needs to retract the previous row first. OR it describes an idempotent update, i.e., an
     * update of a row that is uniquely identifiable by a key.
     */
    UPDATE_AFTER("+U", (byte) 2),

    /** Deletion operation. */
    DELETE("-D", (byte) 3);

    /** All constants, cached once so that lookups do not clone the values array on every call. */
    private static final RowKind[] ALL_KINDS = values();

    private final String shortString;

    private final byte value;

    /**
     * Creates a {@link RowKind} enum with the given short string and byte value representation of
     * the {@link RowKind}.
     */
    RowKind(String shortString, byte value) {
        this.shortString = shortString;
        this.value = value;
    }

    /**
     * Returns a short string representation of this {@link RowKind}.
     *
     * <ul>
     *   <li>"+I" represents {@link #INSERT}.
     *   <li>"-U" represents {@link #UPDATE_BEFORE}.
     *   <li>"+U" represents {@link #UPDATE_AFTER}.
     *   <li>"-D" represents {@link #DELETE}.
     * </ul>
     */
    public String shortString() {
        return shortString;
    }

    /**
     * Returns the byte value representation of this {@link RowKind}. The byte value is used for
     * serialization and deserialization.
     *
     * <ul>
     *   <li>"0" represents {@link #INSERT}.
     *   <li>"1" represents {@link #UPDATE_BEFORE}.
     *   <li>"2" represents {@link #UPDATE_AFTER}.
     *   <li>"3" represents {@link #DELETE}.
     * </ul>
     */
    public byte toByteValue() {
        return value;
    }

    /**
     * Creates a {@link RowKind} from the given byte value. Each {@link RowKind} has a byte value
     * representation.
     *
     * @throws UnsupportedOperationException if no constant carries the given byte value
     * @see #toByteValue() for mapping of byte value and {@link RowKind}.
     */
    public static RowKind fromByteValue(byte value) {
        for (RowKind candidate : ALL_KINDS) {
            if (candidate.value == value) {
                return candidate;
            }
        }
        throw new UnsupportedOperationException(
                "Unsupported byte value '" + value + "' for row kind.");
    }
}
*/ SqlType getSqlType(); } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRow.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.type; import java.io.Serializable; import java.nio.ByteBuffer; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.Objects; /** SeaTunnel row type. */ public final class SeaTunnelRow implements Serializable { private static final long serialVersionUID = -1L; /** Table identifier. */ private String tableId = ""; /** The kind of change that a row describes in a changelog. */ private RowKind rowKind = RowKind.INSERT; /** The array to store the actual internal format values. 
*/ private final Object[] fields; private Map options; private volatile int size; public SeaTunnelRow(int arity) { this.fields = new Object[arity]; } public SeaTunnelRow(Object[] fields) { this.fields = fields; } public void setField(int pos, Object value) { this.fields[pos] = value; } public void setTableId(String tableId) { this.tableId = tableId; } public void setRowKind(RowKind rowKind) { this.rowKind = rowKind; } public void setOptions(Map options) { this.options = options; } public int getArity() { return fields.length; } public String getTableId() { return tableId; } public RowKind getRowKind() { return this.rowKind; } public Map getOptions() { if (options == null) { options = new HashMap<>(); } return options; } public Object[] getFields() { return fields; } public Object getField(int pos) { return this.fields[pos]; } public SeaTunnelRow copy() { Object[] newFields = new Object[this.getArity()]; System.arraycopy(this.getFields(), 0, newFields, 0, newFields.length); SeaTunnelRow newRow = new SeaTunnelRow(newFields); newRow.setRowKind(this.getRowKind()); newRow.setTableId(this.getTableId()); newRow.setOptions(this.getOptions()); return newRow; } public SeaTunnelRow copy(int[] indexMapping) { Object[] newFields = new Object[indexMapping.length]; for (int i = 0; i < indexMapping.length; i++) { newFields[i] = this.fields[indexMapping[i]]; } SeaTunnelRow newRow = new SeaTunnelRow(newFields); newRow.setRowKind(this.getRowKind()); newRow.setTableId(this.getTableId()); newRow.setOptions(this.getOptions()); return newRow; } public boolean isNullAt(int pos) { return this.fields[pos] == null; } public int getBytesSize(SeaTunnelRowType rowType) { if (size == 0) { int s = 0; for (int i = 0; i < fields.length; i++) { s += getBytesForValue(fields[i], rowType.getFieldType(i)); } size = s; } return size; } /** faster version of {@link #getBytesSize(SeaTunnelRowType)}. 
*/ private int getBytesForValue(Object v, SeaTunnelDataType dataType) { if (v == null) { return 0; } SqlType sqlType = dataType.getSqlType(); switch (sqlType) { case STRING: return ((String) v).length(); case BOOLEAN: case TINYINT: return 1; case SMALLINT: return 2; case INT: case FLOAT: return 4; case BIGINT: case DOUBLE: return 8; case DECIMAL: return 36; case NULL: return 0; case BYTES: return ((byte[]) v).length; case DATE: return 24; case TIME: return 12; case TIMESTAMP: case TIMESTAMP_TZ: return 48; case FLOAT_VECTOR: case FLOAT16_VECTOR: case BFLOAT16_VECTOR: case BINARY_VECTOR: return ((ByteBuffer) v).capacity(); case SPARSE_FLOAT_VECTOR: return ((Map) v).entrySet().size() * 8; case ARRAY: SeaTunnelDataType elementType = ((ArrayType) dataType).getElementType(); if (elementType instanceof DecimalType) { return ((Object[]) v).length * 36; } if (elementType instanceof LocalTimeType) { SqlType eleSqlType = elementType.getSqlType(); switch (eleSqlType) { case DATE: return ((Object[]) v).length * 24; case TIME: return ((Object[]) v).length * 12; case TIMESTAMP: case TIMESTAMP_TZ: return ((Object[]) v).length * 48; default: throw new UnsupportedOperationException( "Unsupported type in LocalTimeArrayType: " + eleSqlType); } } return getBytesForArray(v, ((ArrayType) dataType).getElementType()); case MAP: int size = 0; MapType mapType = ((MapType) dataType); for (Map.Entry entry : ((Map) v).entrySet()) { size += getBytesForValue(entry.getKey(), mapType.getKeyType()) + getBytesForValue(entry.getValue(), mapType.getValueType()); } return size; case ROW: int rowSize = 0; SeaTunnelRowType rowType = ((SeaTunnelRowType) dataType); SeaTunnelDataType[] types = rowType.getFieldTypes(); SeaTunnelRow row = (SeaTunnelRow) v; for (int i = 0; i < types.length; i++) { rowSize += getBytesForValue(row.fields[i], types[i]); } return rowSize; default: throw new UnsupportedOperationException("Unsupported type: " + sqlType); } } private int getBytesForArray(Object v, SeaTunnelDataType 
dataType) { switch (dataType.getSqlType()) { case STRING: int s = 0; for (String i : ((String[]) v)) { s += i == null ? 0 : i.length(); } return s; case BOOLEAN: return getArrayNotNullSize((Boolean[]) v); case TINYINT: return getArrayNotNullSize((Byte[]) v); case SMALLINT: return getArrayNotNullSize((Short[]) v) * 2; case INT: return getArrayNotNullSize((Integer[]) v) * 4; case FLOAT: return getArrayNotNullSize((Float[]) v) * 4; case BIGINT: return getArrayNotNullSize((Long[]) v) * 8; case DOUBLE: return getArrayNotNullSize((Double[]) v) * 8; case ARRAY: int total = 0; for (Object elem : (Object[]) v) { total += getBytesForValue(elem, dataType); } return total; case MAP: return getArrayMapNotNullSize(v); case NULL: default: return 0; } } private int getArrayNotNullSize(Object[] values) { int c = 0; for (Object value : values) { if (value != null) { c++; } } return c; } private int getArrayMapNotNullSize(Object v) { int size = 0; if (Objects.nonNull(v)) { for (Map o : (Map[]) v) { for (Map.Entry entry : ((Map) o).entrySet()) { size += getBytesForValue(entry.getKey()) + getBytesForValue(entry.getValue()); } } } return size; } public int getBytesSize() { if (size == 0) { int s = 0; for (Object field : fields) { s += getBytesForValue(field); } size = s; } return size; } private int getBytesForValue(Object v) { if (v == null) { return 0; } String clazz = v.getClass().getSimpleName(); switch (clazz) { case "String": return ((String) v).length(); case "Boolean": case "Byte": return 1; case "Short": return 2; case "Integer": case "Float": return 4; case "Long": case "Double": return 8; case "BigDecimal": return 36; case "byte[]": return ((byte[]) v).length; case "LocalDate": return 24; case "LocalTime": return 12; case "LocalDateTime": case "OffsetDateTime": return 48; case "String[]": return getBytesForArray(v, BasicType.STRING_TYPE); case "Boolean[]": return getBytesForArray(v, BasicType.BOOLEAN_TYPE); case "Byte[]": return getBytesForArray(v, BasicType.BYTE_TYPE); case 
"Short[]": return getBytesForArray(v, BasicType.SHORT_TYPE); case "Integer[]": return getBytesForArray(v, BasicType.INT_TYPE); case "Long[]": return getBytesForArray(v, BasicType.LONG_TYPE); case "Float[]": return getBytesForArray(v, BasicType.FLOAT_TYPE); case "Double[]": return getBytesForArray(v, BasicType.DOUBLE_TYPE); case "Map[]": return getBytesForArray( v, new MapType<>(BasicType.STRING_TYPE, BasicType.INT_TYPE)); case "HashMap": case "LinkedHashMap": int size = 0; for (Map.Entry entry : ((Map) v).entrySet()) { size += getBytesForValue(entry.getKey()) + getBytesForValue(entry.getValue()); } return size; case "HeapByteBuffer": case "ByteBuffer": return ((ByteBuffer) v).capacity(); case "SeaTunnelRow": int rowSize = 0; SeaTunnelRow row = (SeaTunnelRow) v; for (int i = 0; i < row.fields.length; i++) { rowSize += getBytesForValue(row.fields[i]); } return rowSize; default: if (v.getClass().isArray() && v instanceof Object[]) { int sum = 0; for (Object o : (Object[]) v) { sum += getBytesForValue(o); } return sum; } if (v instanceof Map) { int mapSize = 0; for (Map.Entry entry : ((Map) v).entrySet()) { mapSize += getBytesForValue(entry.getKey()) + getBytesForValue(entry.getValue()); } return mapSize; } throw new UnsupportedOperationException("Unsupported type: " + clazz); } } @Override public boolean equals(Object o) { if (this == o) { return true; } if (!(o instanceof SeaTunnelRow)) { return false; } SeaTunnelRow that = (SeaTunnelRow) o; return Objects.equals(tableId, that.tableId) && rowKind == that.rowKind && Arrays.deepEquals(fields, that.fields); } @Override public int hashCode() { int result = Objects.hash(tableId, rowKind); result = 31 * result + Arrays.deepHashCode(fields); return result; } @Override public String toString() { return "SeaTunnelRow{" + "tableId=" + tableId + ", kind=" + rowKind.shortString() + ", fields=" + Arrays.toString(fields) + '}'; } } ================================================ FILE: 
seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRowAccessor.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.type; import lombok.AllArgsConstructor; import java.util.Map; @AllArgsConstructor public class SeaTunnelRowAccessor { private final SeaTunnelRow row; public int getArity() { return row.getArity(); } public String getTableId() { return row.getTableId(); } public RowKind getRowKind() { return row.getRowKind(); } public Object getField(int pos) { return row.getField(pos); } public Object[] getFields() { return row.getFields(); } public Map getOptions() { return row.getOptions(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRowType.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.type; import java.util.Arrays; import java.util.List; import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; public class SeaTunnelRowType implements CompositeType { private static final long serialVersionUID = 2L; /** The field name of the {@link SeaTunnelRow}. */ private final String[] fieldNames; /** The type of the field. */ private final SeaTunnelDataType[] fieldTypes; public SeaTunnelRowType(String[] fieldNames, SeaTunnelDataType[] fieldTypes) { checkArgument( fieldNames.length == fieldTypes.length, "The number of field names must be the same as the number of field types."); this.fieldNames = fieldNames; this.fieldTypes = fieldTypes; } @Override public Class getTypeClass() { return SeaTunnelRow.class; } @Override public SqlType getSqlType() { return SqlType.ROW; } public String[] getFieldNames() { return fieldNames; } public SeaTunnelDataType[] getFieldTypes() { return fieldTypes; } @Override public List> getChildren() { return Arrays.asList(fieldTypes); } public int getTotalFields() { return fieldTypes.length; } public String getFieldName(int index) { return fieldNames[index]; } public SeaTunnelDataType getFieldType(int index) { return fieldTypes[index]; } public int indexOf(String fieldName) { return indexOf(fieldName, true); } public int indexOf(String fieldName, boolean throwExceptionWhenNotFound) { for (int i = 0; i < 
fieldNames.length; i++) { if (fieldNames[i].equals(fieldName)) { return i; } } if (throwExceptionWhenNotFound) { throw new IllegalArgumentException(String.format("can't find field [%s]", fieldName)); } else { return -1; } } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (!(obj instanceof SeaTunnelRowType)) { return false; } SeaTunnelRowType that = (SeaTunnelRowType) obj; return Arrays.equals(fieldNames, that.fieldNames) && Arrays.equals(fieldTypes, that.fieldTypes); } @Override public int hashCode() { int result = Arrays.hashCode(fieldNames); result = 31 * result + Arrays.hashCode(fieldTypes); return result; } @Override public String toString() { StringBuilder builder = new StringBuilder("ROW<"); for (int i = 0; i < fieldNames.length; i++) { if (i > 0) { builder.append(","); } builder.append(fieldNames[i]).append(" ").append(fieldTypes[i]); } return builder.append(">").toString(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SqlType.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.type; /** The sql type of {@link SeaTunnelDataType}. 
*/ public enum SqlType { ARRAY, MAP, STRING, BOOLEAN, TINYINT, SMALLINT, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, NULL, BYTES, DATE, TIME, TIMESTAMP, TIMESTAMP_TZ, BINARY_VECTOR, FLOAT_VECTOR, FLOAT16_VECTOR, BFLOAT16_VECTOR, SPARSE_FLOAT_VECTOR, ROW, MULTIPLE_ROW; } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/TypeUtil.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.type; public class TypeUtil { /** Check if the data type can be converted to another data type. 
*/ public static boolean canConvert(SeaTunnelDataType from, SeaTunnelDataType to) { // any type can be converted to string if (from == to || to.getSqlType() == SqlType.STRING) { return true; } if (from.getSqlType() == SqlType.TINYINT) { return to.getSqlType() == SqlType.SMALLINT || to.getSqlType() == SqlType.INT || to.getSqlType() == SqlType.BIGINT; } if (from.getSqlType() == SqlType.SMALLINT) { return to.getSqlType() == SqlType.INT || to.getSqlType() == SqlType.BIGINT; } if (from.getSqlType() == SqlType.INT) { return to.getSqlType() == SqlType.BIGINT; } if (from.getSqlType() == SqlType.FLOAT) { return to.getSqlType() == SqlType.DOUBLE || to.getSqlType() == SqlType.DECIMAL; } if (from.getSqlType() == SqlType.DOUBLE) { return to.getSqlType() == SqlType.DECIMAL; } return false; } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/VectorType.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.type; import org.apache.seatunnel.api.annotation.Experimental; import java.nio.ByteBuffer; import java.util.Map; import java.util.Objects; /** * VectorType represents a vector type in SeaTunnel. * *

    Experimental feature, use with caution */ @Experimental public class VectorType implements SeaTunnelDataType { private static final long serialVersionUID = 2L; public static final VectorType VECTOR_FLOAT_TYPE = new VectorType<>(ByteBuffer.class, SqlType.FLOAT_VECTOR); public static final VectorType VECTOR_SPARSE_FLOAT_TYPE = new VectorType<>(Map.class, SqlType.SPARSE_FLOAT_VECTOR); public static final VectorType VECTOR_BINARY_TYPE = new VectorType<>(ByteBuffer.class, SqlType.BINARY_VECTOR); public static final VectorType VECTOR_FLOAT16_TYPE = new VectorType<>(ByteBuffer.class, SqlType.FLOAT16_VECTOR); public static final VectorType VECTOR_BFLOAT16_TYPE = new VectorType<>(ByteBuffer.class, SqlType.BFLOAT16_VECTOR); // -------------------------------------------------------------------------------------------- /** The physical type class. */ private final Class typeClass; private final SqlType sqlType; protected VectorType(Class typeClass, SqlType sqlType) { this.typeClass = typeClass; this.sqlType = sqlType; } @Override public Class getTypeClass() { return this.typeClass; } @Override public SqlType getSqlType() { return this.sqlType; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (!(obj instanceof VectorType)) { return false; } VectorType that = (VectorType) obj; return Objects.equals(typeClass, that.typeClass) && Objects.equals(sqlType, that.sqlType); } @Override public int hashCode() { return Objects.hash(typeClass, sqlType); } @Override public String toString() { return sqlType.toString(); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/tracing/MDCCallable.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.tracing; import java.util.concurrent.Callable; import java.util.function.Supplier; /** * Callable that sets MDC context before calling the delegate and clears it afterwards. * * @param */ public class MDCCallable implements Callable { private final Supplier contextSupplier; private final Callable delegate; public MDCCallable(Callable delegate) { this(MDCContext.current(), delegate); } public MDCCallable(MDCContext context, Callable delegate) { this(() -> context, delegate); } public MDCCallable(Supplier contextSupplier, Callable delegate) { this.contextSupplier = contextSupplier; this.delegate = delegate; } @Override public V call() throws Exception { try (MDCContext ignored = contextSupplier.get().activate()) { return delegate.call(); } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/tracing/MDCComparator.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.tracing; import java.util.Comparator; import java.util.function.Supplier; public class MDCComparator implements Comparator { private final Supplier contextSupplier; private final Comparator delegate; public MDCComparator(Comparator delegate) { this(MDCContext.current(), delegate); } public MDCComparator(MDCContext context, Comparator delegate) { this(() -> context, delegate); } public MDCComparator(Supplier contextSupplier, Comparator delegate) { this.contextSupplier = contextSupplier; this.delegate = delegate; } @Override public int compare(T o1, T o2) { try (MDCContext ignored = contextSupplier.get().activate()) { return delegate.compare(o1, o2); } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/tracing/MDCConsumer.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.tracing; import java.util.function.Consumer; import java.util.function.Supplier; public class MDCConsumer implements Consumer { private final Supplier contextSupplier; private final Consumer delegate; public MDCConsumer(Consumer delegate) { this(MDCContext.current(), delegate); } public MDCConsumer(MDCContext context, Consumer delegate) { this(() -> context, delegate); } public MDCConsumer(Supplier contextSupplier, Consumer delegate) { this.contextSupplier = contextSupplier; this.delegate = delegate; } @Override public void accept(T t) { try (MDCContext ignored = contextSupplier.get().activate()) { delegate.accept(t); } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/tracing/MDCContext.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.tracing; import org.slf4j.MDC; import lombok.EqualsAndHashCode; import lombok.extern.slf4j.Slf4j; import java.io.Closeable; import java.io.Serializable; /** * MDC context for tracing. * *

    <p>reference: https://www.slf4j.org/manual.html#mdc
 *
 * <p>Example:
 *
 * <pre>
 *     try (MDCContext ctx = MDCContext.of(jobId, pipelineId, taskId).activate()) {
 *          // do something
 *          new Thread(new MDCRunnable(MDCContext.current(), new Runnable() {
 *             @Override
 *             public void run() {
 *                  // do something
 *             }
 *          }))
 *          .start();
 *     }
 *     // MDC context will be restored after the try block
 * </pre>
    */
@Slf4j
@EqualsAndHashCode
public class MDCContext implements Serializable, Closeable {
    /** Shared instance returned when no job id is present; activating it does nothing. */
    private static final MDCContext EMPTY = new MDCContext(null, null, null);

    /** String form of {@link #EMPTY}, see {@link #toString()} and {@link #valueOf(String)}. */
    private static final String EMPTY_TO_STRING = "NA";

    public static final String JOB_ID = "ST-JID";
    public static final String PIPELINE_ID = "ST-PID";
    public static final String TASK_ID = "ST-TID";

    private final Long jobId;
    private final Long pipelineId;
    private final Long taskId;

    /**
     * The context that was current when this one was activated; {@code null} while this
     * context is not activated.
     */
    private transient volatile MDCContext toRestore;

    public MDCContext(Long jobId, Long pipelineId, Long taskId) {
        this.jobId = jobId;
        this.pipelineId = pipelineId;
        this.taskId = taskId;
    }

    /**
     * Remembers the MDC state of the calling thread and then puts the non-null ids of this
     * context into the MDC.
     *
     * @return this context, so the call can be used as a try-with-resources resource
     * @throws IllegalStateException if this context is currently activated
     */
    public synchronized MDCContext activate() {
        if (this == EMPTY) {
            return this;
        }

        if (this.toRestore != null) {
            throw new IllegalStateException("MDCContext is already activated");
        }
        this.toRestore = current();

        try {
            if (jobId != null) {
                MDC.put(JOB_ID, String.valueOf(jobId));
            }
            if (pipelineId != null) {
                MDC.put(PIPELINE_ID, String.valueOf(pipelineId));
            }
            if (taskId != null) {
                MDC.put(TASK_ID, String.valueOf(taskId));
            }
        } catch (Throwable e) {
            log.error("Failed to put MDC context", e);
            throw e;
        }
        return this;
    }

    /**
     * Removes the tracing ids from the MDC and re-activates the context that was current when
     * {@link #activate()} was called.
     *
     * <p>After this call the context is no longer considered activated, so it may be
     * activated again.
     *
     * @return this context
     * @throws IllegalStateException if this context is not activated
     */
    public synchronized MDCContext deactivate() {
        if (this == EMPTY) {
            return this;
        }

        final MDCContext previous = this.toRestore;
        if (previous == null) {
            throw new IllegalStateException("MDCContext is not activated");
        }

        try {
            MDC.remove(JOB_ID);
            MDC.remove(PIPELINE_ID);
            MDC.remove(TASK_ID);
        } catch (Throwable e) {
            log.error("Failed to clear MDC context", e);
            throw e;
        }

        // Clear the marker before restoring: without this the context would stay "activated"
        // forever and every later activate() would fail with IllegalStateException.
        this.toRestore = null;
        previous.activate();

        return this;
    }

    /** Same as {@link #deactivate()}; allows use in try-with-resources. */
    @Override
    public void close() {
        deactivate();
    }

    /**
     * Formats this context as {@code jobId/pipelineId/taskId}, writing {@code 0} for an absent
     * pipeline or task id; the empty context is formatted as {@code NA}.
     */
    @Override
    public String toString() {
        if (this == EMPTY) {
            return EMPTY_TO_STRING;
        }
        return String.format(
                "%d/%d/%d",
                jobId, pipelineId == null ? 0 : pipelineId, taskId == null ? 0 : taskId);
    }

    public static MDCContext of(long jobId) {
        return new MDCContext(jobId, null, null);
    }

    public static MDCContext of(long jobId, long pipelineId) {
        return new MDCContext(jobId, pipelineId, null);
    }

    public static MDCContext of(long jobId, long pipelineId, long taskId) {
        return new MDCContext(jobId, pipelineId, taskId);
    }

    /** Creates a new, not yet activated context carrying the same ids as {@code context}. */
    public static MDCContext of(MDCContext context) {
        return new MDCContext(context.jobId, context.pipelineId, context.taskId);
    }

    /**
     * Reads the tracing ids from the MDC of the calling thread.
     *
     * @return a new context with those ids, or the empty context when no job id is set
     */
    public static MDCContext current() {
        String jobId = MDC.get(JOB_ID);
        if (jobId == null) {
            return EMPTY;
        }

        String pipelineId = MDC.get(PIPELINE_ID);
        String taskId = MDC.get(TASK_ID);
        return new MDCContext(
                Long.parseLong(jobId),
                pipelineId != null ? Long.parseLong(pipelineId) : null,
                taskId != null ? Long.parseLong(taskId) : null);
    }

    /**
     * Parses the format produced by {@link #toString()}.
     *
     * @param s either {@code NA} or {@code jobId/pipelineId/taskId}
     * @return the parsed context; a pipeline or task id of {@code 0} is read back as absent
     * @throws NumberFormatException if one of the three parts is not a number
     */
    public static MDCContext valueOf(String s) {
        if (EMPTY_TO_STRING.equals(s)) {
            return EMPTY;
        }

        String[] arr = s.split("/");
        long jobId = Long.parseLong(arr[0]);
        long pipelineId = Long.parseLong(arr[1]);
        long taskId = Long.parseLong(arr[2]);

        // toString() writes 0 for an absent id, so 0 is mapped back to "absent" for each part
        // independently. This keeps the pipeline id of "job/pipeline/0" instead of dropping it.
        return new MDCContext(
                jobId,
                pipelineId == 0 ? null : Long.valueOf(pipelineId),
                taskId == 0 ? null : Long.valueOf(taskId));
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/tracing/MDCExecutor.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.tracing; import java.util.concurrent.Executor; /** Executor that sets MDC context before calling the delegate and clears it afterwards. */ public class MDCExecutor implements Executor { private final MDCContext context; private final Executor delegate; public MDCExecutor(MDCContext context, Executor delegate) { this.context = context; this.delegate = delegate; } @Override public void execute(Runnable command) { delegate.execute(new MDCRunnable(MDCContext.of(context), command)); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/tracing/MDCExecutorService.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.tracing; import java.util.Collection; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.stream.Collectors; /** ExecutorService that sets MDC context before calling the delegate and clears it afterwards. */ public class MDCExecutorService extends MDCExecutor implements ExecutorService { private final MDCContext context; private final ExecutorService delegate; public MDCExecutorService(MDCContext context, ExecutorService delegate) { super(context, delegate); this.context = context; this.delegate = delegate; } @Override public void shutdown() { delegate.shutdown(); } @Override public List shutdownNow() { return delegate.shutdownNow(); } @Override public boolean isShutdown() { return delegate.isShutdown(); } @Override public boolean isTerminated() { return delegate.isTerminated(); } @Override public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException { return delegate.awaitTermination(timeout, unit); } @Override public Future submit(Callable task) { return delegate.submit(new MDCCallable<>(MDCContext.of(context), task)); } @Override public Future submit(Runnable task, T result) { return delegate.submit(new MDCRunnable(MDCContext.of(context), task), result); } @Override public Future submit(Runnable task) { return delegate.submit(new MDCRunnable(MDCContext.of(context), task)); } @Override public List> invokeAll(Collection> tasks) throws InterruptedException { return delegate.invokeAll( tasks.stream() .map(task -> new MDCCallable<>(MDCContext.of(context), task)) .collect(Collectors.toList())); } @Override public List> invokeAll( Collection> tasks, long timeout, TimeUnit unit) throws InterruptedException { return delegate.invokeAll( tasks.stream() .map(task -> new 
MDCCallable<>(MDCContext.of(context), task)) .collect(Collectors.toList()), timeout, unit); } @Override public T invokeAny(Collection> tasks) throws InterruptedException, ExecutionException { return delegate.invokeAny( tasks.stream() .map(task -> new MDCCallable<>(MDCContext.of(context), task)) .collect(Collectors.toList())); } @Override public T invokeAny(Collection> tasks, long timeout, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException { return delegate.invokeAny( tasks.stream() .map(task -> new MDCCallable<>(MDCContext.of(context), task)) .collect(Collectors.toList()), timeout, unit); } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/tracing/MDCFunction.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/
package org.apache.seatunnel.api.tracing;

import java.util.function.Function;
import java.util.function.Supplier;

/**
 * Function that activates an MDC context around every call to the delegate.
 *
 * @param <T> the input type of the function
 * @param <R> the result type of the function
 */
public class MDCFunction<T, R> implements Function<T, R> {
    /** Provides the context to activate for one invocation of {@link #apply}. */
    private final Supplier<MDCContext> contextSupplier;

    protected final Function<T, R> delegate;

    /** Wraps {@code delegate} with the MDC context of the thread constructing this object. */
    public MDCFunction(Function<T, R> delegate) {
        this(MDCContext.current(), delegate);
    }

    /**
     * Wraps {@code delegate} with the given context.
     *
     * <p>A function is usually applied once per element, and a context instance that is
     * already activated rejects another {@code activate()}; therefore each application
     * activates its own copy.
     */
    public MDCFunction(MDCContext context, Function<T, R> delegate) {
        this(() -> MDCContext.of(context), delegate);
    }

    /**
     * Wraps {@code delegate}; {@code contextSupplier} is asked for a context on every
     * application.
     */
    public MDCFunction(Supplier<MDCContext> contextSupplier, Function<T, R> delegate) {
        this.contextSupplier = contextSupplier;
        this.delegate = delegate;
    }

    @Override
    public R apply(T t) {
        // try-with-resources closes (deactivates) the context even when the delegate throws.
        try (MDCContext ignored = contextSupplier.get().activate()) {
            return delegate.apply(t);
        }
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/tracing/MDCPredicate.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.tracing; import java.util.function.Predicate; import java.util.function.Supplier; public class MDCPredicate implements Predicate { private final Supplier contextSupplier; private final Predicate delegate; public MDCPredicate(Predicate delegate) { this(MDCContext.current(), delegate); } public MDCPredicate(MDCContext context, Predicate delegate) { this(() -> context, delegate); } public MDCPredicate(Supplier contextSupplier, Predicate delegate) { this.contextSupplier = contextSupplier; this.delegate = delegate; } @Override public boolean test(T t) { try (MDCContext ignored = contextSupplier.get().activate()) { return delegate.test(t); } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/tracing/MDCRunnable.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.tracing; import java.util.function.Supplier; /** Runnable that sets MDC context before calling the delegate and clears it afterwards. 
*/ public class MDCRunnable implements Runnable { private final Supplier contextSupplier; private final Runnable delegate; public MDCRunnable(Runnable delegate) { this(MDCContext.current(), delegate); } public MDCRunnable(MDCContext context, Runnable delegate) { this(() -> context, delegate); } public MDCRunnable(Supplier contextSupplier, Runnable delegate) { this.contextSupplier = contextSupplier; this.delegate = delegate; } @Override public void run() { try (MDCContext ignored = contextSupplier.get().activate()) { delegate.run(); } } } ================================================ FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/tracing/MDCScheduledExecutorService.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.tracing; import java.util.concurrent.Callable; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; /** * ScheduledExecutorService that sets MDC context before calling the delegate and clears it * afterwards. 
*/
public class MDCScheduledExecutorService extends MDCExecutorService
        implements ScheduledExecutorService {
    private final MDCContext context;
    private final ScheduledExecutorService delegate;

    /**
     * @param context the context every scheduled task runs under
     * @param delegate the scheduler that actually runs the tasks
     */
    public MDCScheduledExecutorService(MDCContext context, ScheduledExecutorService delegate) {
        super(context, delegate);
        this.context = context;
        this.delegate = delegate;
    }

    // Each task gets a context *supplier* rather than a context instance: periodic tasks run
    // many times and every run needs its own, not yet activated copy.

    @Override
    public ScheduledFuture<?> schedule(Runnable command, long delay, TimeUnit unit) {
        return delegate.schedule(
                new MDCRunnable(() -> MDCContext.of(context), command), delay, unit);
    }

    @Override
    public <V> ScheduledFuture<V> schedule(Callable<V> callable, long delay, TimeUnit unit) {
        return delegate.schedule(
                new MDCCallable<>(() -> MDCContext.of(context), callable), delay, unit);
    }

    @Override
    public ScheduledFuture<?> scheduleAtFixedRate(
            Runnable command, long initialDelay, long period, TimeUnit unit) {
        return delegate.scheduleAtFixedRate(
                new MDCRunnable(() -> MDCContext.of(context), command),
                initialDelay,
                period,
                unit);
    }

    @Override
    public ScheduledFuture<?> scheduleWithFixedDelay(
            Runnable command, long initialDelay, long delay, TimeUnit unit) {
        return delegate.scheduleWithFixedDelay(
                new MDCRunnable(() -> MDCContext.of(context), command),
                initialDelay,
                delay,
                unit);
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/tracing/MDCStream.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.tracing; import java.util.Comparator; import java.util.Iterator; import java.util.Optional; import java.util.Spliterator; import java.util.function.BiConsumer; import java.util.function.BiFunction; import java.util.function.BinaryOperator; import java.util.function.Consumer; import java.util.function.Function; import java.util.function.IntFunction; import java.util.function.Predicate; import java.util.function.Supplier; import java.util.function.ToDoubleFunction; import java.util.function.ToIntFunction; import java.util.function.ToLongFunction; import java.util.stream.Collector; import java.util.stream.DoubleStream; import java.util.stream.IntStream; import java.util.stream.LongStream; import java.util.stream.Stream; public class MDCStream implements Stream { private final MDCContext context; private final Stream delegate; public MDCStream(Stream delegate) { this(MDCContext.current(), delegate); } public MDCStream(MDCContext context, Stream delegate) { this.context = context; this.delegate = delegate; } @Override public Stream filter(Predicate predicate) { return new MDCStream<>( context, delegate.filter(new MDCPredicate<>(() -> MDCContext.of(context), predicate))); } @Override public Stream map(Function mapper) { return new MDCStream<>( context, delegate.map(new MDCFunction<>(() -> MDCContext.of(context), mapper))); } @Override public Stream flatMap(Function> mapper) { return new MDCStream<>( context, delegate.flatMap(new MDCFunction<>(() -> MDCContext.of(context), mapper))); } @Override public Stream sorted(Comparator comparator) 
{ return new MDCStream<>( context, delegate.sorted(new MDCComparator<>(() -> MDCContext.of(context), comparator))); } @Override public Stream peek(Consumer action) { return new MDCStream<>( context, delegate.peek(new MDCConsumer<>(() -> MDCContext.of(context), action))); } @Override public void forEach(Consumer action) { delegate.forEach(new MDCConsumer<>(() -> MDCContext.of(context), action)); } @Override public void forEachOrdered(Consumer action) { delegate.forEachOrdered(new MDCConsumer<>(() -> MDCContext.of(context), action)); } @Override public Optional min(Comparator comparator) { return delegate.min(new MDCComparator<>(() -> MDCContext.of(context), comparator)); } @Override public Optional max(Comparator comparator) { return delegate.max(new MDCComparator<>(() -> MDCContext.of(context), comparator)); } @Override public boolean anyMatch(Predicate predicate) { return delegate.anyMatch(new MDCPredicate<>(() -> MDCContext.of(context), predicate)); } @Override public boolean allMatch(Predicate predicate) { return delegate.allMatch(new MDCPredicate<>(() -> MDCContext.of(context), predicate)); } @Override public boolean noneMatch(Predicate predicate) { return delegate.noneMatch(new MDCPredicate<>(() -> MDCContext.of(context), predicate)); } @Override public Stream onClose(Runnable closeHandler) { return delegate.onClose(new MDCRunnable(context, closeHandler)); } @Override public Stream sequential() { return new MDCStream<>(context, delegate.sequential()); } @Override public Stream parallel() { return new MDCStream<>(context, delegate.parallel()); } @Override public Stream unordered() { return new MDCStream<>(context, delegate.unordered()); } @Override public Stream distinct() { return new MDCStream<>(context, delegate.distinct()); } @Override public Stream sorted() { return new MDCStream<>(context, delegate.sorted()); } @Override public Stream limit(long maxSize) { return new MDCStream<>(context, delegate.limit(maxSize)); } @Override public Stream skip(long n) { 
return new MDCStream<>(context, delegate.skip(n)); } @Override public IntStream flatMapToInt(Function mapper) { return delegate.flatMapToInt(new MDCFunction<>(() -> MDCContext.of(context), mapper)); } @Override public LongStream flatMapToLong(Function mapper) { return delegate.flatMapToLong(new MDCFunction<>(() -> MDCContext.of(context), mapper)); } @Override public DoubleStream flatMapToDouble(Function mapper) { return delegate.flatMapToDouble(new MDCFunction<>(() -> MDCContext.of(context), mapper)); } @Override public IntStream mapToInt(ToIntFunction mapper) { return delegate.mapToInt(mapper); } @Override public LongStream mapToLong(ToLongFunction mapper) { return delegate.mapToLong(mapper); } @Override public DoubleStream mapToDouble(ToDoubleFunction mapper) { return delegate.mapToDouble(mapper); } @Override public Object[] toArray() { return delegate.toArray(); } @Override public A[] toArray(IntFunction generator) { return delegate.toArray(generator); } @Override public T reduce(T identity, BinaryOperator accumulator) { return delegate.reduce(identity, accumulator); } @Override public Optional reduce(BinaryOperator accumulator) { return delegate.reduce(accumulator); } @Override public U reduce( U identity, BiFunction accumulator, BinaryOperator combiner) { return delegate.reduce(identity, accumulator, combiner); } @Override public R collect( Supplier supplier, BiConsumer accumulator, BiConsumer combiner) { return delegate.collect(supplier, accumulator, combiner); } @Override public R collect(Collector collector) { return delegate.collect(collector); } @Override public long count() { return delegate.count(); } @Override public Optional findFirst() { return delegate.findFirst(); } @Override public Optional findAny() { return delegate.findAny(); } @Override public Iterator iterator() { return delegate.iterator(); } @Override public Spliterator spliterator() { return delegate.spliterator(); } @Override public boolean isParallel() { return delegate.isParallel(); } 
@Override
    public void close() {
        delegate.close();
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/tracing/MDCSupplier.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.tracing;

import java.util.function.Supplier;

/**
 * {@link Supplier} decorator that activates an {@link MDCContext} around every call to the
 * wrapped supplier.
 */
public class MDCSupplier implements Supplier {
    /** Context activated for the duration of each {@link #get()} call. */
    private final MDCContext context;

    /** The supplier that actually produces the value. */
    private final Supplier delegate;

    /**
     * Wraps {@code delegate} using {@link MDCContext#current()}, evaluated at construction time.
     *
     * @param delegate the supplier to wrap
     */
    public MDCSupplier(Supplier delegate) {
        this(MDCContext.current(), delegate);
    }

    /**
     * Wraps {@code delegate} with an explicit context.
     *
     * @param context the context to activate around each call
     * @param delegate the supplier to wrap
     */
    public MDCSupplier(MDCContext context, Supplier delegate) {
        this.context = context;
        this.delegate = delegate;
    }

    /**
     * Activates the context, asks the delegate for its value and returns it.
     *
     * @return whatever {@code delegate.get()} returns
     */
    @Override
    public T get() {
        // try-with-resources closes the activated context again on both normal and
        // exceptional exit from the delegate.
        try (MDCContext ignored = context.activate()) {
            return delegate.get();
        }
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/tracing/MDCTracer.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.tracing; import java.util.Comparator; import java.util.concurrent.Callable; import java.util.concurrent.Executor; import java.util.concurrent.ExecutorService; import java.util.concurrent.ScheduledExecutorService; import java.util.function.Consumer; import java.util.function.Function; import java.util.function.Predicate; import java.util.function.Supplier; import java.util.stream.Stream; /** * Tracer for MDC context. * *

    <p>It wraps the given {@link Runnable}, {@link Callable}, {@link Executor}, {@link
 * ExecutorService}, {@link ScheduledExecutorService}, as well as {@link Consumer}, {@link
 * Function}, {@link Predicate}, {@link Comparator}, {@link Supplier} and {@link Stream}, to
 * trace the MDC context.
 *
 * <p>It is useful for tracing the MDC context during asynchronous execution. For example, when
 * you submit a task to an {@link ExecutorService}, the MDC context is not propagated to the
 * worker thread.
 *
 * <p>It is recommended to use the {@link MDCTracer} to wrap the task to trace the MDC context.
 *
 * <pre>{@code
 * MDCContext mdcContext = MDCContext.of(1);
 * ExecutorService executorService = Executors.newFixedThreadPool(10);
 * executorService.submit(MDCTracer.tracing(mdcContext, () -> {
 *    // Your task
 *    logger.info("Task is running");
 *    return null;
 *    }));
 *
 * }</pre>
*/
public class MDCTracer {

    // Every supported type gets the same three overloads:
    //   tracing(delegate)          - uses MDCContext.current()
    //   tracing(jobId, delegate)   - uses MDCContext.of(jobId)
    //   tracing(context, delegate) - uses the given context and does the actual wrapping
    // The third overload rejects a delegate that is already wrapped by throwing
    // IllegalArgumentException instead of wrapping it twice.

    // ---------------------------------------------------------------- Runnable

    public static MDCRunnable tracing(Runnable delegate) {
        return tracing(MDCContext.current(), delegate);
    }

    public static MDCRunnable tracing(Long jobId, Runnable delegate) {
        return tracing(MDCContext.of(jobId), delegate);
    }

    /**
     * @throws IllegalArgumentException if {@code delegate} is already an {@code MDCRunnable}
     */
    public static MDCRunnable tracing(MDCContext context, Runnable delegate) {
        if (delegate instanceof MDCRunnable) {
            throw new IllegalArgumentException("Already an MDCRunnable");
        }
        return new MDCRunnable(context, delegate);
    }

    // ---------------------------------------------------------------- Callable

    public static MDCCallable tracing(Callable delegate) {
        return tracing(MDCContext.current(), delegate);
    }

    public static MDCCallable tracing(Long jobId, Callable delegate) {
        return tracing(MDCContext.of(jobId), delegate);
    }

    /**
     * @throws IllegalArgumentException if {@code delegate} is already an {@code MDCCallable}
     */
    public static MDCCallable tracing(MDCContext context, Callable delegate) {
        if (delegate instanceof MDCCallable) {
            throw new IllegalArgumentException("Already an MDCCallable");
        }
        return new MDCCallable<>(context, delegate);
    }

    // ---------------------------------------------------------------- Executor

    public static MDCExecutor tracing(Executor delegate) {
        return tracing(MDCContext.current(), delegate);
    }

    public static MDCExecutor tracing(Long jobId, Executor delegate) {
        return tracing(MDCContext.of(jobId), delegate);
    }

    /**
     * @throws IllegalArgumentException if {@code delegate} is already an {@code MDCExecutor}
     */
    public static MDCExecutor tracing(MDCContext context, Executor delegate) {
        if (delegate instanceof MDCExecutor) {
            throw new IllegalArgumentException("Already an MDCExecutor");
        }
        return new MDCExecutor(context, delegate);
    }

    // ---------------------------------------------------------------- ExecutorService

    public static MDCExecutorService tracing(ExecutorService delegate) {
        return tracing(MDCContext.current(), delegate);
    }

    public static MDCExecutorService tracing(Long jobId, ExecutorService delegate) {
        return tracing(MDCContext.of(jobId), delegate);
    }

    /**
     * @throws IllegalArgumentException if {@code delegate} is an {@code MDCExecutor}
     */
    public static MDCExecutorService tracing(MDCContext context, ExecutorService delegate) {
        // NOTE(review): the guard tests for MDCExecutor, not MDCExecutorService; this only
        // catches double wrapping if MDCExecutorService is a subtype of MDCExecutor - confirm.
        if (delegate instanceof MDCExecutor) {
            throw new IllegalArgumentException("Already an MDCExecutor");
        }
        return new MDCExecutorService(context, delegate);
    }

    // ---------------------------------------------------------------- ScheduledExecutorService

    public static MDCScheduledExecutorService tracing(ScheduledExecutorService delegate) {
        return tracing(MDCContext.current(), delegate);
    }

    public static MDCScheduledExecutorService tracing(
            Long jobId, ScheduledExecutorService delegate) {
        return tracing(MDCContext.of(jobId), delegate);
    }

    /**
     * @throws IllegalArgumentException if {@code delegate} is an {@code MDCExecutor}
     */
    public static MDCScheduledExecutorService tracing(
            MDCContext context, ScheduledExecutorService delegate) {
        // NOTE(review): same MDCExecutor-based guard as the ExecutorService overload - confirm
        // that MDCScheduledExecutorService is a subtype of MDCExecutor.
        if (delegate instanceof MDCExecutor) {
            throw new IllegalArgumentException("Already an MDCExecutor");
        }
        return new MDCScheduledExecutorService(context, delegate);
    }

    // ---------------------------------------------------------------- Consumer

    public static MDCConsumer tracing(Consumer delegate) {
        return tracing(MDCContext.current(), delegate);
    }

    public static MDCConsumer tracing(Long jobId, Consumer delegate) {
        return tracing(MDCContext.of(jobId), delegate);
    }

    /**
     * @throws IllegalArgumentException if {@code delegate} is already an {@code MDCConsumer}
     */
    public static MDCConsumer tracing(MDCContext context, Consumer delegate) {
        if (delegate instanceof MDCConsumer) {
            throw new IllegalArgumentException("Already an MDCConsumer");
        }
        return new MDCConsumer<>(context, delegate);
    }

    // ---------------------------------------------------------------- Function

    public static MDCFunction tracing(Function delegate) {
        return tracing(MDCContext.current(), delegate);
    }

    public static MDCFunction tracing(Long jobId, Function delegate) {
        return tracing(MDCContext.of(jobId), delegate);
    }

    /**
     * @throws IllegalArgumentException if {@code delegate} is already an {@code MDCFunction}
     */
    public static MDCFunction tracing(MDCContext context, Function delegate) {
        if (delegate instanceof MDCFunction) {
            throw new IllegalArgumentException("Already an MDCFunction");
        }
        return new MDCFunction<>(context, delegate);
    }

    // ---------------------------------------------------------------- Predicate

    public static MDCPredicate tracing(Predicate delegate) {
        return tracing(MDCContext.current(), delegate);
    }

    public static MDCPredicate tracing(Long jobId, Predicate delegate) {
        return tracing(MDCContext.of(jobId), delegate);
    }

    /**
     * @throws IllegalArgumentException if {@code delegate} is already an {@code MDCPredicate}
     */
    public static MDCPredicate tracing(MDCContext context, Predicate delegate) {
        if (delegate instanceof MDCPredicate) {
            throw new IllegalArgumentException("Already an MDCPredicate");
        }
        return new MDCPredicate<>(context, delegate);
    }

    // ---------------------------------------------------------------- Comparator

    public static MDCComparator tracing(Comparator delegate) {
        return tracing(MDCContext.current(), delegate);
    }

    public static MDCComparator tracing(Long jobId, Comparator delegate) {
        return tracing(MDCContext.of(jobId), delegate);
    }

    /**
     * @throws IllegalArgumentException if {@code delegate} is already an {@code MDCComparator}
     */
    public static MDCComparator tracing(MDCContext context, Comparator delegate) {
        if (delegate instanceof MDCComparator) {
            throw new IllegalArgumentException("Already an MDCComparator");
        }
        return new MDCComparator<>(context, delegate);
    }

    // ---------------------------------------------------------------- Supplier

    public static MDCSupplier tracing(Supplier delegate) {
        return tracing(MDCContext.current(), delegate);
    }

    public static MDCSupplier tracing(Long jobId, Supplier delegate) {
        return tracing(MDCContext.of(jobId), delegate);
    }

    /**
     * @throws IllegalArgumentException if {@code delegate} is already an {@code MDCSupplier}
     */
    public static MDCSupplier tracing(MDCContext context, Supplier delegate) {
        if (delegate instanceof MDCSupplier) {
            throw new IllegalArgumentException("Already an MDCSupplier");
        }
        return new MDCSupplier<>(context, delegate);
    }

    // ---------------------------------------------------------------- Stream

    public static MDCStream tracing(Stream delegate) {
        return tracing(MDCContext.current(), delegate);
    }

    public static MDCStream tracing(Long jobId, Stream delegate) {
        return tracing(MDCContext.of(jobId), delegate);
    }

    /**
     * @throws IllegalArgumentException if {@code delegate} is already an {@code MDCStream}
     */
    public static MDCStream tracing(MDCContext context, Stream delegate) {
        if (delegate instanceof MDCStream) {
            throw new IllegalArgumentException("Already an MDCStream");
        }
        return new MDCStream<>(context, delegate);
    }
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/transform/Collector.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.transform;

/** Receiver of records: records are handed over one at a time via {@link #collect}. */
public interface Collector {

    /**
     * Emits a record.
     *
     * @param record The record to collect.
     */
    void collect(T record);

    /** Closes the collector. If any data was buffered, that data will be flushed. */
    void close();
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/transform/SeaTunnelFlatMapTransform.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.transform;

import java.util.List;

/** A transform that maps one input row to a list of output rows. */
public interface SeaTunnelFlatMapTransform extends SeaTunnelTransform {

    /**
     * Transforms the input data into data of the row type of the produced catalog table, i.e.
     * {@code getProducedCatalogTable().getSeaTunnelRowType()} (see {@link
     * #getProducedCatalogTable()}).
     *
     * @param row the data that needs to be transformed.
     * @return the transformed data.
     */
    List flatMap(T row);
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/transform/SeaTunnelMapTransform.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.
See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.transform;

/** A transform that maps one input row to exactly one output value. */
public interface SeaTunnelMapTransform extends SeaTunnelTransform {

    /**
     * Transforms the input data into data of the row type of the produced catalog table, i.e.
     * {@code getProducedCatalogTable().getSeaTunnelRowType()} (see {@link
     * #getProducedCatalogTable()}).
     *
     * @param row the data that needs to be transformed.
     * @return the transformed data.
     */
    T map(T row);
}

================================================
FILE: seatunnel-api/src/main/java/org/apache/seatunnel/api/transform/SeaTunnelTransform.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.seatunnel.api.transform;

import org.apache.seatunnel.api.common.PluginIdentifierInterface;
import org.apache.seatunnel.api.source.SeaTunnelJobAware;
import org.apache.seatunnel.api.table.catalog.CatalogTable;
import org.apache.seatunnel.api.table.schema.event.SchemaChangeEvent;
import org.apache.seatunnel.api.table.type.SeaTunnelDataType;

import java.io.Serializable;
import java.util.List;

/**
 * Base contract of a transform plugin: lifecycle hooks ({@link #open()} / {@link #close()}),
 * the catalog table(s) the transform produces and an optional schema-change mapping.
 */
public interface SeaTunnelTransform
        extends Serializable, PluginIdentifierInterface, SeaTunnelJobAware {

    /** Called when the transform is initialized. The default implementation does nothing. */
    default void open() {}

    /**
     * Set the data type info of input data.
     *
     * @deprecated use {@link org.apache.seatunnel.api.table.factory.Factory} instead
     * @param inputDataType The data type info of upstream input.
     * @throws UnsupportedOperationException always, unless an implementation overrides this
     *     default
     */
    @Deprecated
    default void setTypeInfo(SeaTunnelDataType inputDataType) {
        throw new UnsupportedOperationException("setTypeInfo method is not supported");
    }

    /** Get the catalog table output by this transform. */
    CatalogTable getProducedCatalogTable();

    /** Get the catalog tables output by this transform, as a list. */
    List getProducedCatalogTables();

    /**
     * Maps an incoming schema change event to the event this transform passes on.
     *
     * @param schemaChangeEvent the incoming event
     * @return the event to pass on; the default implementation returns the argument unchanged
     */
    default SchemaChangeEvent mapSchemaChangeEvent(SchemaChangeEvent schemaChangeEvent) {
        return schemaChangeEvent;
    }

    /** Called when the transform has completed. The default implementation does nothing. */
    default void close() {}
}

================================================
FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/configuration/OptionTest.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.configuration;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

/** Tests equality of {@link Option} instances; also exposes shared fixtures for other tests. */
public class OptionTest {
    /** Integer option fixture with default value 100. */
    public static final Option TEST_NUM =
            Options.key("option.num")
                    .intType()
                    .defaultValue(100)
                    .withDescription("test int option");

    /** Enum option fixture with default value {@link TestMode#LATEST}. */
    public static final Option TEST_MODE =
            Options.key("option.mode")
                    .enumType(TestMode.class)
                    .defaultValue(TestMode.LATEST)
                    .withDescription("test enum option");

    /** Enum used as the value type of {@link #TEST_MODE}. */
    public enum TestMode {
        EARLIEST,
        LATEST,
        TIMESTAMP,
    }

    /**
     * Options built with the same key, type and default value must be equal. The fixtures carry
     * a description while the freshly built options do not, so the description is expected not
     * to take part in equality.
     */
    @Test
    public void testEquals() {
        Assertions.assertEquals(TEST_NUM, Options.key("option.num").intType().defaultValue(100));
        Assertions.assertEquals(
                TEST_MODE,
                Options.key("option.mode").enumType(TestMode.class).defaultValue(TestMode.LATEST));
        // Same comparison with a fallback key configured on both sides.
        Assertions.assertEquals(
                TEST_NUM.withFallbackKeys("option.numeric"),
                Options.key("option.num")
                        .intType()
                        .defaultValue(100)
                        .withFallbackKeys("option.numeric"));
    }
}

================================================
FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/configuration/ReadableConfigTest.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.configuration; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference; import org.apache.seatunnel.shade.com.typesafe.config.Config; import org.apache.seatunnel.shade.com.typesafe.config.ConfigFactory; import org.apache.seatunnel.shade.com.typesafe.config.ConfigResolveOptions; import org.apache.seatunnel.shade.org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import java.net.URISyntaxException; import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; public class ReadableConfigTest { private static final String CONFIG_PATH = "/conf/option-test.conf"; private static ReadonlyConfig config; private static Map map; @BeforeAll public static void prepare() throws URISyntaxException { Config rawConfig = ConfigFactory.parseFile( Paths.get(ReadableConfigTest.class.getResource(CONFIG_PATH).toURI()) .toFile()) .resolve(ConfigResolveOptions.defaults().setAllowUnresolved(true)) .resolveWith( ConfigFactory.systemProperties(), ConfigResolveOptions.defaults().setAllowUnresolved(true)); config = ReadonlyConfig.fromConfig(rawConfig.getConfigList("source").get(0)); map = new HashMap<>(); Map inner = new HashMap<>(); inner.put("path", "mac"); inner.put("name", "ashulin"); inner.put("map", "{\"fantasy\":\"final\"}"); map.put("inner", inner); map.put("type", "source"); map.put("patch.note", "hollow"); map.put("name", "saitou"); } @Test public void 
testBooleanOption() { Assertions.assertEquals( true, config.get(Options.key("option.bool").booleanType().noDefaultValue())); Assertions.assertEquals( false, config.get(Options.key("option.bool-str").booleanType().noDefaultValue())); Assertions.assertEquals( true, config.get(Options.key("option.int-str").booleanType().noDefaultValue())); Assertions.assertNull( config.get(Options.key("option.not-exist").booleanType().noDefaultValue())); Assertions.assertThrows( IllegalArgumentException.class, () -> config.get(Options.key("option.string").booleanType().noDefaultValue())); } @Test public void testIntOption() { Assertions.assertEquals( 2147483647, config.get(Options.key("option.int").intType().noDefaultValue())); Assertions.assertEquals( 100, config.get(Options.key("option.int-str").intType().noDefaultValue())); Assertions.assertEquals( 2147483647, config.get(Options.key("option.not-exist").intType().defaultValue(2147483647))); Assertions.assertNull( config.get(Options.key("option.not-exist").intType().noDefaultValue())); Assertions.assertThrows( IllegalArgumentException.class, () -> config.get(Options.key("option.long").intType().noDefaultValue())); } @Test public void testLongOption() { Assertions.assertEquals( 21474836470L, config.get(Options.key("option.long").longType().noDefaultValue())); Assertions.assertEquals( 21474836470L, config.get(Options.key("option.long-str").longType().noDefaultValue())); Assertions.assertNull( config.get(Options.key("option.not-exist").longType().noDefaultValue())); Assertions.assertThrows( IllegalArgumentException.class, () -> config.get(Options.key("option.bool").intType().noDefaultValue())); } @Test public void testFloatOption() { Assertions.assertEquals( 3.3333F, config.get(Options.key("option.float").floatType().noDefaultValue())); Assertions.assertEquals( 21474836470F, config.get(Options.key("option.long-str").floatType().noDefaultValue())); Assertions.assertEquals( 3.1415F, 
config.get(Options.key("option.float-str").floatType().noDefaultValue())); Assertions.assertNull( config.get(Options.key("option.not-exist").floatType().noDefaultValue())); Assertions.assertThrows( IllegalArgumentException.class, () -> config.get(Options.key("option.bool-str").floatType().noDefaultValue())); } @Test public void testDoubleOption() { Assertions.assertEquals( 3.1415926535897932384626433832795028841971D, config.get(Options.key("option.double").doubleType().noDefaultValue())); Assertions.assertEquals( 3.1415926535897932384626433832795028841971D, config.get(Options.key("option.double-str").doubleType().noDefaultValue())); Assertions.assertEquals( 21474836470D, config.get(Options.key("option.long-str").doubleType().noDefaultValue())); Assertions.assertEquals( 3.1415D, config.get(Options.key("option.float-str").doubleType().noDefaultValue())); Assertions.assertNull( config.get(Options.key("option.not-exist").doubleType().noDefaultValue())); Assertions.assertThrows( IllegalArgumentException.class, () -> config.get(Options.key("option.bool-str").doubleType().noDefaultValue())); } @Test public void testStringOption() { Assertions.assertEquals( "Hello, Apache SeaTunnel", config.get(Options.key("option.string").stringType().noDefaultValue())); // 'option.double' is not represented as a string and is expected to lose precision Assertions.assertNotEquals( "3.1415926535897932384626433832795028841971", config.get(Options.key("option.double").stringType().noDefaultValue())); Assertions.assertEquals( "3.1415926535897932384626433832795028841971", config.get(Options.key("option.double-str").stringType().noDefaultValue())); Assertions.assertNull( config.get(Options.key("option.not-exist").stringType().noDefaultValue())); } @Test public void testEnumOption() { Assertions.assertEquals( OptionTest.TestMode.LATEST, config.get( Options.key("option.enum") .enumType(OptionTest.TestMode.class) .noDefaultValue())); Assertions.assertThrows( IllegalArgumentException.class, () -> 
config.get( Options.key("option.string") .enumType(OptionTest.TestMode.class) .noDefaultValue())); Assertions.assertNull( config.get( Options.key("option.not-exist") .enumType(OptionTest.TestMode.class) .noDefaultValue())); } @Test public void testBasicMapOption() { Assertions.assertEquals( map, config.get( Options.key("option.map") .type(new TypeReference>() {}) .noDefaultValue())); Map newMap = new HashMap<>(); newMap.put("fantasy", "final"); Assertions.assertEquals( newMap, config.get(Options.key("option.map.inner.map").mapType().noDefaultValue())); Assertions.assertTrue( StringUtils.isNotBlank( config.get(Options.key("option").stringType().noDefaultValue()))); Assertions.assertThrows( IllegalArgumentException.class, () -> config.get(Options.key("option.string").mapType().noDefaultValue())); Assertions.assertNull( config.get( Options.key("option.not-exist") .enumType(OptionTest.TestMode.class) .noDefaultValue())); } @Test public void testBasicListOption() { List list = new ArrayList<>(); list.add("Hello"); list.add("Apache SeaTunnel"); Assertions.assertEquals( list, config.get(Options.key("option.list-json").listType().noDefaultValue())); list = new ArrayList<>(); list.add("final"); list.add("fantasy"); list.add("VII"); Assertions.assertEquals( list, config.get(Options.key("option.list").listType().noDefaultValue())); list = new ArrayList<>(); list.add("Silk"); list.add("Song"); Assertions.assertEquals( list, config.get(Options.key("option.list-str").listType().noDefaultValue())); } @Test public void testObjectType() { Assertions.assertEquals( "Hello, Apache SeaTunnel", config.get(Options.key("option.string").objectType(Object.class).noDefaultValue())); Assertions.assertEquals( true, config.get(Options.key("option.bool").objectType(Object.class).noDefaultValue())); Assertions.assertEquals( 3.3333, config.get(Options.key("option.float").objectType(Object.class).noDefaultValue())); Assertions.assertEquals( 21474836470L, 
config.get(Options.key("option.long").objectType(Object.class).noDefaultValue())); } @Test public void testComplexTypeOption() { List>>>> complexType = config.get( Options.key("option.complex-type") .type( new TypeReference< List< Map< String, Map< String, List< Map< String, Object>>>>>>() {}) .noDefaultValue()); Assertions.assertEquals(1, complexType.size()); Assertions.assertEquals(2, complexType.get(0).get("inner").size()); complexType .get(0) .get("inner") .values() .forEach( value -> { Assertions.assertEquals(map, value.get(0)); }); Assertions.assertEquals(complexType.get(0).get("inner").get("list").size(), 2); Assertions.assertEquals(complexType.get(0).get("inner").get("list-2").size(), 1); } @Test public void testEnumListOption() { List list = new ArrayList<>(); list.add(OptionTest.TestMode.EARLIEST); list.add(OptionTest.TestMode.LATEST); Assertions.assertEquals( list, config.get( Options.key("option.enum-list") .listType(OptionTest.TestMode.class) .noDefaultValue())); } @Test public void testNumericListOption() { List list = new ArrayList<>(); list.add(1); list.add(2); Assertions.assertEquals( list, config.get( Options.key("option.numeric-list") .listType(Integer.class) .noDefaultValue())); List list2 = new ArrayList<>(); list2.add(1L); list2.add(2L); Assertions.assertEquals( list2, config.get( Options.key("option.numeric-list").listType(Long.class).noDefaultValue())); List list3 = new ArrayList<>(); list3.add(1D); list3.add(2D); Assertions.assertEquals( list3, config.get( Options.key("option.numeric-list") .listType(Double.class) .noDefaultValue())); } @Test public void testFallbackKey() { Map map = new HashMap<>(); map.put("user", "ashulin"); final Option usernameOption = Options.key("username").stringType().noDefaultValue().withFallbackKeys("user"); ReadonlyConfig readonlyConfig = ReadonlyConfig.fromMap(map); Assertions.assertEquals("ashulin", readonlyConfig.get(usernameOption)); Assertions.assertNull( 
readonlyConfig.get(Options.key("username").stringType().noDefaultValue())); map.put("username", "ark"); readonlyConfig = ReadonlyConfig.fromMap(map); Assertions.assertEquals("ark", readonlyConfig.get(usernameOption)); } @Test public void testNullValue() { Map map = new HashMap<>(); map.put("user", null); ReadonlyConfig readonlyConfig = ReadonlyConfig.fromMap(map); Assertions.assertNull(readonlyConfig.toMap().get("user")); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/configuration/util/ConditionTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/

package org.apache.seatunnel.api.configuration.util;

import org.apache.seatunnel.api.configuration.OptionTest;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

import static org.apache.seatunnel.api.configuration.OptionTest.TEST_MODE;
import static org.apache.seatunnel.api.configuration.OptionTest.TEST_NUM;

/** Tests for chained {@link Condition} expressions built with {@code of}/{@code or}/{@code and}. */
public class ConditionTest {
    /** Three chained conditions: mode == EARLIEST, or mode == LATEST, and num == 1000. */
    private static final Condition TEST_CONDITION =
            Condition.of(TEST_MODE, OptionTest.TestMode.EARLIEST)
                    .or(TEST_MODE, OptionTest.TestMode.LATEST)
                    .and(TEST_NUM, 1000);

    /** The string form groups the two OR-ed conditions in parentheses before the AND. */
    @Test
    public void testToString() {
        Assertions.assertEquals(
                "('option.mode' == EARLIEST || 'option.mode' == LATEST) && 'option.num' == 1000",
                TEST_CONDITION.toString());
    }

    /** The chain above consists of three conditions. */
    @Test
    public void testGetCount() {
        Assertions.assertEquals(3, TEST_CONDITION.getCount());
    }

    /** The tail of the chain is the condition that was appended last. */
    @Test
    public void testGetTailCondition() {
        Assertions.assertEquals(Condition.of(TEST_NUM, 1000), TEST_CONDITION.getTailCondition());
    }
}

================================================
FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/configuration/util/ConfigUtilTest.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.seatunnel.api.configuration.util;

import org.apache.seatunnel.shade.com.typesafe.config.Config;
import org.apache.seatunnel.shade.com.typesafe.config.ConfigFactory;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

import java.net.URISyntaxException;
import java.nio.file.Paths;

/** Round-trip test for {@link ConfigUtil}: config to JSON string and back to config. */
public class ConfigUtilTest {

    /** Parsed (unresolved) content of the {@code /conf/option-test.conf} test resource. */
    private static Config config;

    /** Parses the test resource once for all tests. */
    @BeforeAll
    public static void init() throws URISyntaxException {
        config =
                ConfigFactory.parseFile(
                        Paths.get(
                                        ConfigUtilTest.class
                                                .getResource("/conf/option-test.conf")
                                                .toURI())
                                .toFile());
    }

    /**
     * Converts the config to a JSON string and back; only the {@code env} section is compared
     * with the original.
     */
    @Test
    public void convertToJsonString() {
        String configJson = ConfigUtil.convertToJsonString(config);
        Config parsedConfig = ConfigUtil.convertToConfig(configJson);
        Assertions.assertEquals(config.getConfig("env"), parsedConfig.getConfig("env"));
    }
}

================================================
FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/configuration/util/ConfigValidatorTest.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.configuration.util; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.OptionTest; import org.apache.seatunnel.api.configuration.Options; import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.function.Executable; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import static org.apache.seatunnel.api.configuration.OptionTest.TEST_MODE; import static org.apache.seatunnel.api.configuration.util.OptionRuleTest.TEST_PORTS; import static org.apache.seatunnel.api.configuration.util.OptionRuleTest.TEST_TIMESTAMP; import static org.apache.seatunnel.api.configuration.util.OptionRuleTest.TEST_TOPIC; import static org.apache.seatunnel.api.configuration.util.OptionRuleTest.TEST_TOPIC_PATTERN; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; /** Tests that {@link ConfigValidator} enforces each kind of {@link OptionRule} and pins the exact error messages it reports. Every test mutates one map and re-runs the same {@code Executable} against it. */ public class ConfigValidatorTest { public static final Option KEY_USERNAME = Options.key("username") .stringType() .noDefaultValue() .withDescription("username of the Neo4j"); public static final Option KEY_PASSWORD = Options.key("password") .stringType() .noDefaultValue() .withDescription("password of the Neo4j"); public static final Option KEY_BEARER_TOKEN = Options.key("bearer-token") .stringType() .noDefaultValue() .withDescription("base64 encoded bearer token of the Neo4j. for Auth."); public static final Option KEY_KERBEROS_TICKET = Options.key("kerberos-ticket") .stringType() .noDefaultValue() .withDescription("base64 encoded kerberos ticket of the Neo4j. 
for Auth."); /** Deliberately invalid: the default "M" is not one of the allowed values, used to trigger the default-value check. */ public static final Option SINGLE_CHOICE_TEST = Options.key("single_choice_test") .singleChoice(String.class, Arrays.asList("A", "B", "C")) .defaultValue("M") .withDescription("test single choice error"); /** Same key as above but with a default that is one of the allowed values. */ public static final Option SINGLE_CHOICE_VALUE_TEST = Options.key("single_choice_test") .singleChoice(String.class, Arrays.asList("A", "B", "C")) .defaultValue("A") .withDescription("test single choice value"); /** Wraps the map in a {@link ReadonlyConfig} and validates it against the given rule. */ void validate(Map config, OptionRule rule) { ConfigValidator.of(ReadonlyConfig.fromMap(config)).validate(rule); } /** Required options: the error lists every key still missing until all of them are configured. */ @Test public void testAbsolutelyRequiredOption() { OptionRule rule = OptionRule.builder().required(TEST_PORTS, KEY_USERNAME, KEY_PASSWORD).build(); Map config = new HashMap<>(); Executable executable = () -> validate(config, rule); // absent config.put(TEST_PORTS.key(), "[9090]"); assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - There are unconfigured options, the options('username', 'password') are required.", assertThrows(OptionValidationException.class, executable).getMessage()); config.put(KEY_USERNAME.key(), "asuka"); assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - There are unconfigured options, the options('password') are required.", assertThrows(OptionValidationException.class, executable).getMessage()); // all present config.put(KEY_PASSWORD.key(), "saitou"); Assertions.assertDoesNotThrow(executable); } /** Bundled options: valid when all are absent or all are present, rejected when only some are set. */ @Test public void testBundledRequiredOptions() { OptionRule rule = OptionRule.builder().bundled(KEY_USERNAME, KEY_PASSWORD).build(); Map config = new HashMap<>(); Executable executable = () -> validate(config, rule); // case1: all absent Assertions.assertDoesNotThrow(executable); // case2: some present config.put(KEY_USERNAME.key(), "asuka"); assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - These options('username', 'password') are bundled, must be present or absent together." + " The options present are: 'username'. 
The options absent are 'password'.", assertThrows(OptionValidationException.class, executable).getMessage()); // case3: all present config.put(KEY_PASSWORD.key(), "saitou"); Assertions.assertDoesNotThrow(executable); } /** Exclusive options: configuring none or both fails, configuring exactly one passes. */ @Test public void testSimpleExclusiveRequiredOptions() { OptionRule rule = OptionRule.builder().exclusive(TEST_TOPIC_PATTERN, TEST_TOPIC).build(); Map config = new HashMap<>(); Executable executable = () -> validate(config, rule); // all absent assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - There are unconfigured options, these options('option.topic-pattern', 'option.topic') are mutually exclusive," + " allowing only one set(\"[] for a set\") of options to be configured.", assertThrows(OptionValidationException.class, executable).getMessage()); // only one present config.put(TEST_TOPIC_PATTERN.key(), "asuka"); Assertions.assertDoesNotThrow(executable); // present > 1 config.put(TEST_TOPIC.key(), "[\"saitou\"]"); assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - These options('option.topic-pattern', 'option.topic') are mutually exclusive, " + "allowing only one set(\"[] for a set\") of options to be configured.", assertThrows(OptionValidationException.class, executable).getMessage()); } /** Same exclusive-rule scenario as the simple variant, exercised with the bearer-token and kerberos-ticket options. */ @Test public void testComplexExclusiveRequiredOptions() { OptionRule rule = OptionRule.builder().exclusive(KEY_BEARER_TOKEN, KEY_KERBEROS_TICKET).build(); Map config = new HashMap<>(); Executable executable = () -> validate(config, rule); // all absent assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - There are unconfigured options, these options('bearer-token', 'kerberos-ticket') are mutually exclusive," + " allowing only one set(\"[] for a set\") of options to be configured.", assertThrows(OptionValidationException.class, executable).getMessage()); // set one config.put(KEY_BEARER_TOKEN.key(), "ashulin"); Assertions.assertDoesNotThrow(executable); // all set 
config.put(KEY_KERBEROS_TICKET.key(), "zongwen"); assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - These options('bearer-token', 'kerberos-ticket') are mutually exclusive," + " allowing only one set(\"[] for a set\") of options to be configured.", assertThrows(OptionValidationException.class, executable).getMessage()); } /** Conditional rule keyed on TEST_MODE: TEST_TIMESTAMP becomes required only once the mode is set to timestamp. */ @Test public void testSimpleConditionalRequiredOptionsWithDefaultValue() { OptionRule rule = OptionRule.builder() .optional(TEST_MODE) .conditional(TEST_MODE, OptionTest.TestMode.TIMESTAMP, TEST_TIMESTAMP) .build(); Map config = new HashMap<>(); Executable executable = () -> validate(config, rule); // Expression mismatch Assertions.assertDoesNotThrow(executable); // Expression match, and required options absent config.put(TEST_MODE.key(), "timestamp"); assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - There are unconfigured options, the options('option.timestamp') are required" + " because ['option.mode' == TIMESTAMP] is true.", assertThrows(OptionValidationException.class, executable).getMessage()); // Expression match, and required options all present config.put(TEST_TIMESTAMP.key(), "564231238596789"); Assertions.assertDoesNotThrow(executable); // Expression mismatch config.put(TEST_MODE.key(), "EARLIEST"); Assertions.assertDoesNotThrow(executable); } /** Conditional rule keyed on KEY_USERNAME, which is declared with no default value: TEST_TIMESTAMP is required only when the username equals "ashulin". */ @Test public void testSimpleConditionalRequiredOptionsWithoutDefaultValue() { OptionRule rule = OptionRule.builder() .optional(KEY_USERNAME) .conditional(KEY_USERNAME, "ashulin", TEST_TIMESTAMP) .build(); Map config = new HashMap<>(); Executable executable = () -> validate(config, rule); // Expression mismatch Assertions.assertDoesNotThrow(executable); // Expression match, and required options absent config.put(KEY_USERNAME.key(), "ashulin"); assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - There are unconfigured options, the options('option.timestamp') are required" + " because ['username' == 
ashulin] is true.", assertThrows(OptionValidationException.class, executable).getMessage()); // Expression match, and required options all present config.put(TEST_TIMESTAMP.key(), "564231238596789"); Assertions.assertDoesNotThrow(executable); // Expression mismatch config.put(KEY_USERNAME.key(), "asuka"); Assertions.assertDoesNotThrow(executable); } /** Conditional rule with a list of trigger values: either listed username makes TEST_TIMESTAMP required. */ @Test public void testComplexConditionalRequiredOptions() { OptionRule rule = OptionRule.builder() .optional(KEY_USERNAME) .conditional( KEY_USERNAME, Arrays.asList("ashulin", "asuka"), TEST_TIMESTAMP) .build(); Map config = new HashMap<>(); Executable executable = () -> validate(config, rule); // Expression mismatch Assertions.assertDoesNotThrow(executable); // 'username' == ashulin, and required options absent config.put(KEY_USERNAME.key(), "ashulin"); assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - There are unconfigured options, the options('option.timestamp') are required" + " because ['username' == ashulin || 'username' == asuka] is true.", assertThrows(OptionValidationException.class, executable).getMessage()); // 'username' == asuka, and required options absent config.put(KEY_USERNAME.key(), "asuka"); assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - There are unconfigured options, the options('option.timestamp') are required" + " because ['username' == ashulin || 'username' == asuka] is true.", assertThrows(OptionValidationException.class, executable).getMessage()); // Expression match, and required options all present config.put(TEST_TIMESTAMP.key(), "564231238596789"); Assertions.assertDoesNotThrow(executable); // Expression mismatch config.put(KEY_USERNAME.key(), "asuka111"); Assertions.assertDoesNotThrow(executable); } /** A single-choice option whose default ("M") is outside the allowed values is rejected even though the configured value ("A") is allowed. */ @Test public void testSingleChoiceOptionDefaultValueValidator() { OptionRule optionRule = OptionRule.builder().required(SINGLE_CHOICE_TEST).build(); Map config = new HashMap<>(); config.put(SINGLE_CHOICE_TEST.key(), "A"); 
Executable executable = () -> validate(config, optionRule); assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - These options('single_choice_test') are SingleChoiceOption, the defaultValue(M) must be one of the optionValues([A, B, C]).", assertThrows(OptionValidationException.class, executable).getMessage()); } /** A configured value inside the allowed set ("A") passes; one outside it ("N") is rejected. */ @Test public void testSingleChoiceOptionValueValidator() { OptionRule optionRule = OptionRule.builder().required(SINGLE_CHOICE_VALUE_TEST).build(); Map config = new HashMap<>(); config.put(SINGLE_CHOICE_VALUE_TEST.key(), "A"); Executable executable = () -> validate(config, optionRule); Assertions.assertDoesNotThrow(executable); config.put(SINGLE_CHOICE_VALUE_TEST.key(), "N"); executable = () -> validate(config, optionRule); assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - These options('single_choice_test') are SingleChoiceOption, the value(N) must be one of the optionValues([A, B, C]).", assertThrows(OptionValidationException.class, executable).getMessage()); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/configuration/util/OptionRuleTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.configuration.util; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.type.TypeReference; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.OptionTest; import org.apache.seatunnel.api.configuration.Options; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.function.Executable; import java.util.List; import static org.apache.seatunnel.api.configuration.OptionTest.TEST_MODE; import static org.apache.seatunnel.api.configuration.OptionTest.TEST_NUM; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; /** Tests for {@code OptionRule.builder()}: successful construction, the verification errors raised while building, and rule equality. The public option constants declared here are also statically imported by ConfigValidatorTest. */ public class OptionRuleTest { public static final Option TEST_TIMESTAMP = Options.key("option.timestamp") .longType() .noDefaultValue() .withDescription("test long timestamp"); public static final Option TEST_TOPIC_PATTERN = Options.key("option.topic-pattern") .stringType() .noDefaultValue() .withDescription("test string type"); public static final Option> TEST_TOPIC = Options.key("option.topic") .listType() .noDefaultValue() .withDescription("test list string type"); public static final Option> TEST_PORTS = Options.key("option.ports") .type(new TypeReference>() {}) .noDefaultValue() .withDescription("test list int type"); /** The only fixture here declared with a default value ("11"). */ public static final Option TEST_REQUIRED_HAVE_DEFAULT_VALUE = Options.key("option.required-have-default") .stringType() .defaultValue("11") .withDescription("test string type"); public static final Option TEST_DUPLICATE = Options.key("option.test-duplicate") .stringType() .noDefaultValue() .withDescription("test string type"); /** A rule combining optional, required, exclusive and conditional options builds to a non-null result. */ @Test public void testBuildSuccess() { OptionRule rule = OptionRule.builder() .optional(TEST_NUM, TEST_MODE) .required(TEST_PORTS) 
.exclusive(TEST_TOPIC_PATTERN, TEST_TOPIC) .conditional(TEST_MODE, OptionTest.TestMode.TIMESTAMP, TEST_TIMESTAMP) .build(); Assertions.assertNotNull(rule); } /** Walks through a series of builder chains and pins the exact validation message (or the absence of an error) for each. */ @Test public void testVerify() { /* NOTE(review): the lambda assigned first is overwritten by the next assignment without ever being executed or asserted, so this builder chain is never exercised - confirm whether an assertion was intended here. */ Executable executable = () -> { OptionRule.builder() .optional(TEST_NUM, TEST_MODE) .required(TEST_PORTS, TEST_REQUIRED_HAVE_DEFAULT_VALUE) .exclusive(TEST_TOPIC_PATTERN, TEST_TOPIC) .conditional(TEST_MODE, OptionTest.TestMode.TIMESTAMP, TEST_TIMESTAMP) .build(); }; executable = () -> { OptionRule.builder() .optional(TEST_NUM, TEST_MODE, TEST_REQUIRED_HAVE_DEFAULT_VALUE) .required(TEST_PORTS, TEST_REQUIRED_HAVE_DEFAULT_VALUE) .exclusive(TEST_TOPIC_PATTERN, TEST_TOPIC) .conditional(TEST_MODE, OptionTest.TestMode.TIMESTAMP, TEST_TIMESTAMP) .build(); }; // test duplicate assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - AbsolutelyRequiredOptions 'option.required-have-default' duplicate in option options.", assertThrows(OptionValidationException.class, executable).getMessage()); executable = () -> { OptionRule.builder() .optional(TEST_NUM, TEST_MODE) .exclusive(TEST_TOPIC_PATTERN, TEST_TOPIC, TEST_DUPLICATE) .required(TEST_PORTS, TEST_DUPLICATE) .conditional(TEST_MODE, OptionTest.TestMode.TIMESTAMP, TEST_TIMESTAMP) .build(); }; // test duplicate in RequiredOption$ExclusiveRequiredOptions assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - AbsolutelyRequiredOptions 'option.test-duplicate' duplicate in ExclusiveRequiredOptions options.", assertThrows(OptionValidationException.class, executable).getMessage()); executable = () -> { OptionRule.builder() .optional(TEST_NUM) .exclusive(TEST_TOPIC_PATTERN, TEST_TOPIC) .required(TEST_PORTS) .conditional(TEST_MODE, OptionTest.TestMode.TIMESTAMP, TEST_TIMESTAMP) .build(); }; // test conditional not found in other options assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - Conditional 'option.mode' not found in options.", 
assertThrows(OptionValidationException.class, executable).getMessage()); executable = () -> { OptionRule.builder() .optional(TEST_NUM, TEST_MODE) .exclusive(TEST_TOPIC_PATTERN, TEST_TOPIC) .required(TEST_PORTS) .conditional(TEST_MODE, OptionTest.TestMode.TIMESTAMP, TEST_TIMESTAMP) .conditional(TEST_NUM, 100, TEST_TIMESTAMP) .build(); }; // test parameter can only be controlled by one other parameter assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - ConditionalRequiredOptions 'option.timestamp' duplicate in ConditionalRequiredOptions options.", assertThrows(OptionValidationException.class, executable).getMessage()); // Test conditional only does not conflict with optional options // Test option TEST_TIMESTAMP executable = () -> { OptionRule.builder() .optional(TEST_NUM, TEST_MODE, TEST_TIMESTAMP) .exclusive(TEST_TOPIC_PATTERN, TEST_TOPIC) .required(TEST_PORTS) .conditional(TEST_MODE, OptionTest.TestMode.TIMESTAMP, TEST_TIMESTAMP) .conditional(TEST_MODE, OptionTest.TestMode.LATEST, TEST_TIMESTAMP) .build(); }; assertDoesNotThrow(executable); executable = () -> { OptionRule.builder() .optional(TEST_NUM, TEST_MODE) .exclusive(TEST_TOPIC_PATTERN, TEST_TOPIC, TEST_TIMESTAMP) .required(TEST_PORTS) .conditional(TEST_MODE, OptionTest.TestMode.TIMESTAMP, TEST_TIMESTAMP) .build(); }; assertEquals( "ErrorCode:[API-02], ErrorDescription:[Option item validate failed] - ConditionalRequiredOptions 'option.timestamp' duplicate in ExclusiveRequiredOptions options.", assertThrows(OptionValidationException.class, executable).getMessage()); } /** Two rules are expected to be equal whether the optional options are added in one {@code optional(...)} call or in two. */ @Test public void testEquals() { OptionRule rule1 = OptionRule.builder() .optional(TEST_NUM, TEST_MODE) .required(TEST_PORTS) .exclusive(TEST_TOPIC_PATTERN, TEST_TOPIC) .conditional(TEST_MODE, OptionTest.TestMode.TIMESTAMP, TEST_TIMESTAMP) .build(); OptionRule rule2 = OptionRule.builder() .optional(TEST_NUM) .optional(TEST_MODE) .required(TEST_PORTS) .exclusive(TEST_TOPIC_PATTERN, TEST_TOPIC) 
.conditional(TEST_MODE, OptionTest.TestMode.TIMESTAMP, TEST_TIMESTAMP) .build(); Assertions.assertEquals(rule1, rule2); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/configuration/util/OptionUtilTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.configuration.util; import org.apache.seatunnel.api.configuration.Option; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.util.Comparator; import java.util.List; import java.util.Map; /** Checks the options that {@code OptionUtil.getOptions} derives from {@link TestOptionConfig}: their types, keys, descriptions and default values. */ public class OptionUtilTest { /** The options are sorted by key first, so every index used below refers to a position in key order. */ @Test public void test() throws InstantiationException, IllegalAccessException { List> options = OptionUtil.getOptions(TestOptionConfig.class); options.sort(Comparator.comparing(Option::key)); Assertions.assertEquals(Boolean.class, options.get(0).typeReference().getType()); Assertions.assertEquals(true, options.get(0).defaultValue()); Assertions.assertEquals(Byte.class, options.get(1).typeReference().getType()); Assertions.assertEquals(Character.class, options.get(2).typeReference().getType()); Assertions.assertEquals(Double.class, options.get(3).typeReference().getType()); Assertions.assertEquals( TestOptionConfigEnum.class, options.get(4).typeReference().getType()); Assertions.assertEquals(TestOptionConfigEnum.KEY2, options.get(4).defaultValue()); Assertions.assertEquals(Float.class, options.get(5).typeReference().getType()); Assertions.assertEquals(Integer.class, options.get(6).typeReference().getType()); Assertions.assertEquals("int_value", options.get(6).key()); Assertions.assertEquals("", options.get(6).getDescription()); Assertions.assertNull(options.get(6).defaultValue()); Assertions.assertEquals(List.class, options.get(7).typeReference().getType()); Assertions.assertEquals(Long.class, options.get(8).typeReference().getType()); Assertions.assertEquals(Map.class, options.get(9).typeReference().getType()); Assertions.assertEquals(TestOptionConfig.class, options.get(10).typeReference().getType()); Assertions.assertEquals("short-value", options.get(11).key()); Assertions.assertEquals("shortValue", options.get(11).getDescription()); Assertions.assertEquals(Short.class, options.get(11).typeReference().getType()); Assertions.assertEquals(String.class, 
options.get(12).typeReference().getType()); Assertions.assertEquals("default string", options.get(12).defaultValue()); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/configuration/util/SingleChoiceOptionTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.configuration.util; import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; import org.apache.seatunnel.api.configuration.SingleChoiceOption; import org.apache.seatunnel.api.sink.DataSaveMode; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.util.Arrays; import java.util.List; /** Checks that options created through {@code singleChoice(...)} keep their allowed values and default once registered in an {@link OptionRule}. */ public class SingleChoiceOptionTest { /** Registers a String-based and an enum-based single-choice option as optional, then reads them back in the order they were added. */ @Test public void test() { Option stringOption = Options.key("test_single_choice") .singleChoice(String.class, Arrays.asList("A", "B", "C")) .defaultValue("A"); Option saveModeOption = Options.key("save_mode") .singleChoice( DataSaveMode.class, Arrays.asList(DataSaveMode.APPEND_DATA, DataSaveMode.DROP_DATA)) .defaultValue(DataSaveMode.APPEND_DATA) .withDescription("save mode test"); OptionRule build = OptionRule.builder().optional(stringOption, saveModeOption).build(); List> optionalOptions = build.getOptionalOptions(); Option option = optionalOptions.get(0); Assertions.assertTrue(SingleChoiceOption.class.isAssignableFrom(option.getClass())); SingleChoiceOption singleChoiceOption = (SingleChoiceOption) option; Assertions.assertEquals(3, singleChoiceOption.getOptionValues().size()); Assertions.assertEquals("A", singleChoiceOption.defaultValue()); option = optionalOptions.get(1); singleChoiceOption = (SingleChoiceOption) option; Assertions.assertEquals(2, singleChoiceOption.getOptionValues().size()); Assertions.assertEquals(DataSaveMode.APPEND_DATA, singleChoiceOption.defaultValue()); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/configuration/util/TestOptionConfig.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.configuration.util; import lombok.Data; import java.util.List; import java.util.Map; /** Test fixture whose fields are all annotated with {@code @OptionMark}; it is the class OptionUtilTest passes to {@code OptionUtil.getOptions}. Only {@code stringValue}, {@code booleanValue} and {@code enumValue} carry initial values. */ @Data public class TestOptionConfig { /** The only field that sets an explicit {@code name} on its mark. */ @OptionMark(name = "short-value", description = "shortValue") private Short shortValue; /** Marked without any attributes. */ @OptionMark private Integer intValue; @OptionMark(description = "longValue") private Long longValue; @OptionMark(description = "floatValue") private Float floatValue; @OptionMark(description = "doubleValue") private Double doubleValue; @OptionMark(description = "stringValue") private String stringValue = "default string"; @OptionMark(description = "booleanValue") private Boolean booleanValue = true; @OptionMark(description = "byteValue") private Byte byteValue; @OptionMark(description = "charValue") private Character charValue; @OptionMark(description = "enumValue") private TestOptionConfigEnum enumValue = TestOptionConfigEnum.KEY2; /** Self-referencing field: its type is this same class. */ @OptionMark(description = "objectValue") private TestOptionConfig objectValue; @OptionMark(description = "listValue") private List listValue; @OptionMark(description = "mapValue") private Map mapValue; } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/configuration/util/TestOptionConfigEnum.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.configuration.util; /** Two-constant enum used as the type of the {@code enumValue} field in TestOptionConfig, where {@code KEY2} is the initial value. */ public enum TestOptionConfigEnum { KEY1, KEY2 } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/env/EnvOptionRuleTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.env; import org.apache.seatunnel.api.configuration.util.OptionRule; import org.apache.seatunnel.api.options.EnvOptionRule; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; /** Smoke test for {@link EnvOptionRule}. */ public class EnvOptionRuleTest { /** Only checks that {@code optionRule()} returns a non-null rule; the rule's contents are not inspected. */ @Test public void testGetEnvOptionRules() throws Exception { OptionRule envOptionRules = new EnvOptionRule().optionRule(); Assertions.assertNotNull(envOptionRules); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/metalake/TableSchemaDiscovererTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.metalake; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.ObjectMapper; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.node.ArrayNode; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.node.ObjectNode; import org.apache.seatunnel.shade.com.typesafe.config.Config; import org.apache.seatunnel.shade.com.typesafe.config.ConfigFactory; import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.api.metalake.gravitino.GravitinoTableSchemaConvertor; import org.apache.seatunnel.api.options.EnvCommonOptions; import org.apache.seatunnel.api.options.table.TableSchemaOptions; import org.apache.seatunnel.api.table.catalog.CatalogTable; import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.api.table.factory.TableSourceFactoryContext; import org.apache.seatunnel.common.constants.MetaLakeType; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; import java.io.File; import java.net.URISyntaxException; import java.net.URL; import java.nio.file.Paths; import java.util.HashMap; import java.util.List; import java.util.Map; import static org.mockito.Mockito.when; @ExtendWith(MockitoExtension.class) public class TableSchemaDiscovererTest { private static final String TEST_CATALOG_NAME = "test_catalog"; @Mock private MetalakeClient metalakeClient; private final MetaLakeTableSchemaConvertor convertor = new GravitinoTableSchemaConvertor(); @Test void testDiscoverTableSchemasWithSingleSchemaFields() throws URISyntaxException { Config config = loadConfig("/conf/table_schema_discoverer/single_schema_field.conf"); ReadonlyConfig sourceOptions = ReadonlyConfig.fromConfig(config); ReadonlyConfig envOptions = 
ReadonlyConfig.fromMap(new HashMap<>()); try (TableSchemaDiscoverer discoverer = new TableSchemaDiscoverer( envOptions, sourceOptions, TEST_CATALOG_NAME, null, null)) { Assertions.assertFalse(discoverer.enableMetaLakeClient(sourceOptions)); List result = discoverer.discoverTableSchemas(); Assertions.assertEquals(1, result.size()); Assertions.assertEquals(TEST_CATALOG_NAME, result.get(0).getCatalogName()); Assertions.assertEquals( TablePath.of("default", "default", "default"), result.get(0).getTablePath()); Assertions.assertEquals(3, result.get(0).getTableSchema().getColumns().size()); } } @Test void testDiscoverTableSchemasWithSingleSchemaSchemaUrl() throws Exception { Config config = loadConfig("/conf/table_schema_discoverer/single_schema_url.conf"); ReadonlyConfig sourceOptions = ReadonlyConfig.fromConfig(config); ReadonlyConfig envOptions = ReadonlyConfig.fromMap(new HashMap<>()); // Mock setup with real JsonNode structure JsonNode schemaNode = createMockTableSchemaNode("test_table"); String schemaUrl = "http://localhost:8090/api/metalakes/test_catalog/schemas/test_schema/tables/test_table"; when(metalakeClient.getTableSchema(schemaUrl)).thenReturn(schemaNode); when(metalakeClient.getTableSchemaPath(schemaUrl)) .thenReturn(TablePath.of("test_catalog", "test_schema", "test_table")); try (TableSchemaDiscoverer discoverer = new TableSchemaDiscoverer( envOptions, sourceOptions, TEST_CATALOG_NAME, metalakeClient, convertor)) { Assertions.assertTrue(discoverer.enableMetaLakeClient(sourceOptions)); List result = discoverer.discoverTableSchemas(); Assertions.assertEquals(1, result.size()); Assertions.assertEquals(TEST_CATALOG_NAME, result.get(0).getCatalogName()); Assertions.assertEquals( TablePath.of("test_catalog", "test_schema", "test_table"), result.get(0).getTablePath()); Assertions.assertEquals(2, result.get(0).getTableSchema().getColumns().size()); } } @Test void testDiscoverTableSchemasWithMultipleTablesFields() throws URISyntaxException { Config config = 
loadConfig("/conf/table_schema_discoverer/multiple_tables_fields.conf"); ReadonlyConfig sourceOptions = ReadonlyConfig.fromConfig(config); ReadonlyConfig envOptions = ReadonlyConfig.fromMap(new HashMap<>()); try (TableSchemaDiscoverer discoverer = new TableSchemaDiscoverer( envOptions, sourceOptions, TEST_CATALOG_NAME, null, null)) { Assertions.assertFalse(discoverer.enableMetaLakeClient(sourceOptions)); List result = discoverer.discoverTableSchemas(); Assertions.assertEquals(2, result.size()); Assertions.assertEquals(TEST_CATALOG_NAME, result.get(0).getCatalogName()); Assertions.assertEquals( TablePath.of("db", null, "table1"), result.get(0).getTablePath()); Assertions.assertEquals(1, result.get(0).getTableSchema().getColumns().size()); Assertions.assertEquals(TEST_CATALOG_NAME, result.get(1).getCatalogName()); Assertions.assertEquals( TablePath.of("db", null, "table2"), result.get(1).getTablePath()); Assertions.assertEquals(3, result.get(1).getTableSchema().getColumns().size()); } } @Test void testDiscoverTableSchemasWithMultipleTablesSchemaUrl() throws Exception { Config config = loadConfig("/conf/table_schema_discoverer/multiple_tables_schema_url.conf"); ReadonlyConfig sourceOptions = ReadonlyConfig.fromConfig(config); ReadonlyConfig envOptions = ReadonlyConfig.fromMap(new HashMap<>()); // url String schemaUrl1 = "http://localhost:8090/api/metalakes/test_catalog/schemas/test_schema/tables/table1"; String schemaUrl2 = "http://localhost:8090/api/metalakes/test_catalog/schemas/test_schema/tables/table2"; // Mock setup with real JsonNode structure JsonNode schemaNode1 = createMockTableSchemaNode("table1"); JsonNode schemaNode2 = createMockTableSchemaNode("table2"); // json node when(metalakeClient.getTableSchema(schemaUrl1)).thenReturn(schemaNode1); when(metalakeClient.getTableSchema(schemaUrl2)).thenReturn(schemaNode2); when(metalakeClient.getTableSchemaPath(schemaUrl2)) .thenReturn(TablePath.of("test_catalog", "test_schema", "table2")); // discoverer try 
(TableSchemaDiscoverer discoverer = new TableSchemaDiscoverer( envOptions, sourceOptions, TEST_CATALOG_NAME, metalakeClient, convertor)) { List result = discoverer.discoverTableSchemas(); Assertions.assertTrue(discoverer.enableMetaLakeClient(sourceOptions)); Assertions.assertEquals(2, result.size()); Assertions.assertEquals(TEST_CATALOG_NAME, result.get(0).getCatalogName()); Assertions.assertEquals( TablePath.of("test_database.test_schema.test_table1"), result.get(0).getTablePath()); Assertions.assertEquals(2, result.get(0).getTableSchema().getColumns().size()); Assertions.assertEquals(TEST_CATALOG_NAME, result.get(1).getCatalogName()); Assertions.assertEquals( TablePath.of("test_catalog", "test_schema", "table2"), result.get(1).getTablePath()); Assertions.assertEquals(2, result.get(1).getTableSchema().getColumns().size()); } } @Test void testDiscoverTableSchemasWithMultipleTablesMixedFieldsAndSchemaUrl() throws Exception { Config config = loadConfig("/conf/table_schema_discoverer/multiple_tables_mixed.conf"); ReadonlyConfig sourceOptions = ReadonlyConfig.fromConfig(config); ReadonlyConfig envOptions = ReadonlyConfig.fromMap(new HashMap<>()); JsonNode schemaNode2 = createMockTableSchemaNode("table2"); String url2 = "http://localhost:8090/api/metalakes/test_catalog/schemas/test_schema/tables/table2"; when(metalakeClient.getTableSchema(url2)).thenReturn(schemaNode2); when(metalakeClient.getTableSchemaPath(url2)) .thenReturn(TablePath.of("test_catalog", "test_schema", "table2")); try (TableSchemaDiscoverer discoverer = new TableSchemaDiscoverer( envOptions, sourceOptions, TEST_CATALOG_NAME, metalakeClient, convertor)) { Assertions.assertTrue(discoverer.enableMetaLakeClient(sourceOptions)); List result = discoverer.discoverTableSchemas(); Assertions.assertEquals(2, result.size()); Assertions.assertEquals(TEST_CATALOG_NAME, result.get(0).getCatalogName()); Assertions.assertEquals(TablePath.of("db.table1"), result.get(0).getTablePath()); Assertions.assertEquals(2, 
result.get(0).getTableSchema().getColumns().size()); Assertions.assertEquals(TEST_CATALOG_NAME, result.get(1).getCatalogName()); Assertions.assertEquals( TablePath.of("test_catalog.test_schema.table2"), result.get(1).getTablePath()); Assertions.assertEquals(2, result.get(1).getTableSchema().getColumns().size()); } } @Test void testGetMetaLakeTypeFromSourceOptions() { Map sourceConfig = new HashMap<>(); sourceConfig.put(TableSchemaOptions.METALAKE_TYPE.key(), MetaLakeType.GRAVITINO.name()); ReadonlyConfig sourceOptions = ReadonlyConfig.fromMap(sourceConfig); ReadonlyConfig envOptions = ReadonlyConfig.fromMap(new HashMap<>()); TableSourceFactoryContext context = new TableSourceFactoryContext( sourceOptions, getClass().getClassLoader(), envOptions); try (TableSchemaDiscoverer discoverer = new TableSchemaDiscoverer(context, TEST_CATALOG_NAME)) { MetaLakeType result = discoverer.getMetaLakeType(); Assertions.assertEquals(MetaLakeType.GRAVITINO, result); } } @Test void testGetMetaLakeTypeFromEnvOptions() { ReadonlyConfig sourceOptions = ReadonlyConfig.fromMap(new HashMap<>()); Map envConfig = new HashMap<>(); envConfig.put(EnvCommonOptions.METALAKE_TYPE.key(), MetaLakeType.GRAVITINO.name()); ReadonlyConfig envOptions = ReadonlyConfig.fromMap(envConfig); TableSourceFactoryContext context = new TableSourceFactoryContext( sourceOptions, getClass().getClassLoader(), envOptions); try (TableSchemaDiscoverer discoverer = new TableSchemaDiscoverer(context, TEST_CATALOG_NAME)) { MetaLakeType result = discoverer.getMetaLakeType(); Assertions.assertEquals(MetaLakeType.GRAVITINO, result); } } @Test void testGetMetaLakeTypeFromSystemEnvironment() { ReadonlyConfig sourceOptions = ReadonlyConfig.fromMap(new HashMap<>()); ReadonlyConfig envOptions = ReadonlyConfig.fromMap(new HashMap<>()); TableSourceFactoryContext context = new TableSourceFactoryContext( sourceOptions, getClass().getClassLoader(), envOptions); System.setProperty( EnvCommonOptions.METALAKE_TYPE.key().toUpperCase(), 
MetaLakeType.GRAVITINO.name()); try (TableSchemaDiscoverer discoverer = new TableSchemaDiscoverer(context, TEST_CATALOG_NAME)) { MetaLakeType result = discoverer.getMetaLakeType(); Assertions.assertEquals(MetaLakeType.GRAVITINO, result); } } @Test void testGetMetaLakeTypeDefaultValue() { ReadonlyConfig sourceOptions = ReadonlyConfig.fromMap(new HashMap<>()); ReadonlyConfig envOptions = ReadonlyConfig.fromMap(new HashMap<>()); TableSourceFactoryContext context = new TableSourceFactoryContext( sourceOptions, getClass().getClassLoader(), envOptions); try (TableSchemaDiscoverer discoverer = new TableSchemaDiscoverer(context, TEST_CATALOG_NAME)) { MetaLakeType result = discoverer.getMetaLakeType(); Assertions.assertEquals(MetaLakeType.GRAVITINO, result); } } @Test void testGetMetaLakeTypePrioritySourceOverEnv() { Map sourceConfig = new HashMap<>(); sourceConfig.put(TableSchemaOptions.METALAKE_TYPE.key(), MetaLakeType.GRAVITINO.name()); ReadonlyConfig sourceOptions = ReadonlyConfig.fromMap(sourceConfig); Map envConfig = new HashMap<>(); envConfig.put(EnvCommonOptions.METALAKE_TYPE.key(), "other_type"); ReadonlyConfig envOptions = ReadonlyConfig.fromMap(envConfig); TableSourceFactoryContext context = new TableSourceFactoryContext( sourceOptions, getClass().getClassLoader(), envOptions); try (TableSchemaDiscoverer discoverer = new TableSchemaDiscoverer(context, TEST_CATALOG_NAME)) { MetaLakeType result = discoverer.getMetaLakeType(); Assertions.assertEquals(MetaLakeType.GRAVITINO, result); } } @Test void testGetMetaLakeTypePriorityEnvOverSystem() { ReadonlyConfig sourceOptions = ReadonlyConfig.fromMap(new HashMap<>()); Map envConfig = new HashMap<>(); envConfig.put(EnvCommonOptions.METALAKE_TYPE.key(), MetaLakeType.GRAVITINO.name()); ReadonlyConfig envOptions = ReadonlyConfig.fromMap(envConfig); TableSourceFactoryContext context = new TableSourceFactoryContext( sourceOptions, getClass().getClassLoader(), envOptions); 
System.setProperty(EnvCommonOptions.METALAKE_TYPE.key().toUpperCase(), "other_type"); try (TableSchemaDiscoverer discoverer = new TableSchemaDiscoverer(context, TEST_CATALOG_NAME)) { MetaLakeType result = discoverer.getMetaLakeType(); Assertions.assertEquals(MetaLakeType.GRAVITINO, result); } } @Test void testDiscoverTableSchemaWithSingleParquetNoSchema() throws URISyntaxException { Config config = loadConfig("/conf/table_schema_discoverer/single_no_schema.conf"); ReadonlyConfig sourceOptions = ReadonlyConfig.fromConfig(config); ReadonlyConfig envOptions = ReadonlyConfig.fromMap(new HashMap<>()); try (TableSchemaDiscoverer discoverer = new TableSchemaDiscoverer( envOptions, sourceOptions, TEST_CATALOG_NAME, null, null)) { Assertions.assertFalse(discoverer.enableMetaLakeClient(sourceOptions)); List result = discoverer.discoverTableSchemas(); // When no schema is configured, should return a simple text table Assertions.assertEquals(1, result.size()); // Catalog name is "schema" from buildSimpleTextTable() Assertions.assertEquals("schema", result.get(0).getCatalogName()); // TablePath is (database="default", schema=null, tableName="default") Assertions.assertEquals( TablePath.of("default", null, "default"), result.get(0).getTablePath()); Assertions.assertNotNull(result.get(0).getTableSchema()); Assertions.assertEquals(1, result.get(0).getTableSchema().getColumns().size()); Assertions.assertEquals( "content", result.get(0).getTableSchema().getColumns().get(0).getName()); } } @Test void testDiscoverTableSchemasWithMultipleTablesNoSchemaMixedFormat() throws URISyntaxException { Config config = loadConfig( "/conf/table_schema_discoverer/multiple_tables_no_schema_mixed_format.conf"); ReadonlyConfig sourceOptions = ReadonlyConfig.fromConfig(config); ReadonlyConfig envOptions = ReadonlyConfig.fromMap(new HashMap<>()); try (TableSchemaDiscoverer discoverer = new TableSchemaDiscoverer( envOptions, sourceOptions, TEST_CATALOG_NAME, null, null)) { 
Assertions.assertFalse(discoverer.enableMetaLakeClient(sourceOptions)); List result = discoverer.discoverTableSchemas(); // Should return 3 tables for parquet, orc, and binary file formats Assertions.assertEquals(3, result.size()); // First table (parquet) - db.parquet_table // catalogName is "schema" from buildSimpleTextTable() Assertions.assertEquals("schema", result.get(0).getCatalogName()); Assertions.assertEquals( TablePath.of("db", "parquet_table"), result.get(0).getTablePath()); Assertions.assertNotNull(result.get(0).getTableSchema()); Assertions.assertEquals(1, result.get(0).getTableSchema().getColumns().size()); Assertions.assertEquals( "content", result.get(0).getTableSchema().getColumns().get(0).getName()); // Second table (orc) - db.orc_table Assertions.assertEquals("schema", result.get(1).getCatalogName()); Assertions.assertEquals(TablePath.of("db", "orc_table"), result.get(1).getTablePath()); Assertions.assertNotNull(result.get(1).getTableSchema()); Assertions.assertEquals(1, result.get(1).getTableSchema().getColumns().size()); Assertions.assertEquals( "content", result.get(1).getTableSchema().getColumns().get(0).getName()); // Third table (binary) - db.binary_table Assertions.assertEquals("schema", result.get(2).getCatalogName()); Assertions.assertEquals( TablePath.of("db", "binary_table"), result.get(2).getTablePath()); Assertions.assertNotNull(result.get(2).getTableSchema()); Assertions.assertEquals(1, result.get(2).getTableSchema().getColumns().size()); Assertions.assertEquals( "content", result.get(2).getTableSchema().getColumns().get(0).getName()); } } /** * Load configuration file from test resources. 
* * @param configPath the path to the configuration file * @return the Config object * @throws URISyntaxException if the path is invalid */ private Config loadConfig(String configPath) throws URISyntaxException { URL resourceUrl = getClass().getResource(configPath); if (resourceUrl == null) { throw new IllegalArgumentException("Config file not found: " + configPath); } File configFile = Paths.get(resourceUrl.toURI()).toFile(); return ConfigFactory.parseFile(configFile); } /** * Create a mock table schema JsonNode for testing. The structure matches Gravitino's table * schema format. */ private JsonNode createMockTableSchemaNode(String tableName) { ObjectMapper mapper = new ObjectMapper(); // Create table node ObjectNode tableNode = mapper.createObjectNode(); tableNode.put("name", tableName); // Create columns array ArrayNode columnsArray = mapper.createArrayNode(); // Column 1: id (integer, not null) ObjectNode column1 = mapper.createObjectNode(); column1.put("name", "id"); column1.put("type", "integer"); column1.put("nullable", false); column1.put("autoIncrement", false); columnsArray.add(column1); // Column 2: big_number (long, nullable, with default value) ObjectNode column2 = mapper.createObjectNode(); column2.put("name", "big_number"); column2.put("type", "long"); column2.put("nullable", true); column2.put("autoIncrement", false); // Default value node ObjectNode defaultValue = mapper.createObjectNode(); defaultValue.put("type", "literal"); defaultValue.put("dataType", "null"); defaultValue.put("value", "NULL"); column2.set("defaultValue", defaultValue); columnsArray.add(column2); tableNode.set("columns", columnsArray); return tableNode; } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/metalake/gravitino/GravitinoClientTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.metalake.gravitino;

import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode;

import org.apache.seatunnel.api.table.catalog.TablePath;

import org.apache.http.HttpEntity;
import org.apache.http.StatusLine;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.mockito.junit.jupiter.MockitoSettings;
import org.mockito.quality.Strictness;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

/**
 * Tests for {@link GravitinoClient}: table path parsing from a schema URL and the retry
 * behaviour of {@code getTableSchema} against a mocked HTTP client.
 */
@ExtendWith(MockitoExtension.class)
@MockitoSettings(strictness = Strictness.LENIENT)
public class GravitinoClientTest {

    private static final String TEST_URL = "http://localhost:8090/api/test/tables/test_table";

    /** Successful response body; the tests read {@code name} from the returned node. */
    private static final String TABLE_RESPONSE_JSON = "{\"table\":{\"name\":\"test_table\"}}";

    @Mock private CloseableHttpClient mockHttpClient;

    @Mock private CloseableHttpResponse mockResponse;

    @Mock private HttpEntity mockEntity;

    @Mock private StatusLine mockStatusLine;

    // ========== TablePath Parsing Tests ==========

    @Test
    void testGetTableSchemaPathWithFullUrl() {
        String url = "http://localhost:8090/catalogs/postgres/schemas/public/tables/users";

        try (GravitinoClient client = new GravitinoClient()) {
            TablePath tablePath = client.getTableSchemaPath(url);

            Assertions.assertNotNull(tablePath);
            Assertions.assertEquals("postgres", tablePath.getDatabaseName());
            Assertions.assertEquals("public", tablePath.getSchemaName());
            Assertions.assertEquals("users", tablePath.getTableName());
        }
    }

    // ========== Retry Tests ==========

    @Test
    void testIOExceptionRetrySuccessAfterFailure() throws Exception {
        // Setup: first two calls fail with IOException, third succeeds
        setupMockResponse(200, TABLE_RESPONSE_JSON);
        when(mockHttpClient.execute(any()))
                .thenThrow(new IOException("Connection timeout"))
                .thenThrow(new IOException("Connection reset"))
                .thenReturn(mockResponse);

        // Execute
        try (GravitinoClient client = new GravitinoClient(mockHttpClient)) {
            JsonNode result = client.getTableSchema(TEST_URL);

            // Verify success
            Assertions.assertNotNull(result);
            Assertions.assertEquals("test_table", result.get("name").asText());
        }

        // Verify exactly 3 attempts were made
        verify(mockHttpClient, times(3)).execute(any());
    }

    @Test
    void testIOExceptionRetryExhaustedThrowsException() throws IOException {
        // Setup: all calls fail with IOException
        when(mockHttpClient.execute(any())).thenThrow(new IOException("Connection timeout"));

        // Execute
        try (GravitinoClient client = new GravitinoClient(mockHttpClient)) {
            Exception exception =
                    Assertions.assertThrows(Exception.class, () -> client.getTableSchema(TEST_URL));

            // Verify exception message contains URL and retry count
            Assertions.assertTrue(
                    exception.getMessage().contains(TEST_URL),
                    "Exception message should contain URL");
            Assertions.assertTrue(
                    exception.getMessage().contains("3 attempts"),
                    "Exception message should contain retry count");
        }

        // Verify exactly 3 attempts were made (MAX_RETRY_ATTEMPTS)
        verify(mockHttpClient, times(3)).execute(any());
    }

    @Test
    void testIOExceptionRetryWithSingleFailureThenSuccess() throws Exception {
        // Setup: first call fails, second succeeds
        setupMockResponse(200, TABLE_RESPONSE_JSON);
        when(mockHttpClient.execute(any()))
                .thenThrow(new IOException("Read timed out"))
                .thenReturn(mockResponse);

        // Execute
        try (GravitinoClient client = new GravitinoClient(mockHttpClient)) {
            JsonNode result = client.getTableSchema(TEST_URL);

            Assertions.assertNotNull(result);
            Assertions.assertEquals("test_table", result.get("name").asText());
        }

        // Verify 2 attempts were made
        verify(mockHttpClient, times(2)).execute(any());
    }

    @Test
    void testRetryableStatus503SuccessAfterRetry() throws Exception {
        // Setup: first call returns 503, second succeeds. Separate response mocks are used so
        // that each attempt observes its own status code.
        CloseableHttpResponse response503 = createMockResponse(503, null);
        CloseableHttpResponse response200 = createMockResponse(200, TABLE_RESPONSE_JSON);
        when(mockHttpClient.execute(any())).thenReturn(response503).thenReturn(response200);

        try (GravitinoClient client = new GravitinoClient(mockHttpClient)) {
            JsonNode result = client.getTableSchema(TEST_URL);

            Assertions.assertNotNull(result);
            Assertions.assertEquals("test_table", result.get("name").asText());
        }

        verify(mockHttpClient, times(2)).execute(any());
    }

    @Test
    void testRetryableStatus500IsRetried() throws Exception {
        // Setup: first returns 500, second succeeds
        CloseableHttpResponse response500 = createMockResponse(500, null);
        CloseableHttpResponse response200 = createMockResponse(200, TABLE_RESPONSE_JSON);
        when(mockHttpClient.execute(any())).thenReturn(response500).thenReturn(response200);

        try (GravitinoClient client = new GravitinoClient(mockHttpClient)) {
            JsonNode result = client.getTableSchema(TEST_URL);

            Assertions.assertNotNull(result);
            Assertions.assertEquals("test_table", result.get("name").asText());
        }

        verify(mockHttpClient, times(2)).execute(any());
    }

    @Test
    void testNonRetryableStatus404FailsImmediately() throws IOException {
        // Setup: 404 Not Found (non-retryable)
        CloseableHttpResponse response404 = createMockResponse(404, null);
        when(mockHttpClient.execute(any())).thenReturn(response404);

        try (GravitinoClient client = new GravitinoClient(mockHttpClient)) {
            Exception exception =
                    Assertions.assertThrows(Exception.class, () -> client.getTableSchema(TEST_URL));

            Assertions.assertTrue(exception.getMessage().contains("404"));
        }

        // Verify only 1 attempt was made
        verify(mockHttpClient, times(1)).execute(any());
    }

    @Test
    void testMixedFailuresBeforeSuccess() throws Exception {
        // Setup: IOException, then 503, then success
        CloseableHttpResponse response503 = createMockResponse(503, null);
        CloseableHttpResponse response200 = createMockResponse(200, TABLE_RESPONSE_JSON);
        when(mockHttpClient.execute(any()))
                .thenThrow(new IOException("Connection reset"))
                .thenReturn(response503)
                .thenReturn(response200);

        try (GravitinoClient client = new GravitinoClient(mockHttpClient)) {
            JsonNode result = client.getTableSchema(TEST_URL);

            Assertions.assertNotNull(result);
            Assertions.assertEquals("test_table", result.get("name").asText());
        }

        // Verify 3 attempts were made
        verify(mockHttpClient, times(3)).execute(any());
    }

    /**
     * Configures the shared {@code mockResponse} field with a status code and, optionally, a JSON
     * body.
     *
     * @param statusCode HTTP status code reported by the shared status line mock
     * @param jsonContent response body, or {@code null} to leave the entity unstubbed
     * @throws IOException declared by {@link HttpEntity#getContent()} while stubbing
     */
    private void setupMockResponse(int statusCode, String jsonContent) throws IOException {
        when(mockResponse.getStatusLine()).thenReturn(mockStatusLine);
        when(mockStatusLine.getStatusCode()).thenReturn(statusCode);

        if (jsonContent != null) {
            when(mockResponse.getEntity()).thenReturn(mockEntity);
            // Explicit charset keeps the payload independent of the platform default.
            when(mockEntity.getContent())
                    .thenReturn(
                            new ByteArrayInputStream(
                                    jsonContent.getBytes(StandardCharsets.UTF_8)));
            when(mockEntity.isStreaming()).thenReturn(false);
        }
    }

    /**
     * Create a mock HTTP response with specified status code and optional JSON content.
     *
     * @param statusCode HTTP status code
     * @param jsonContent JSON content (null for error responses without body)
     * @return mock CloseableHttpResponse
     * @throws IOException if setting up mock content fails
     */
    private CloseableHttpResponse createMockResponse(int statusCode, String jsonContent)
            throws IOException {
        CloseableHttpResponse response = mock(CloseableHttpResponse.class);
        StatusLine statusLine = mock(StatusLine.class);

        when(response.getStatusLine()).thenReturn(statusLine);
        when(statusLine.getStatusCode()).thenReturn(statusCode);

        if (jsonContent != null) {
            HttpEntity entity = mock(HttpEntity.class);
            when(response.getEntity()).thenReturn(entity);
            when(entity.getContent())
                    .thenReturn(
                            new ByteArrayInputStream(
                                    jsonContent.getBytes(StandardCharsets.UTF_8)));
        } else {
            when(response.getEntity()).thenReturn(null);
        }

        return response;
    }
}

================================================
FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/metalake/gravitino/GravitinoTableSchemaConvertorTest.java
================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.metalake.gravitino; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.ObjectMapper; import org.apache.seatunnel.api.table.catalog.CatalogTable; import org.apache.seatunnel.api.table.catalog.Column; import org.apache.seatunnel.api.table.catalog.ConstraintKey; import org.apache.seatunnel.api.table.catalog.PhysicalColumn; import org.apache.seatunnel.api.table.catalog.PrimaryKey; import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.api.table.catalog.TableSchema; import org.apache.seatunnel.api.table.type.ArrayType; import org.apache.seatunnel.api.table.type.BasicType; import org.apache.seatunnel.api.table.type.DecimalType; import org.apache.seatunnel.api.table.type.LocalTimeType; import org.apache.seatunnel.api.table.type.MapType; import org.apache.seatunnel.api.table.type.PrimitiveByteArrayType; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; 
import java.util.List;

/**
 * Tests for {@link GravitinoTableSchemaConvertor}: each case feeds a JSON table description to
 * {@code convertor} and checks the resulting {@link TableSchema}.
 */
public class GravitinoTableSchemaConvertorTest {

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
    private static final GravitinoTableSchemaConvertor CONVERTOR =
            new GravitinoTableSchemaConvertor();

    /** "boolean" maps to {@code BasicType.BOOLEAN_TYPE}; nullable=true is preserved. */
    @Test
    void testBooleanType() throws Exception {
        String json =
                "{\"columns\":[{\"name\":\"bool_col\",\"type\":\"boolean\",\"nullable\":true}]}";
        JsonNode metaInfo = OBJECT_MAPPER.readTree(json);
        TableSchema schema = CONVERTOR.convertor(metaInfo);

        // Typed list: a raw List would make columns.get(0) an Object and break the assignment.
        List<Column> columns = schema.getColumns();
        Assertions.assertEquals(1, columns.size());
        Column column = columns.get(0);
        Assertions.assertEquals("bool_col", column.getName());
        Assertions.assertEquals(BasicType.BOOLEAN_TYPE, column.getDataType());
        Assertions.assertTrue(column.isNullable());
    }

    /** "byte" maps to {@code BasicType.BYTE_TYPE}; nullable=false is preserved. */
    @Test
    void testByteType() throws Exception {
        String json =
                "{\"columns\":[{\"name\":\"byte_col\",\"type\":\"byte\",\"nullable\":false}]}";
        JsonNode metaInfo = OBJECT_MAPPER.readTree(json);
        TableSchema schema = CONVERTOR.convertor(metaInfo);

        List<Column> columns = schema.getColumns();
        Assertions.assertEquals(1, columns.size());
        Column column = columns.get(0);
        Assertions.assertEquals("byte_col", column.getName());
        Assertions.assertEquals(BasicType.BYTE_TYPE, column.getDataType());
        Assertions.assertFalse(column.isNullable());
    }

    /** "byte unsigned" is expected to map to {@code BasicType.BYTE_TYPE}. */
    @Test
    void testByteUnsignedType() throws Exception {
        String json =
                "{\"columns\":[{\"name\":\"byte_unsigned_col\",\"type\":\"byte unsigned\",\"nullable\":true}]}";
        JsonNode metaInfo = OBJECT_MAPPER.readTree(json);
        TableSchema schema = CONVERTOR.convertor(metaInfo);

        List<Column> columns = schema.getColumns();
        Assertions.assertEquals(1, columns.size());
        Column column = columns.get(0);
        Assertions.assertEquals("byte_unsigned_col", column.getName());
        Assertions.assertEquals(BasicType.BYTE_TYPE, column.getDataType());
    }

    /** "short" maps to {@code BasicType.SHORT_TYPE}. */
    @Test
    void testShortType() throws Exception {
        String json =
                "{\"columns\":[{\"name\":\"short_col\",\"type\":\"short\",\"nullable\":true}]}";
        JsonNode metaInfo = OBJECT_MAPPER.readTree(json);
TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("short_col", column.getName()); Assertions.assertEquals(BasicType.SHORT_TYPE, column.getDataType()); } @Test void testShortUnsignedType() throws Exception { String json = "{\"columns\":[{\"name\":\"short_unsigned_col\",\"type\":\"short unsigned\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("short_unsigned_col", column.getName()); Assertions.assertEquals(BasicType.SHORT_TYPE, column.getDataType()); } @Test void testIntegerType() throws Exception { String json = "{\"columns\":[{\"name\":\"int_col\",\"type\":\"integer\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("int_col", column.getName()); Assertions.assertEquals(BasicType.INT_TYPE, column.getDataType()); } @Test void testIntegerUnsignedType() throws Exception { String json = "{\"columns\":[{\"name\":\"int_unsigned_col\",\"type\":\"integer unsigned\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("int_unsigned_col", column.getName()); Assertions.assertEquals(BasicType.INT_TYPE, column.getDataType()); } @Test void testLongType() throws Exception { String json = "{\"columns\":[{\"name\":\"long_col\",\"type\":\"long\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); 
TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("long_col", column.getName()); Assertions.assertEquals(BasicType.LONG_TYPE, column.getDataType()); } @Test void testLongUnsignedType() throws Exception { String json = "{\"columns\":[{\"name\":\"long_unsigned_col\",\"type\":\"long unsigned\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("long_unsigned_col", column.getName()); Assertions.assertEquals(BasicType.LONG_TYPE, column.getDataType()); } @Test void testFloatType() throws Exception { String json = "{\"columns\":[{\"name\":\"float_col\",\"type\":\"float\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("float_col", column.getName()); Assertions.assertEquals(BasicType.FLOAT_TYPE, column.getDataType()); } @Test void testDoubleType() throws Exception { String json = "{\"columns\":[{\"name\":\"double_col\",\"type\":\"double\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("double_col", column.getName()); Assertions.assertEquals(BasicType.DOUBLE_TYPE, column.getDataType()); } @Test void testStringType() throws Exception { String json = "{\"columns\":[{\"name\":\"str_col\",\"type\":\"string\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = 
CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("str_col", column.getName()); Assertions.assertEquals(BasicType.STRING_TYPE, column.getDataType()); } @Test void testVarcharType() throws Exception { String json = "{\"columns\":[{\"name\":\"varchar_col\",\"type\":\"varchar(255)\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); PhysicalColumn column = (PhysicalColumn) columns.get(0); Assertions.assertEquals("varchar_col", column.getName()); Assertions.assertEquals(BasicType.STRING_TYPE, column.getDataType()); Assertions.assertEquals(Long.valueOf(255), column.getColumnLength()); } @Test void testCharType() throws Exception { String json = "{\"columns\":[{\"name\":\"char_col\",\"type\":\"char(10)\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); PhysicalColumn column = (PhysicalColumn) columns.get(0); Assertions.assertEquals("char_col", column.getName()); Assertions.assertEquals(BasicType.STRING_TYPE, column.getDataType()); Assertions.assertEquals(Long.valueOf(10), column.getColumnLength()); } @Test void testUuidType() throws Exception { String json = "{\"columns\":[{\"name\":\"uuid_col\",\"type\":\"uuid\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("uuid_col", column.getName()); Assertions.assertEquals(BasicType.STRING_TYPE, column.getDataType()); } @Test void testIntervalYearType() throws Exception { String json = 
"{\"columns\":[{\"name\":\"interval_year_col\",\"type\":\"interval_year\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("interval_year_col", column.getName()); Assertions.assertEquals(BasicType.STRING_TYPE, column.getDataType()); } @Test void testIntervalDayType() throws Exception { String json = "{\"columns\":[{\"name\":\"interval_day_col\",\"type\":\"interval_day\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("interval_day_col", column.getName()); Assertions.assertEquals(BasicType.STRING_TYPE, column.getDataType()); } @Test void testDateType() throws Exception { String json = "{\"columns\":[{\"name\":\"date_col\",\"type\":\"date\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("date_col", column.getName()); Assertions.assertEquals(LocalTimeType.LOCAL_DATE_TYPE, column.getDataType()); } @Test void testTimeType() throws Exception { String json = "{\"columns\":[{\"name\":\"time_col\",\"type\":\"time\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("time_col", column.getName()); Assertions.assertEquals(LocalTimeType.LOCAL_TIME_TYPE, column.getDataType()); } @Test void testTimestampType() throws Exception { String json = 
"{\"columns\":[{\"name\":\"timestamp_col\",\"type\":\"timestamp\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("timestamp_col", column.getName()); Assertions.assertEquals(LocalTimeType.LOCAL_DATE_TIME_TYPE, column.getDataType()); } @Test void testTimestampTzType() throws Exception { String json = "{\"columns\":[{\"name\":\"timestamp_tz_col\",\"type\":\"timestamp_tz\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("timestamp_tz_col", column.getName()); Assertions.assertEquals(LocalTimeType.OFFSET_DATE_TIME_TYPE, column.getDataType()); } @Test void testTimestampTypeWithPrecision() throws Exception { String json = "{\"columns\":[{\"name\":\"created_at\",\"type\":\"timestamp(6)\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); PhysicalColumn column = (PhysicalColumn) columns.get(0); Assertions.assertEquals("created_at", column.getName()); Assertions.assertEquals(LocalTimeType.LOCAL_DATE_TIME_TYPE, column.getDataType()); Assertions.assertEquals(Long.valueOf(6), column.getColumnLength()); } @Test void testTimestampTzTypeWithPrecision() throws Exception { String json = "{\"columns\":[{\"name\":\"updated_at\",\"type\":\"timestamp_tz(6)\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); PhysicalColumn column = (PhysicalColumn) columns.get(0); 
Assertions.assertEquals("updated_at", column.getName()); Assertions.assertEquals(LocalTimeType.OFFSET_DATE_TIME_TYPE, column.getDataType()); Assertions.assertEquals(Long.valueOf(6), column.getColumnLength()); } @Test void testBinaryType() throws Exception { String json = "{\"columns\":[{\"name\":\"binary_col\",\"type\":\"binary\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); Column column = columns.get(0); Assertions.assertEquals("binary_col", column.getName()); Assertions.assertEquals(PrimitiveByteArrayType.INSTANCE, column.getDataType()); } @Test void testFixedType() throws Exception { String json = "{\"columns\":[{\"name\":\"fixed_col\",\"type\":\"fixed(16)\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); PhysicalColumn column = (PhysicalColumn) columns.get(0); Assertions.assertEquals("fixed_col", column.getName()); Assertions.assertEquals(PrimitiveByteArrayType.INSTANCE, column.getDataType()); Assertions.assertEquals(Long.valueOf(16), column.getColumnLength()); } @Test void testDecimalType() throws Exception { String json = "{\"columns\":[{\"name\":\"decimal_col\",\"type\":\"decimal(10,2)\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); PhysicalColumn column = (PhysicalColumn) columns.get(0); Assertions.assertEquals("decimal_col", column.getName()); Assertions.assertEquals(new DecimalType(10, 2), column.getDataType()); Assertions.assertEquals(Integer.valueOf(2), column.getScale()); } @Test void testDecimalTypeWithDifferentPrecision() throws Exception { String json = 
"{\"columns\":[{\"name\":\"decimal_col\",\"type\":\"decimal(38,18)\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); PhysicalColumn column = (PhysicalColumn) columns.get(0); Assertions.assertEquals("decimal_col", column.getName()); Assertions.assertEquals(new DecimalType(38, 18), column.getDataType()); Assertions.assertEquals(Integer.valueOf(18), column.getScale()); } @Test void testDecimalTypeUpperCase() throws Exception { String json = "{\"columns\":[{\"name\":\"decimal_col\",\"type\":\"DECIMAL(20,5)\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); PhysicalColumn column = (PhysicalColumn) columns.get(0); Assertions.assertEquals("decimal_col", column.getName()); Assertions.assertEquals(new DecimalType(20, 5), column.getDataType()); Assertions.assertEquals(Integer.valueOf(5), column.getScale()); } @Test void testDecimalTypeWithSpaces() throws Exception { String json = "{\"columns\":[{\"name\":\"decimal_col\",\"type\":\"decimal( 10 , 2 )\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); PhysicalColumn column = (PhysicalColumn) columns.get(0); Assertions.assertEquals("decimal_col", column.getName()); Assertions.assertEquals(new DecimalType(10, 2), column.getDataType()); Assertions.assertEquals(Integer.valueOf(2), column.getScale()); } @Test void testListTypeWithSimpleElementType() throws Exception { String json = "{\"columns\":[{\"name\":\"list_col\",\"type\":{\"type\":\"list\",\"elementType\":\"integer\"},\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = 
CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); ArrayType arrayType = (ArrayType) columns.get(0).getDataType(); Assertions.assertEquals("list_col", columns.get(0).getName()); Assertions.assertEquals(BasicType.INT_TYPE, arrayType.getElementType()); } @Test void testListTypeWithStringElementType() throws Exception { String json = "{\"columns\":[{\"name\":\"list_col\",\"type\":{\"type\":\"list\",\"elementType\":\"string\"},\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); ArrayType arrayType = (ArrayType) columns.get(0).getDataType(); Assertions.assertEquals("list_col", columns.get(0).getName()); Assertions.assertEquals(BasicType.STRING_TYPE, arrayType.getElementType()); } @Test void testListTypeWithDecimalElementType() throws Exception { String json = "{\"columns\":[{\"name\":\"list_col\",\"type\":{\"type\":\"list\",\"elementType\":\"decimal(10,2)\"},\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); ArrayType arrayType = (ArrayType) columns.get(0).getDataType(); Assertions.assertEquals("list_col", columns.get(0).getName()); Assertions.assertEquals(new DecimalType(10, 2), arrayType.getElementType()); } @Test void testMapTypeWithStringKeyIntValue() throws Exception { String json = "{\"columns\":[{\"name\":\"map_col\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"integer\"},\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); MapType mapType = (MapType) columns.get(0).getDataType(); Assertions.assertEquals("map_col", 
columns.get(0).getName()); Assertions.assertEquals(BasicType.STRING_TYPE, mapType.getKeyType()); Assertions.assertEquals(BasicType.INT_TYPE, mapType.getValueType()); } @Test void testMapTypeWithIntKeyLongValue() throws Exception { String json = "{\"columns\":[{\"name\":\"map_col\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":\"long\"},\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); MapType mapType = (MapType) columns.get(0).getDataType(); Assertions.assertEquals("map_col", columns.get(0).getName()); Assertions.assertEquals(BasicType.INT_TYPE, mapType.getKeyType()); Assertions.assertEquals(BasicType.LONG_TYPE, mapType.getValueType()); } @Test void testMapTypeWithComplexTypes() throws Exception { String json = "{\"columns\":[{\"name\":\"map_col\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"decimal(10,2)\"},\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); MapType mapType = (MapType) columns.get(0).getDataType(); Assertions.assertEquals("map_col", columns.get(0).getName()); Assertions.assertEquals(BasicType.STRING_TYPE, mapType.getKeyType()); Assertions.assertEquals(new DecimalType(10, 2), mapType.getValueType()); } @Test void testStructTypeSimple() throws Exception { String json = "{\"columns\":[{\"name\":\"struct_col\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":false},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true}]},\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); SeaTunnelRowType rowType = 
(SeaTunnelRowType) columns.get(0).getDataType(); Assertions.assertEquals("struct_col", columns.get(0).getName()); Assertions.assertEquals(2, rowType.getTotalFields()); Assertions.assertEquals("id", rowType.getFieldName(0)); Assertions.assertEquals(BasicType.INT_TYPE, rowType.getFieldType(0)); Assertions.assertEquals("name", rowType.getFieldName(1)); Assertions.assertEquals(BasicType.STRING_TYPE, rowType.getFieldType(1)); } @Test void testStructTypeNested() throws Exception { String json = "{\"columns\":[{\"name\":\"struct_col\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"base\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true},{\"name\":\"flag\",\"type\":\"boolean\",\"nullable\":true}]},\"nullable\":true},{\"name\":\"ext\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"score\",\"type\":\"double\",\"nullable\":true}]},\"nullable\":true}]},\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); SeaTunnelRowType rowType = (SeaTunnelRowType) columns.get(0).getDataType(); Assertions.assertEquals("struct_col", columns.get(0).getName()); Assertions.assertEquals(2, rowType.getTotalFields()); // Check base field (nested struct) Assertions.assertEquals("base", rowType.getFieldName(0)); SeaTunnelRowType baseType = (SeaTunnelRowType) rowType.getFieldType(0); Assertions.assertEquals(2, baseType.getTotalFields()); Assertions.assertEquals("id", baseType.getFieldName(0)); Assertions.assertEquals(BasicType.LONG_TYPE, baseType.getFieldType(0)); Assertions.assertEquals("flag", baseType.getFieldName(1)); Assertions.assertEquals(BasicType.BOOLEAN_TYPE, baseType.getFieldType(1)); // Check ext field (nested struct) Assertions.assertEquals("ext", rowType.getFieldName(1)); SeaTunnelRowType extType = (SeaTunnelRowType) rowType.getFieldType(1); Assertions.assertEquals(1, 
extType.getTotalFields()); Assertions.assertEquals("score", extType.getFieldName(0)); Assertions.assertEquals(BasicType.DOUBLE_TYPE, extType.getFieldType(0)); } @Test void testStructTypeWithComplexFields() throws Exception { String json = "{\"columns\":[{\"name\":\"struct_col\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":false},{\"name\":\"tags\",\"type\":{\"type\":\"list\",\"elementType\":\"string\"},\"nullable\":true},{\"name\":\"metadata\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\"},\"nullable\":true}]},\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(1, columns.size()); SeaTunnelRowType rowType = (SeaTunnelRowType) columns.get(0).getDataType(); Assertions.assertEquals("struct_col", columns.get(0).getName()); Assertions.assertEquals(3, rowType.getTotalFields()); // Check id field Assertions.assertEquals("id", rowType.getFieldName(0)); Assertions.assertEquals(BasicType.INT_TYPE, rowType.getFieldType(0)); // Check tags field (array) Assertions.assertEquals("tags", rowType.getFieldName(1)); ArrayType tagsType = (ArrayType) rowType.getFieldType(1); Assertions.assertEquals(BasicType.STRING_TYPE, tagsType.getElementType()); // Check metadata field (map) Assertions.assertEquals("metadata", rowType.getFieldName(2)); MapType metadataType = (MapType) rowType.getFieldType(2); Assertions.assertEquals(BasicType.STRING_TYPE, metadataType.getKeyType()); Assertions.assertEquals(BasicType.STRING_TYPE, metadataType.getValueType()); } @Test void testStructWithoutFields() throws Exception { String json = "{\"columns\":[{\"name\":\"struct_col\",\"type\":{\"type\":\"struct\"},\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); SeaTunnelRuntimeException exception = Assertions.assertThrows( SeaTunnelRuntimeException.class, () -> 
CONVERTOR.convertor(metaInfo)); Assertions.assertTrue( exception.getMessage().contains("struct without fields array"), "Error message should mention missing fields"); Assertions.assertTrue(exception.getMessage().contains("struct_col")); } @Test void testUnsupportedUnionType() throws Exception { String json = "{\"columns\":[{\"name\":\"union_col\",\"type\":{\"type\":\"union\"},\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); SeaTunnelRuntimeException exception = Assertions.assertThrows( SeaTunnelRuntimeException.class, () -> CONVERTOR.convertor(metaInfo)); Assertions.assertTrue( exception.getMessage().contains("union"), "Error message should mention unsupported type 'union'"); Assertions.assertTrue(exception.getMessage().contains("union_col")); } @Test void testUnsupportedUnknownType() throws Exception { String json = "{\"columns\":[{\"name\":\"unknown_col\",\"type\":\"unsupported_type\",\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); SeaTunnelRuntimeException exception = Assertions.assertThrows( SeaTunnelRuntimeException.class, () -> CONVERTOR.convertor(metaInfo)); Assertions.assertTrue( exception.getMessage().contains("unsupported_type"), "Error message should mention unsupported type 'unsupported_type'"); Assertions.assertTrue(exception.getMessage().contains("unknown_col")); } @Test void testListWithoutElementType() throws Exception { String json = "{\"columns\":[{\"name\":\"list_col\",\"type\":{\"type\":\"list\"},\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); SeaTunnelRuntimeException exception = Assertions.assertThrows( SeaTunnelRuntimeException.class, () -> CONVERTOR.convertor(metaInfo)); Assertions.assertTrue( exception.getMessage().contains("list without elementType"), "Error message should mention missing elementType"); Assertions.assertTrue(exception.getMessage().contains("list_col")); } @Test void testMapWithoutKeyOrValueType() throws Exception { String json = 
"{\"columns\":[{\"name\":\"map_col\",\"type\":{\"type\":\"map\",\"keyType\":\"string\"},\"nullable\":true}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); SeaTunnelRuntimeException exception = Assertions.assertThrows( SeaTunnelRuntimeException.class, () -> CONVERTOR.convertor(metaInfo)); Assertions.assertTrue( exception.getMessage().contains("map without keyType or valueType"), "Error message should mention missing keyType or valueType"); Assertions.assertTrue(exception.getMessage().contains("map_col")); } @Test void testPrimaryKey() throws Exception { String json = "{\"columns\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":false},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true}]," + "\"indexes\":[{\"name\":\"pk\",\"indexType\":\"PRIMARY_KEY\",\"fieldNames\":[[\"id\"]]}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); PrimaryKey primaryKey = schema.getPrimaryKey(); Assertions.assertNotNull(primaryKey); Assertions.assertEquals("pk", primaryKey.getPrimaryKey()); Assertions.assertEquals(1, primaryKey.getColumnNames().size()); Assertions.assertEquals("id", primaryKey.getColumnNames().get(0)); } @Test void testPrimaryKeyWithMultipleColumns() throws Exception { String json = "{\"columns\":[{\"name\":\"id1\",\"type\":\"integer\",\"nullable\":false},{\"name\":\"id2\",\"type\":\"string\",\"nullable\":false},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true}]," + "\"indexes\":[{\"name\":\"pk\",\"indexType\":\"PRIMARY_KEY\",\"fieldNames\":[[\"id1\"],[\"id2\"]]}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); PrimaryKey primaryKey = schema.getPrimaryKey(); Assertions.assertNotNull(primaryKey); Assertions.assertEquals("pk", primaryKey.getPrimaryKey()); Assertions.assertEquals(2, primaryKey.getColumnNames().size()); Assertions.assertEquals("id1", primaryKey.getColumnNames().get(0)); Assertions.assertEquals("id2", 
primaryKey.getColumnNames().get(1));
    }

    /** A UNIQUE_KEY index is exposed as a constraint key with the indexed column. */
    @Test
    void testUniqueKey() throws Exception {
        String json =
                "{\"columns\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":false},{\"name\":\"email\",\"type\":\"string\",\"nullable\":true}],"
                        + "\"indexes\":[{\"name\":\"uk_email\",\"indexType\":\"UNIQUE_KEY\",\"fieldNames\":[[\"email\"]]}]}";
        JsonNode metaInfo = OBJECT_MAPPER.readTree(json);
        TableSchema schema = CONVERTOR.convertor(metaInfo);

        // Typed list: a raw List would make get(0) an Object and break the assignment below.
        List<ConstraintKey> constraintKeys = schema.getConstraintKeys();
        Assertions.assertEquals(1, constraintKeys.size());
        ConstraintKey uniqueKey = constraintKeys.get(0);
        Assertions.assertEquals("uk_email", uniqueKey.getConstraintName());
        Assertions.assertEquals(
                ConstraintKey.ConstraintType.UNIQUE_KEY, uniqueKey.getConstraintType());
        Assertions.assertEquals(1, uniqueKey.getColumnNames().size());
        Assertions.assertEquals("email", uniqueKey.getColumnNames().get(0).getColumnName());
    }

    /** Two UNIQUE_KEY indexes produce two constraint keys in declaration order. */
    @Test
    void testMultipleUniqueKeys() throws Exception {
        String json =
                "{\"columns\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":false},{\"name\":\"email\",\"type\":\"string\",\"nullable\":true},{\"name\":\"username\",\"type\":\"string\",\"nullable\":true}],"
                        + "\"indexes\":[{\"name\":\"uk_email\",\"indexType\":\"UNIQUE_KEY\",\"fieldNames\":[[\"email\"]]},{\"name\":\"uk_username\",\"indexType\":\"UNIQUE_KEY\",\"fieldNames\":[[\"username\"]]}]}";
        JsonNode metaInfo = OBJECT_MAPPER.readTree(json);
        TableSchema schema = CONVERTOR.convertor(metaInfo);

        List<ConstraintKey> constraintKeys = schema.getConstraintKeys();
        Assertions.assertEquals(2, constraintKeys.size());
        Assertions.assertEquals("uk_email", constraintKeys.get(0).getConstraintName());
        Assertions.assertEquals("uk_username", constraintKeys.get(1).getConstraintName());
    }

    /** A PRIMARY_KEY and a UNIQUE_KEY index on the same table are both reported. */
    @Test
    void testPrimaryKeyAndUniqueKey() throws Exception {
        String json =
                "{\"columns\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":false},{\"name\":\"email\",\"type\":\"string\",\"nullable\":true},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true}],"
                        +
"\"indexes\":[{\"name\":\"pk\",\"indexType\":\"PRIMARY_KEY\",\"fieldNames\":[[\"id\"]]},{\"name\":\"uk_email\",\"indexType\":\"UNIQUE_KEY\",\"fieldNames\":[[\"email\"]]}]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); PrimaryKey primaryKey = schema.getPrimaryKey(); Assertions.assertNotNull(primaryKey); Assertions.assertEquals("pk", primaryKey.getPrimaryKey()); List constraintKeys = schema.getConstraintKeys(); Assertions.assertEquals(1, constraintKeys.size()); Assertions.assertEquals("uk_email", constraintKeys.get(0).getConstraintName()); } @Test void testEmptyColumns() throws Exception { String json = "{\"columns\":[]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); SeaTunnelRuntimeException exception = Assertions.assertThrows( SeaTunnelRuntimeException.class, () -> CONVERTOR.convertor(metaInfo)); Assertions.assertTrue( exception.getMessage().contains("columns"), "Error message should mention empty columns"); } @Test void testNoColumnsField() throws Exception { String json = "{\"indexes\":[]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertTrue(columns.isEmpty()); } @Test void testCaseInsensitiveTypeMatching() throws Exception { String json = "{\"columns\":[" + "{\"name\":\"col1\",\"type\":\"BOOLEAN\",\"nullable\":true}," + "{\"name\":\"col2\",\"type\":\"INTEGER\",\"nullable\":true}," + "{\"name\":\"col3\",\"type\":\"STRING\",\"nullable\":true}," + "{\"name\":\"col4\",\"type\":\"DOUBLE\",\"nullable\":true}" + "]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(4, columns.size()); Assertions.assertEquals(BasicType.BOOLEAN_TYPE, columns.get(0).getDataType()); Assertions.assertEquals(BasicType.INT_TYPE, columns.get(1).getDataType()); Assertions.assertEquals(BasicType.STRING_TYPE, 
columns.get(2).getDataType()); Assertions.assertEquals(BasicType.DOUBLE_TYPE, columns.get(3).getDataType()); } @Test void testMixedCaseTypeWithParameters() throws Exception { String json = "{\"columns\":[" + "{\"name\":\"col1\",\"type\":\"VARCHAR(100)\",\"nullable\":true}," + "{\"name\":\"col2\",\"type\":\"CHAR(10)\",\"nullable\":true}," + "{\"name\":\"col3\",\"type\":\"DECIMAL(20,5)\",\"nullable\":true}," + "{\"name\":\"col4\",\"type\":\"Fixed(8)\",\"nullable\":true}" + "]}"; JsonNode metaInfo = OBJECT_MAPPER.readTree(json); TableSchema schema = CONVERTOR.convertor(metaInfo); List columns = schema.getColumns(); Assertions.assertEquals(4, columns.size()); PhysicalColumn col1 = (PhysicalColumn) columns.get(0); Assertions.assertEquals(BasicType.STRING_TYPE, col1.getDataType()); Assertions.assertEquals(Long.valueOf(100), col1.getColumnLength()); PhysicalColumn col2 = (PhysicalColumn) columns.get(1); Assertions.assertEquals(BasicType.STRING_TYPE, col2.getDataType()); Assertions.assertEquals(Long.valueOf(10), col2.getColumnLength()); PhysicalColumn col3 = (PhysicalColumn) columns.get(2); Assertions.assertEquals(new DecimalType(20, 5), col3.getDataType()); Assertions.assertEquals(Integer.valueOf(5), col3.getScale()); PhysicalColumn col4 = (PhysicalColumn) columns.get(3); Assertions.assertEquals(PrimitiveByteArrayType.INSTANCE, col4.getDataType()); Assertions.assertEquals(Long.valueOf(8), col4.getColumnLength()); } @Test void testBuildCatalogTableWithHiveMetadata() throws Exception { // Read metadata from JSON file String jsonPath = "/conf/json/metadata_json_from_meta_lake_hive.json"; JsonNode rootNode = OBJECT_MAPPER.readTree(getClass().getResourceAsStream(jsonPath)); JsonNode tableNode = rootNode.get("table"); // Convert metadata to TableSchema TableSchema tableSchema = CONVERTOR.convertor(tableNode); // Verify columns List columns = tableSchema.getColumns(); Assertions.assertEquals(20, columns.size()); // Verify basic types Assertions.assertEquals("c_tinyint", 
columns.get(0).getName()); Assertions.assertEquals(BasicType.BYTE_TYPE, columns.get(0).getDataType()); Assertions.assertEquals("c_smallint", columns.get(1).getName()); Assertions.assertEquals(BasicType.SHORT_TYPE, columns.get(1).getDataType()); Assertions.assertEquals("c_int", columns.get(2).getName()); Assertions.assertEquals(BasicType.INT_TYPE, columns.get(2).getDataType()); Assertions.assertEquals("c_bigint", columns.get(3).getName()); Assertions.assertEquals(BasicType.LONG_TYPE, columns.get(3).getDataType()); // Verify decimal type Assertions.assertEquals("c_decimal", columns.get(7).getName()); Assertions.assertEquals(new DecimalType(20, 6), columns.get(7).getDataType()); // Verify array types ArrayType arrayIntType = (ArrayType) columns.get(14).getDataType(); Assertions.assertEquals("c_array_int", columns.get(14).getName()); Assertions.assertEquals(BasicType.INT_TYPE, arrayIntType.getElementType()); ArrayType arrayStringType = (ArrayType) columns.get(15).getDataType(); Assertions.assertEquals("c_array_string", columns.get(15).getName()); Assertions.assertEquals(BasicType.STRING_TYPE, arrayStringType.getElementType()); // Verify map types MapType mapStrIntType = (MapType) columns.get(16).getDataType(); Assertions.assertEquals("c_map_str_int", columns.get(16).getName()); Assertions.assertEquals(BasicType.STRING_TYPE, mapStrIntType.getKeyType()); Assertions.assertEquals(BasicType.INT_TYPE, mapStrIntType.getValueType()); // Verify struct type - simple struct SeaTunnelRowType simpleStructType = (SeaTunnelRowType) columns.get(18).getDataType(); Assertions.assertEquals("c_struct_simple", columns.get(18).getName()); Assertions.assertEquals(2, simpleStructType.getTotalFields()); Assertions.assertEquals("id", simpleStructType.getFieldName(0)); Assertions.assertEquals(BasicType.INT_TYPE, simpleStructType.getFieldType(0)); Assertions.assertEquals("name", simpleStructType.getFieldName(1)); Assertions.assertEquals(BasicType.STRING_TYPE, simpleStructType.getFieldType(1)); // 
Verify struct type - nested struct SeaTunnelRowType nestedStructType = (SeaTunnelRowType) columns.get(19).getDataType(); Assertions.assertEquals("c_struct_nested", columns.get(19).getName()); Assertions.assertEquals(2, nestedStructType.getTotalFields()); // Check base field (nested struct) SeaTunnelRowType baseStruct = (SeaTunnelRowType) nestedStructType.getFieldType(0); Assertions.assertEquals("base", nestedStructType.getFieldName(0)); Assertions.assertEquals(2, baseStruct.getTotalFields()); Assertions.assertEquals("id", baseStruct.getFieldName(0)); Assertions.assertEquals(BasicType.LONG_TYPE, baseStruct.getFieldType(0)); Assertions.assertEquals("flag", baseStruct.getFieldName(1)); Assertions.assertEquals(BasicType.BOOLEAN_TYPE, baseStruct.getFieldType(1)); // Check ext field (nested struct with list) SeaTunnelRowType extStruct = (SeaTunnelRowType) nestedStructType.getFieldType(1); Assertions.assertEquals("ext", nestedStructType.getFieldName(1)); Assertions.assertEquals(2, extStruct.getTotalFields()); Assertions.assertEquals("score", extStruct.getFieldName(0)); Assertions.assertEquals(BasicType.DOUBLE_TYPE, extStruct.getFieldType(0)); Assertions.assertEquals("tags", extStruct.getFieldName(1)); ArrayType tagsArrayType = (ArrayType) extStruct.getFieldType(1); Assertions.assertEquals(BasicType.STRING_TYPE, tagsArrayType.getElementType()); // Build CatalogTable TablePath tablePath = TablePath.of("test_db", "test_schema", "all_hive_types_csv"); CatalogTable catalogTable = CONVERTOR.buildCatalogTable("hive_catalog", tablePath, tableSchema); // Verify CatalogTable properties Assertions.assertEquals("hive_catalog", catalogTable.getCatalogName()); Assertions.assertEquals("hive_catalog", catalogTable.getTableId().getCatalogName()); Assertions.assertEquals("test_db", catalogTable.getTableId().getDatabaseName()); Assertions.assertEquals("test_schema", catalogTable.getTableId().getSchemaName()); Assertions.assertEquals("all_hive_types_csv", 
catalogTable.getTableId().getTableName()); Assertions.assertEquals(tableSchema, catalogTable.getTableSchema()); } @Test void testBuildCatalogTableWithPostgresMetadata() throws Exception { // Read metadata from JSON file String jsonPath = "/conf/json/metadata_json_from_meta_lake_pgsql.json"; JsonNode rootNode = OBJECT_MAPPER.readTree(getClass().getResourceAsStream(jsonPath)); JsonNode tableNode = rootNode.get("table"); // Convert metadata to TableSchema TableSchema tableSchema = CONVERTOR.convertor(tableNode); // Verify columns List columns = tableSchema.getColumns(); Assertions.assertEquals(14, columns.size()); // Verify primary key PrimaryKey primaryKey = tableSchema.getPrimaryKey(); Assertions.assertNotNull(primaryKey); Assertions.assertEquals("all_type_pk", primaryKey.getPrimaryKey()); Assertions.assertEquals(1, primaryKey.getColumnNames().size()); Assertions.assertEquals("id", primaryKey.getColumnNames().get(0)); // Verify unique keys List constraintKeys = tableSchema.getConstraintKeys(); Assertions.assertEquals(1, constraintKeys.size()); Assertions.assertEquals( "all_type_big_number_idx", constraintKeys.get(0).getConstraintName()); Assertions.assertEquals( ConstraintKey.ConstraintType.UNIQUE_KEY, constraintKeys.get(0).getConstraintType()); // Verify basic column types Assertions.assertEquals("id", columns.get(0).getName()); Assertions.assertEquals(BasicType.INT_TYPE, columns.get(0).getDataType()); Assertions.assertFalse(columns.get(0).isNullable()); Assertions.assertEquals("big_number", columns.get(1).getName()); Assertions.assertEquals(BasicType.LONG_TYPE, columns.get(1).getDataType()); Assertions.assertEquals("decimal_value", columns.get(6).getName()); Assertions.assertEquals(new DecimalType(10, 2), columns.get(6).getDataType()); // Verify varchar types with length Assertions.assertEquals("user_name", columns.get(8).getName()); Assertions.assertEquals(BasicType.STRING_TYPE, columns.get(8).getDataType()); Assertions.assertEquals( Long.valueOf(300), 
((PhysicalColumn) columns.get(8)).getColumnLength()); // Verify external type (jsonb treated as string) Assertions.assertEquals("map_field", columns.get(12).getName()); Assertions.assertEquals(BasicType.STRING_TYPE, columns.get(12).getDataType()); // Verify list type ArrayType listFieldType = (ArrayType) columns.get(13).getDataType(); Assertions.assertEquals("list_field", columns.get(13).getName()); Assertions.assertEquals(BasicType.STRING_TYPE, listFieldType.getElementType()); // Build CatalogTable TablePath tablePath = TablePath.of("test_db", "public", "all_type"); CatalogTable catalogTable = CONVERTOR.buildCatalogTable("postgres_catalog", tablePath, tableSchema); // Verify CatalogTable properties Assertions.assertEquals("postgres_catalog", catalogTable.getCatalogName()); Assertions.assertEquals("postgres_catalog", catalogTable.getTableId().getCatalogName()); Assertions.assertEquals("test_db", catalogTable.getTableId().getDatabaseName()); Assertions.assertEquals("public", catalogTable.getTableId().getSchemaName()); Assertions.assertEquals("all_type", catalogTable.getTableId().getTableName()); Assertions.assertEquals(tableSchema, catalogTable.getTableSchema()); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/sink/DefaultSaveModeHandlerTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.sink;

import org.apache.seatunnel.api.table.catalog.Catalog;
import org.apache.seatunnel.api.table.catalog.CatalogTable;
import org.apache.seatunnel.api.table.catalog.CatalogTableUtil;
import org.apache.seatunnel.api.table.catalog.InMemoryCatalog;
import org.apache.seatunnel.api.table.catalog.InMemoryCatalogFactory;
import org.apache.seatunnel.api.table.catalog.TablePath;
import org.apache.seatunnel.api.table.type.BasicType;
import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
import org.apache.seatunnel.api.table.type.SeaTunnelRowType;
import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

/**
 * Tests how {@link DefaultSaveModeHandler} reacts to combinations of {@link SchemaSaveMode} and
 * {@link DataSaveMode}, using both an {@link InMemoryCatalog} and a Mockito-mocked {@link Catalog}.
 */
public class DefaultSaveModeHandlerTest {

    private SeaTunnelRowType rowType;
    private InMemoryCatalogFactory catalogFactory;

    /** Rebuilds the four-column row type and the catalog factory before every test. */
    @BeforeEach
    public void setup() {
        String[] fieldNames = new String[] {"id", "name", "description", "weight"};
        SeaTunnelDataType<?>[] dataTypes =
                new SeaTunnelDataType<?>[] {
                    BasicType.LONG_TYPE,
                    BasicType.STRING_TYPE,
                    BasicType.STRING_TYPE,
                    BasicType.STRING_TYPE
                };
        rowType = new SeaTunnelRowType(fieldNames, dataTypes);
        catalogFactory = new InMemoryCatalogFactory();
    }

    @Test
    public void shouldTruncateExistingTable() {
        // CREATE_SCHEMA_WHEN_NOT_EXIST + DROP_DATA on a table that already exists:
        // truncateTable must be executed.
        // NOTE(review): "table1" is presumably pre-registered by InMemoryCatalog - confirm.
        InMemoryCatalog inMemoryCatalog =
                runDropDataSaveMode(SchemaSaveMode.CREATE_SCHEMA_WHEN_NOT_EXIST, "table1");
        assertTrue(inMemoryCatalog.isRunTruncateTable(), "Should truncate data for existing table");
    }

    @Test
    public void shouldNotTruncateNewlyCreatedTable() {
        // CREATE_SCHEMA_WHEN_NOT_EXIST + DROP_DATA on a table that does not exist yet:
        // truncateTable does not need to be executed.
        InMemoryCatalog inMemoryCatalog =
                runDropDataSaveMode(SchemaSaveMode.CREATE_SCHEMA_WHEN_NOT_EXIST, "notExistsTable");
        assertFalse(
                inMemoryCatalog.isRunTruncateTable(),
                "Should not truncate data for newly created table");
    }

    @Test
    public void shouldNotTruncateRecreatedTable() {
        // RECREATE_SCHEMA + DROP_DATA: truncateTable does not need to be executed.
        InMemoryCatalog inMemoryCatalog =
                runDropDataSaveMode(SchemaSaveMode.RECREATE_SCHEMA, "notExistsTable");
        assertFalse(
                inMemoryCatalog.isRunTruncateTable(),
                "Should not truncate data for recreated table");
    }

    @Test
    public void handlesErrorWhenSchemaNotExist() {
        DefaultSaveModeHandler handler =
                createHandler(
                        SchemaSaveMode.ERROR_WHEN_SCHEMA_NOT_EXIST,
                        DataSaveMode.APPEND_DATA,
                        mockCatalogWithoutTable(),
                        createCatalogTable("notExistsTable"));
        assertThrows(SeaTunnelRuntimeException.class, handler::handleSchemaSaveModeWithRestore);
    }

    @Test
    public void createsSchemaWhenNotExist() {
        Catalog catalog = mockCatalogWithoutTable();
        DefaultSaveModeHandler handler =
                createHandler(
                        SchemaSaveMode.CREATE_SCHEMA_WHEN_NOT_EXIST,
                        DataSaveMode.APPEND_DATA,
                        catalog,
                        createCatalogTable("notExistsTable"));
        handler.handleSchemaSaveModeWithRestore();
        verify(catalog, times(1))
                .createTable(any(TablePath.class), any(CatalogTable.class), eq(true));
    }

    @Test
    public void recreatesSchemaWhenNotExist() {
        Catalog catalog = mockCatalogWithoutTable();
        DefaultSaveModeHandler handler =
                createHandler(
                        SchemaSaveMode.RECREATE_SCHEMA,
                        DataSaveMode.APPEND_DATA,
                        catalog,
                        createCatalogTable("notExistsTable"));
        handler.handleSchemaSaveModeWithRestore();
        verify(catalog, times(1))
                .createTable(any(TablePath.class), any(CatalogTable.class), eq(true));
    }

    /**
     * Runs the schema step and then the data step with {@link DataSaveMode#DROP_DATA} against a
     * fresh in-memory catalog and returns that catalog so the caller can inspect its flags.
     */
    private InMemoryCatalog runDropDataSaveMode(SchemaSaveMode schemaSaveMode, String tableName) {
        CatalogTable catalogTable = createCatalogTable(tableName);
        Catalog catalog = catalogFactory.createCatalog("test", null);
        DefaultSaveModeHandler handler =
                createHandler(schemaSaveMode, DataSaveMode.DROP_DATA, catalog, catalogTable);
        handler.handleSchemaSaveMode();
        handler.handleDataSaveMode();
        return (InMemoryCatalog) catalog;
    }

    /** Creates a mocked catalog whose {@code tableExists} answers {@code false} for any path. */
    private static Catalog mockCatalogWithoutTable() {
        Catalog catalog = mock(Catalog.class);
        when(catalog.tableExists(any(TablePath.class))).thenReturn(false);
        return catalog;
    }

    /** Builds a catalog table named {@code tableName} in database "st", schema "public". */
    private CatalogTable createCatalogTable(String tableName) {
        return CatalogTableUtil.getCatalogTable("", "st", "public", tableName, rowType);
    }

    /** Builds the handler under test; the trailing custom-SQL argument is always {@code null}. */
    private DefaultSaveModeHandler createHandler(
            SchemaSaveMode schemaSaveMode,
            DataSaveMode dataSaveMode,
            Catalog catalog,
            CatalogTable catalogTable) {
        return new DefaultSaveModeHandler(
                schemaSaveMode, dataSaveMode, catalog, catalogTable, null);
    }
}

================================================
FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/sink/TablePlaceholderProcessorTest.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.api.sink;

import org.apache.seatunnel.api.configuration.Option;
import org.apache.seatunnel.api.configuration.Options;
import org.apache.seatunnel.api.configuration.ReadonlyConfig;
import org.apache.seatunnel.api.table.catalog.CatalogTable;
import org.apache.seatunnel.api.table.catalog.ConstraintKey;
import org.apache.seatunnel.api.table.catalog.PhysicalColumn;
import org.apache.seatunnel.api.table.catalog.PrimaryKey;
import org.apache.seatunnel.api.table.catalog.TableIdentifier;
import org.apache.seatunnel.api.table.catalog.TableSchema;
import org.apache.seatunnel.api.table.type.BasicType;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Tests {@link TablePlaceholderProcessor#replaceTablePlaceholder} against a config whose values
 * are built from {@code ${...}} placeholders (see {@link #createConfig()}).
 */
public class TablePlaceholderProcessorTest {
    private static final Option<String> DATABASE =
            Options.key("database").stringType().noDefaultValue();
    private static final Option<String> SCHEMA =
            Options.key("schema").stringType().noDefaultValue();
    private static final Option<String> TABLE = Options.key("table").stringType().noDefaultValue();
    private static final Option<String> PRIMARY_KEY =
            Options.key("primary_key").stringType().noDefaultValue();
    private static final Option<List<String>> PRIMARY_KEY_ARRAY =
            Options.key("primary_key_array").listType(String.class).noDefaultValue();
    private static final Option<String> UNIQUE_KEY =
            Options.key("unique_key").stringType().noDefaultValue();
    private static final Option<List<String>> UNIQUE_KEY_ARRAY =
            Options.key("unique_key_array").listType(String.class).noDefaultValue();
    private static final Option<String> FIELD_NAMES =
            Options.key("field_names").stringType().noDefaultValue();
    private static final Option<List<String>> FIELD_NAMES_ARRAY =
            Options.key("field_names_array").listType(String.class).noDefaultValue();
    private static final Option<String> PARTITION_KEYS =
            Options.key("partition_keys").stringType().noDefaultValue();
    private static final Option<List<String>> PARTITION_KEYS_ARRAY =
            Options.key("partition_keys_array").listType(String.class).noDefaultValue();

    @Test
    public void testSinkOptions() {
        ReadonlyConfig config = createConfig();
        CatalogTable table = createTestTable();
        ReadonlyConfig newConfig = TablePlaceholderProcessor.replaceTablePlaceholder(config, table);

        assertTablePath(
                newConfig, "xyz_my-database_test", "xyz_my-schema_test", "xyz_my-table_test");
        assertKeyPlaceholders(newConfig);
        assertPartitionKeyPlaceholders(newConfig);
    }

    @Test
    public void testPartitionKeysPlaceholderWithEmptyPartitionKeys() {
        ReadonlyConfig config = createConfig();
        CatalogTable table = createTestTable();
        table.getPartitionKeys().clear();
        ReadonlyConfig newConfig = TablePlaceholderProcessor.replaceTablePlaceholder(config, table);

        // With no partition keys the placeholder is expected to stay unresolved.
        Assertions.assertEquals("${partition_keys}", newConfig.get(PARTITION_KEYS));
        Assertions.assertEquals(
                Arrays.asList("${partition_keys}"), newConfig.get(PARTITION_KEYS_ARRAY));
    }

    @Test
    public void testSinkOptionsWithNoTablePath() {
        ReadonlyConfig config = createConfig();
        CatalogTable table = createTestTableWithNoDatabaseAndSchemaName();
        ReadonlyConfig newConfig = TablePlaceholderProcessor.replaceTablePlaceholder(config, table);

        // Database and schema are null on the table, so the inline placeholder defaults apply.
        assertTablePath(
                newConfig,
                "xyz_default_db_test",
                "xyz_default_schema_test",
                "xyz_default_table_test");
        assertKeyPlaceholders(newConfig);
        assertPartitionKeyPlaceholders(newConfig);
    }

    @Test
    public void testSinkOptionsWithExcludeKeys() {
        ReadonlyConfig config = createConfig();
        CatalogTable table = createTestTableWithNoDatabaseAndSchemaName();
        ReadonlyConfig newConfig =
                TablePlaceholderProcessor.replaceTablePlaceholder(
                        config, table, Arrays.asList(DATABASE.key()));

        // The excluded "database" option must keep its raw placeholder text.
        assertTablePath(
                newConfig,
                "xyz_${database_name: default_db}_test",
                "xyz_default_schema_test",
                "xyz_default_table_test");
        assertKeyPlaceholders(newConfig);
    }

    @Test
    public void testSinkOptionsWithMultiTable() {
        ReadonlyConfig config = createConfig();
        CatalogTable table1 = createTestTable();
        CatalogTable table2 = createTestTableWithNoDatabaseAndSchemaName();
        // The same source config is resolved twice to check that results do not leak between
        // tables.
        ReadonlyConfig newConfig1 =
                TablePlaceholderProcessor.replaceTablePlaceholder(config, table1, Arrays.asList());
        ReadonlyConfig newConfig2 =
                TablePlaceholderProcessor.replaceTablePlaceholder(config, table2, Arrays.asList());

        assertTablePath(
                newConfig1, "xyz_my-database_test", "xyz_my-schema_test", "xyz_my-table_test");
        assertKeyPlaceholders(newConfig1);
        assertPartitionKeyPlaceholders(newConfig1);

        assertTablePath(
                newConfig2,
                "xyz_default_db_test",
                "xyz_default_schema_test",
                "xyz_default_table_test");
        assertKeyPlaceholders(newConfig2);
        assertPartitionKeyPlaceholders(newConfig2);
    }

    /** Asserts the resolved values of the database, schema and table options. */
    private static void assertTablePath(
            ReadonlyConfig config, String database, String schema, String table) {
        Assertions.assertEquals(database, config.get(DATABASE));
        Assertions.assertEquals(schema, config.get(SCHEMA));
        Assertions.assertEquals(table, config.get(TABLE));
    }

    /** Asserts primary-key, unique-key and field-name placeholders in string and list form. */
    private static void assertKeyPlaceholders(ReadonlyConfig config) {
        Assertions.assertEquals("f1,f2", config.get(PRIMARY_KEY));
        Assertions.assertEquals("f3,f4", config.get(UNIQUE_KEY));
        Assertions.assertEquals("f1,f2,f3,f4,f5", config.get(FIELD_NAMES));
        Assertions.assertEquals(Arrays.asList("f1", "f2"), config.get(PRIMARY_KEY_ARRAY));
        Assertions.assertEquals(Arrays.asList("f3", "f4"), config.get(UNIQUE_KEY_ARRAY));
        Assertions.assertEquals(
                Arrays.asList("f1", "f2", "f3", "f4", "f5"), config.get(FIELD_NAMES_ARRAY));
    }

    /** Asserts the partition-key placeholders in string and list form. */
    private static void assertPartitionKeyPlaceholders(ReadonlyConfig config) {
        Assertions.assertEquals("bucket(f1, 16),dt", config.get(PARTITION_KEYS));
        Assertions.assertEquals(
                Arrays.asList("bucket(f1, 16)", "dt"), config.get(PARTITION_KEYS_ARRAY));
    }

    /** Builds the placeholder-only sink config shared by every test. */
    private static ReadonlyConfig createConfig() {
        Map<String, Object> configMap = new HashMap<>();
        configMap.put(DATABASE.key(), "xyz_${database_name: default_db}_test");
        configMap.put(SCHEMA.key(), "xyz_${schema_name: default_schema}_test");
        configMap.put(TABLE.key(), "xyz_${table_name: default_table}_test");
        configMap.put(PRIMARY_KEY.key(), "${primary_key}");
        configMap.put(UNIQUE_KEY.key(), "${unique_key}");
        configMap.put(FIELD_NAMES.key(), "${field_names}");
        configMap.put(PARTITION_KEYS.key(), "${partition_keys}");
        configMap.put(PRIMARY_KEY_ARRAY.key(), Arrays.asList("${primary_key}"));
        configMap.put(UNIQUE_KEY_ARRAY.key(), Arrays.asList("${unique_key}"));
        configMap.put(FIELD_NAMES_ARRAY.key(), Arrays.asList("${field_names}"));
        configMap.put(PARTITION_KEYS_ARRAY.key(), Arrays.asList("${partition_keys}"));
        return ReadonlyConfig.fromMap(configMap);
    }

    /** Same table as {@link #createTestTable()} but with null database and schema names. */
    private static CatalogTable createTestTableWithNoDatabaseAndSchemaName() {
        TableIdentifier tableId = TableIdentifier.of("my-catalog", null, null, "default_table");
        return CatalogTable.of(
                tableId,
                createTestSchema(),
                Collections.emptyMap(),
                Arrays.asList("bucket(f1, 16)", "dt"),
                null);
    }

    /** Builds a fully qualified table with two partition keys and the shared test schema. */
    private static CatalogTable createTestTable() {
        TableIdentifier tableId =
                TableIdentifier.of("my-catalog", "my-database", "my-schema", "my-table");
        return CatalogTable.of(
                tableId,
                createTestSchema(),
                Collections.emptyMap(),
                Arrays.asList("bucket(f1, 16)", "dt"),
                null);
    }

    /** Schema used by both test tables: PK (f1, f2), unique key (f3, f4), columns f1..f5. */
    private static TableSchema createTestSchema() {
        return TableSchema.builder()
                .primaryKey(PrimaryKey.of("my-pk", Arrays.asList("f1", "f2")))
                .constraintKey(
                        ConstraintKey.of(
                                ConstraintKey.ConstraintType.UNIQUE_KEY,
                                "my-uk",
                                Arrays.asList(
                                        ConstraintKey.ConstraintKeyColumn.of(
                                                "f3", ConstraintKey.ColumnSortType.ASC),
                                        ConstraintKey.ConstraintKeyColumn.of(
                                                "f4", ConstraintKey.ColumnSortType.ASC))))
                .column(stringColumn("f1"))
                .column(stringColumn("f2"))
                .column(stringColumn("f3"))
                .column(stringColumn("f4"))
                .column(stringColumn("f5"))
                .build();
    }

    /** Builds a physical column of string type with the given name. */
    private static PhysicalColumn stringColumn(String name) {
        return PhysicalColumn.builder().name(name).dataType(BasicType.STRING_TYPE).build();
    }
}
================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/sink/multitablesink/MultiTableSinkAggregatedCommitterTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.sink.multitablesink; import org.apache.seatunnel.api.sink.SinkAggregatedCommitter; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; public class MultiTableSinkAggregatedCommitterTest { @Test void testInitBeInvoked() throws IOException { Map> aggCommitters = new HashMap<>(); List methodInvoked = new ArrayList<>(); aggCommitters.put( "table1", new SinkAggregatedCommitter() { @Override public void init() { methodInvoked.add("init"); } @Override public List commit(List aggregatedCommitInfo) throws IOException { return Collections.emptyList(); } @Override public Object combine(List commitInfos) { return null; } @Override public void abort(List aggregatedCommitInfo) throws Exception {} @Override public void close() throws IOException { 
methodInvoked.add("close"); } }); MultiTableSinkAggregatedCommitter committer = new MultiTableSinkAggregatedCommitter(aggCommitters); committer.init(); committer.close(); Assertions.assertIterableEquals(Arrays.asList("init", "close"), methodInvoked); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/sink/multitablesink/MultiTableSinkCommitterTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.sink.multitablesink; import org.apache.seatunnel.api.sink.SinkCommitter; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; class MultiTableSinkCommitterTest { @Test void testRouteByTableIdentifierForCommitAndAbort() throws IOException { String table1 = "catalog.db.table1"; String table2 = "catalog.db.table2"; RecordingSinkCommitter table1Committer = new RecordingSinkCommitter(); RecordingSinkCommitter table2Committer = new RecordingSinkCommitter(); Map> sinkCommitters = new HashMap<>(); sinkCommitters.put(table1, table1Committer); sinkCommitters.put(table2, table2Committer); MultiTableSinkCommitter multiTableSinkCommitter = new MultiTableSinkCommitter(sinkCommitters); MultiTableCommitInfo commitInfo1 = new MultiTableCommitInfo(new ConcurrentHashMap<>()); commitInfo1.getCommitInfo().put(SinkIdentifier.of(table1, 0), "t1-c0"); commitInfo1.getCommitInfo().put(SinkIdentifier.of(table2, 0), "t2-c0"); MultiTableCommitInfo commitInfo2 = new MultiTableCommitInfo(new ConcurrentHashMap<>()); commitInfo2.getCommitInfo().put(SinkIdentifier.of(table1, 1), "t1-c1"); commitInfo2.getCommitInfo().put(SinkIdentifier.of(table2, 1), "t2-c1"); List allCommitInfos = Arrays.asList(commitInfo1, commitInfo2); multiTableSinkCommitter.commit(allCommitInfos); Assertions.assertIterableEquals(Arrays.asList("t1-c0", "t1-c1"), table1Committer.committed); Assertions.assertIterableEquals(Arrays.asList("t2-c0", "t2-c1"), table2Committer.committed); multiTableSinkCommitter.abort(allCommitInfos); Assertions.assertIterableEquals(Arrays.asList("t1-c0", "t1-c1"), table1Committer.aborted); Assertions.assertIterableEquals(Arrays.asList("t2-c0", "t2-c1"), table2Committer.aborted); } private static class RecordingSinkCommitter implements 
SinkCommitter { private List committed = Collections.emptyList(); private List aborted = Collections.emptyList(); @Override public List commit(List commitInfos) { this.committed = commitInfos; return Collections.emptyList(); } @Override public void abort(List commitInfos) { this.aborted = commitInfos; } } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/sink/multitablesink/MultiTableSinkWriterTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.sink.multitablesink; import org.apache.seatunnel.api.common.metrics.MetricsContext; import org.apache.seatunnel.api.event.DefaultEventProcessor; import org.apache.seatunnel.api.event.EventListener; import org.apache.seatunnel.api.serialization.DefaultSerializer; import org.apache.seatunnel.api.sink.SinkWriter; import org.apache.seatunnel.api.sink.SupportMultiTableSinkWriter; import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.api.table.type.SeaTunnelRow; import org.junit.jupiter.api.Test; import lombok.AllArgsConstructor; import lombok.Data; import java.io.IOException; import java.io.Serializable; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; public class MultiTableSinkWriterTest { @Test public void testPrepareCommitState() throws IOException { int threads = 50; Map> sinkWriters = new HashMap<>(); Map sinkWritersContext = new HashMap<>(); for (int i = 0; i < threads; i++) { sinkWriters.put( SinkIdentifier.of(TablePath.DEFAULT.toString(), i), new TestSinkWriter()); sinkWritersContext.put( SinkIdentifier.of(TablePath.DEFAULT.toString(), i), new TestSinkWriterContext()); } MultiTableSinkWriter multiTableSinkWriter = new MultiTableSinkWriter(sinkWriters, threads, sinkWritersContext); DefaultSerializer defaultSerializer = new DefaultSerializer<>(); for (int i = 0; i < 100; i++) { byte[] bytes = defaultSerializer.serialize(multiTableSinkWriter.prepareCommit(i).get()); defaultSerializer.deserialize(bytes); } } static class TestSinkWriter implements SinkWriter, SupportMultiTableSinkWriter { @Override public void write(SeaTunnelRow seaTunnelRow) {} @Override public Optional prepareCommit() throws IOException { return Optional.of(new TestSinkState("test")); } @Override public List snapshotState(long checkpointId) throws IOException { return SinkWriter.super.snapshotState(checkpointId); } @Override public void abortPrepare() {} @Override public void close() 
throws IOException {} } static class TestSinkWriterContext implements SinkWriter.Context { @Override public int getIndexOfSubtask() { return 0; } @Override public MetricsContext getMetricsContext() { return null; } @Override public EventListener getEventListener() { return new DefaultEventProcessor(); } } @Data @AllArgsConstructor static class TestSinkState implements Serializable { private String state; } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/table/catalog/CatalogTableTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.table.catalog; import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.api.options.ConnectorCommonOptions; import org.apache.seatunnel.api.table.factory.TableSinkFactoryContext; import org.apache.seatunnel.api.table.factory.TableTransformFactoryContext; import org.apache.seatunnel.api.table.type.BasicType; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; import org.apache.seatunnel.common.utils.SeaTunnelException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; public class CatalogTableTest { @Test public void testCatalogTableModifyOptionsOrPartitionKeys() { CatalogTable catalogTable = CatalogTable.of( TableIdentifier.of("catalog", "database", "table"), TableSchema.builder().build(), Collections.emptyMap(), Collections.emptyList(), "comment"); catalogTable.getOptions().put("test", "value"); catalogTable.getPartitionKeys().add("test"); } @Test public void testReadCatalogTableWithUnsupportedType() { Catalog catalog = new InMemoryCatalogFactory() .createCatalog("InMemory", ReadonlyConfig.fromMap(new HashMap<>())); SeaTunnelRuntimeException exception = Assertions.assertThrows( SeaTunnelRuntimeException.class, () -> catalog.getTables( ReadonlyConfig.fromMap( new HashMap() { { put( ConnectorCommonOptions.TABLE_NAMES .key(), Arrays.asList( "unsupported.public.table1", "unsupported.public.table2")); } }))); Assertions.assertEquals( "ErrorCode:[COMMON-21], ErrorDescription:['InMemory' tables unsupported get catalog table," + "the corresponding field types in the following tables are not supported: '{\"unsupported.public.table1\"" + ":{\"field1\":\"interval\",\"field2\":\"interval2\"},\"unsupported.public.table2\":{\"field1\":\"interval\"," + "\"field2\":\"interval2\"}}']", exception.getMessage()); Map> result = new 
LinkedHashMap<>(); result.put( "unsupported.public.table1", new HashMap() { { put("field1", "interval"); put("field2", "interval2"); } }); result.put( "unsupported.public.table2", new HashMap() { { put("field1", "interval"); put("field2", "interval2"); } }); Assertions.assertEquals(result, exception.getParamsValueAs("tableUnsupportedTypes")); } @Test public void testCatalogTableWithIllegalFieldNames() { CatalogTable catalogTable = CatalogTable.of( TableIdentifier.of("catalog", "database", "table"), TableSchema.builder() .column( PhysicalColumn.of( " ", BasicType.STRING_TYPE, 1L, true, null, "")) .build(), Collections.emptyMap(), Collections.emptyList(), "comment"); SeaTunnelException exception = Assertions.assertThrows( SeaTunnelException.class, () -> new TableTransformFactoryContext( Collections.singletonList(catalogTable), null, null)); SeaTunnelException exception2 = Assertions.assertThrows( SeaTunnelException.class, () -> new TableSinkFactoryContext(catalogTable, null, null)); Assertions.assertEquals( "Table database.table field name cannot be empty", exception.getMessage()); Assertions.assertEquals( "Table database.table field name cannot be empty", exception2.getMessage()); CatalogTable catalogTable2 = CatalogTable.of( TableIdentifier.of("catalog", "database", "table"), TableSchema.builder() .column( PhysicalColumn.of( "name1", BasicType.STRING_TYPE, 1L, true, null, "")) .column( PhysicalColumn.of( "name1", BasicType.STRING_TYPE, 1L, true, null, "")) .build(), Collections.emptyMap(), Collections.emptyList(), "comment"); SeaTunnelException exception3 = Assertions.assertThrows( SeaTunnelException.class, () -> new TableTransformFactoryContext( Collections.singletonList(catalogTable2), null, null)); SeaTunnelException exception4 = Assertions.assertThrows( SeaTunnelException.class, () -> new TableSinkFactoryContext(catalogTable2, null, null)); Assertions.assertEquals( "Table database.table field name1 duplicate", exception3.getMessage()); Assertions.assertEquals( 
"Table database.table field name1 duplicate", exception4.getMessage()); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/table/catalog/CatalogTableUtilTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.table.catalog; import org.apache.seatunnel.shade.com.typesafe.config.Config; import org.apache.seatunnel.shade.com.typesafe.config.ConfigFactory; import org.apache.seatunnel.shade.com.typesafe.config.ConfigValueFactory; import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.api.options.ConnectorCommonOptions; import org.apache.seatunnel.api.table.type.ArrayType; import org.apache.seatunnel.api.table.type.BasicType; import org.apache.seatunnel.api.table.type.DecimalType; import org.apache.seatunnel.api.table.type.MapType; import org.apache.seatunnel.api.table.type.PrimitiveByteArrayType; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; import org.apache.seatunnel.api.table.type.SqlType; import org.apache.seatunnel.common.utils.SeaTunnelException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileNotFoundException; import java.net.URISyntaxException; import java.net.URL; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import static org.apache.seatunnel.common.constants.CollectionConstants.PLUGIN_NAME; public class CatalogTableUtilTest { @Test public void testSimpleSchemaParse() throws FileNotFoundException, URISyntaxException { String path = getTestConfigFile("/conf/simple.schema.conf"); Config config = ConfigFactory.parseFile(new File(path)); SeaTunnelRowType seaTunnelRowType = CatalogTableUtil.buildWithConfig(config).getSeaTunnelRowType(); Assertions.assertNotNull(seaTunnelRowType); Assertions.assertEquals(seaTunnelRowType.getFieldType(1), ArrayType.BYTE_ARRAY_TYPE); Assertions.assertEquals(seaTunnelRowType.getFieldType(2), BasicType.STRING_TYPE); Assertions.assertEquals(seaTunnelRowType.getFieldType(10), new DecimalType(30, 8)); Assertions.assertEquals(seaTunnelRowType.getFieldType(11), 
BasicType.VOID_TYPE); Assertions.assertEquals(seaTunnelRowType.getFieldType(12), PrimitiveByteArrayType.INSTANCE); } @Test public void testComplexSchemaParse() throws FileNotFoundException, URISyntaxException { String path = getTestConfigFile("/conf/complex.schema.conf"); Config config = ConfigFactory.parseFile(new File(path)); SeaTunnelRowType seaTunnelRowType = CatalogTableUtil.buildWithConfig(config).getSeaTunnelRowType(); Assertions.assertNotNull(seaTunnelRowType); Assertions.assertEquals( seaTunnelRowType.getFieldType(0), new MapType<>( BasicType.STRING_TYPE, new MapType<>(BasicType.STRING_TYPE, BasicType.STRING_TYPE))); Assertions.assertEquals( seaTunnelRowType.getFieldType(1), new MapType<>( BasicType.STRING_TYPE, new MapType<>(BasicType.STRING_TYPE, ArrayType.INT_ARRAY_TYPE))); Assertions.assertEquals(seaTunnelRowType.getTotalFields(), 18); Assertions.assertEquals(seaTunnelRowType.getFieldType(17).getSqlType(), SqlType.ROW); SeaTunnelRowType nestedRowFieldType = (SeaTunnelRowType) seaTunnelRowType.getFieldType(17); Assertions.assertEquals( "map", nestedRowFieldType.getFieldName(nestedRowFieldType.indexOf("map"))); Assertions.assertEquals( "row", nestedRowFieldType.getFieldName(nestedRowFieldType.indexOf("row"))); } @Test public void testSpecialSchemaParse() throws FileNotFoundException, URISyntaxException { String path = getTestConfigFile("/conf/config_special_schema.conf"); Config config = ConfigFactory.parseFile(new File(path)); SeaTunnelRowType seaTunnelRowType = CatalogTableUtil.buildWithConfig(config).getSeaTunnelRowType(); Assertions.assertEquals(seaTunnelRowType.getTotalFields(), 12); Assertions.assertEquals(seaTunnelRowType.getFieldType(5).getSqlType(), SqlType.BYTES); Assertions.assertEquals(seaTunnelRowType.getFieldName(6), "t.date"); } @Test public void testCatalogUtilGetCatalogTable() throws FileNotFoundException, URISyntaxException { String path = getTestConfigFile("/conf/getCatalogTable.conf"); Config config = ConfigFactory.parseFile(new 
File(path)); Config source = config.getConfigList("source").get(0); ReadonlyConfig sourceReadonlyConfig = ReadonlyConfig.fromConfig(source); List catalogTables = CatalogTableUtil.getCatalogTables( sourceReadonlyConfig, Thread.currentThread().getContextClassLoader()); Assertions.assertEquals(2, catalogTables.size()); Assertions.assertEquals( TableIdentifier.of("InMemory", TablePath.of("st.public.table1")), catalogTables.get(0).getTableId()); Assertions.assertEquals( TableIdentifier.of("InMemory", TablePath.of("st.public.table2")), catalogTables.get(1).getTableId()); // test empty tables Config emptyTableSource = source.withValue( ConnectorCommonOptions.TABLE_NAMES.key(), ConfigValueFactory.fromIterable(new ArrayList<>())); ReadonlyConfig emptyReadonlyConfig = ReadonlyConfig.fromConfig(emptyTableSource); Assertions.assertThrows( SeaTunnelException.class, () -> CatalogTableUtil.getCatalogTables( emptyReadonlyConfig, Thread.currentThread().getContextClassLoader())); // test unknown catalog Config cannotFindCatalogSource = source.withValue(PLUGIN_NAME, ConfigValueFactory.fromAnyRef("unknownCatalog")); ReadonlyConfig cannotFindCatalogReadonlyConfig = ReadonlyConfig.fromConfig(cannotFindCatalogSource); Assertions.assertThrows( SeaTunnelException.class, () -> CatalogTableUtil.getCatalogTables( cannotFindCatalogReadonlyConfig, Thread.currentThread().getContextClassLoader())); } @Test public void testDefaultTablePath() throws FileNotFoundException, URISyntaxException { String path = getTestConfigFile("/conf/default_tablepath.conf"); Config config = ConfigFactory.parseFile(new File(path)); Config source = config.getConfigList("source").get(0); ReadonlyConfig sourceReadonlyConfig = ReadonlyConfig.fromConfig(source); CatalogTable catalogTable = CatalogTableUtil.buildWithConfig(sourceReadonlyConfig); Assertions.assertEquals( TablePath.DEFAULT.getDatabaseName(), catalogTable.getTablePath().getDatabaseName()); Assertions.assertEquals( TablePath.DEFAULT.getSchemaName(), 
catalogTable.getTablePath().getSchemaName()); Assertions.assertEquals( TablePath.DEFAULT.getTableName(), catalogTable.getTablePath().getTableName()); } @Test public void testGenericRowSchemaTest() throws FileNotFoundException, URISyntaxException { String path = getTestConfigFile("/conf/generic_row.schema.conf"); Config config = ConfigFactory.parseFile(new File(path)); SeaTunnelRowType seaTunnelRowType = CatalogTableUtil.buildWithConfig(config).getSeaTunnelRowType(); Assertions.assertNotNull(seaTunnelRowType); Assertions.assertArrayEquals( new String[] {"map0", "map1"}, seaTunnelRowType.getFieldNames()); MapType mapType0 = (MapType) seaTunnelRowType.getFieldType(0); MapType mapType1 = (MapType) seaTunnelRowType.getFieldType(1); Assertions.assertNotNull(mapType0); Assertions.assertNotNull(mapType1); Assertions.assertEquals(BasicType.STRING_TYPE, mapType0.getKeyType()); SeaTunnelRowType expectedVal = new SeaTunnelRowType( new String[] {"c_int", "c_string", "c_row"}, new SeaTunnelDataType[] { BasicType.INT_TYPE, BasicType.STRING_TYPE, new SeaTunnelRowType( new String[] {"c_int"}, new SeaTunnelDataType[] {BasicType.INT_TYPE}) }); SeaTunnelRowType mapType0ValType = (SeaTunnelRowType) ((SeaTunnelDataType) mapType0.getValueType()); Assertions.assertEquals(expectedVal, mapType0ValType); SeaTunnelRowType mapType1ValType = (SeaTunnelRowType) ((SeaTunnelDataType) mapType1.getValueType()); Assertions.assertEquals(expectedVal, mapType1ValType); } @Test public void testPartitionKeysInSchemaConfig() throws FileNotFoundException, URISyntaxException { String path = getTestConfigFile("/conf/partition_keys.schema.conf"); Config config = ConfigFactory.parseFile(new File(path)); CatalogTable catalogTable = CatalogTableUtil.buildWithConfig(config); Assertions.assertEquals( Arrays.asList("bucket(id, 16)", "dt"), catalogTable.getPartitionKeys()); } public static String getTestConfigFile(String configFile) throws FileNotFoundException, URISyntaxException { URL resource = 
CatalogTableUtilTest.class.getResource(configFile); if (resource == null) { throw new FileNotFoundException("Can't find config file: " + configFile); } return Paths.get(resource.toURI()).toString(); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/table/catalog/InMemoryCatalog.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.table.catalog; import org.apache.seatunnel.shade.com.google.common.collect.Lists; import org.apache.seatunnel.shade.org.apache.commons.lang3.tuple.Pair; import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.api.table.catalog.exception.CatalogException; import org.apache.seatunnel.api.table.catalog.exception.DatabaseAlreadyExistException; import org.apache.seatunnel.api.table.catalog.exception.DatabaseNotExistException; import org.apache.seatunnel.api.table.catalog.exception.TableAlreadyExistException; import org.apache.seatunnel.api.table.catalog.exception.TableNotExistException; import org.apache.seatunnel.api.table.type.BasicType; import org.apache.seatunnel.api.table.type.LocalTimeType; import org.apache.seatunnel.common.exception.CommonError; import lombok.Getter; import lombok.extern.slf4j.Slf4j; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @Slf4j public class InMemoryCatalog implements Catalog { private final ReadonlyConfig options; private final String name; // database -> tables private final Map> catalogTables; private static final String DEFAULT_DATABASE = "default"; private static final String UNSUPPORTED_DATABASE = "unsupported"; @Getter private boolean isRunTruncateTable = false; InMemoryCatalog(String catalogName, ReadonlyConfig options) { this.name = catalogName; this.options = options; this.catalogTables = new HashMap<>(); addDefaultTable(); } // Add some default table for testing private void addDefaultTable() { this.catalogTables.put(DEFAULT_DATABASE, new ArrayList<>()); this.catalogTables.put(UNSUPPORTED_DATABASE, new ArrayList<>()); List tables = new ArrayList<>(); this.catalogTables.put("st", tables); TableSchema tableSchema = TableSchema.builder() .column(PhysicalColumn.of("id", BasicType.LONG_TYPE, 22, false, null, "id")) .column( PhysicalColumn.of( "name", 
BasicType.STRING_TYPE, 128, false, null, "name")) .column( PhysicalColumn.of( "age", BasicType.INT_TYPE, (Long) null, true, null, "age")) .column( PhysicalColumn.of( "createTime", LocalTimeType.LOCAL_DATE_TIME_TYPE, 3, true, null, "createTime")) .column( PhysicalColumn.of( "lastUpdateTime", LocalTimeType.LOCAL_DATE_TIME_TYPE, 3, true, null, "lastUpdateTime")) .primaryKey(PrimaryKey.of("id", Lists.newArrayList("id"))) .constraintKey( ConstraintKey.of( ConstraintKey.ConstraintType.INDEX_KEY, "name", Lists.newArrayList( ConstraintKey.ConstraintKeyColumn.of( "name", null)))) .build(); CatalogTable catalogTable1 = CatalogTable.of( TableIdentifier.of(name, TablePath.of("st", "public", "table1")), TableSchema.builder().build(), new HashMap<>(), new ArrayList<>(), "In Memory Table"); CatalogTable catalogTable2 = CatalogTable.of( TableIdentifier.of(name, TablePath.of("st", "public", "table2")), TableSchema.builder().build(), new HashMap<>(), new ArrayList<>(), "In Memory Table", name); tables.add(catalogTable1); tables.add(catalogTable2); CatalogTable unsupportedTable1 = CatalogTable.of( TableIdentifier.of( name, TablePath.of(UNSUPPORTED_DATABASE, "public", "table1")), tableSchema, new HashMap<>(), new ArrayList<>(), "In Memory Table"); CatalogTable unsupportedTable2 = CatalogTable.of( TableIdentifier.of( name, TablePath.of(UNSUPPORTED_DATABASE, "public", "table2")), tableSchema, new HashMap<>(), new ArrayList<>(), "In Memory Table", name); this.catalogTables.get(UNSUPPORTED_DATABASE).add(unsupportedTable1); this.catalogTables.get(UNSUPPORTED_DATABASE).add(unsupportedTable2); } @Override public void open() throws CatalogException { String username = options.get(InMemoryCatalogOptionRule.username); String password = options.get(InMemoryCatalogOptionRule.password); String host = options.get(InMemoryCatalogOptionRule.host); int port = options.get(InMemoryCatalogOptionRule.port); log.trace( String.format( "InMemoryCatalog %s opening with %s/%s in %s:%s", name, username, 
password, host, port)); } @Override public void close() throws CatalogException { log.trace(String.format("InMemoryCatalog %s closing", name)); } @Override public String name() { return "InMemory"; } @Override public String getDefaultDatabase() throws CatalogException { return DEFAULT_DATABASE; } @Override public void truncateTable(TablePath tablePath, boolean ignoreIfNotExists) throws TableNotExistException, CatalogException { isRunTruncateTable = true; } @Override public boolean databaseExists(String databaseName) throws CatalogException { return catalogTables.containsKey(databaseName); } @Override public List listDatabases() throws CatalogException { return new ArrayList<>(catalogTables.keySet()); } @Override public List listTables(String databaseName) throws CatalogException, DatabaseNotExistException { return catalogTables.get(databaseName).stream() .map( table -> table.getTableId().getSchemaName() + "." + table.getTableId().getTableName()) .collect(Collectors.toList()); } @Override public boolean tableExists(TablePath tablePath) throws CatalogException { if (catalogTables.containsKey(tablePath.getDatabaseName())) { List tables = catalogTables.get(tablePath.getDatabaseName()); return tables.stream().anyMatch(t -> t.getTableId().toTablePath().equals(tablePath)); } return false; } @Override public CatalogTable getTable(TablePath tablePath) throws CatalogException, TableNotExistException { if (catalogTables.containsKey(tablePath.getDatabaseName())) { if (tablePath.getDatabaseName().equals(UNSUPPORTED_DATABASE)) { List> unsupportedFields = Arrays.asList( Pair.of("field1", "interval"), Pair.of("field2", "interval2")); buildColumnsWithErrorCheck( tablePath, new TableSchema.Builder(), unsupportedFields.iterator(), field -> { throw CommonError.convertToSeaTunnelTypeError( name(), field.getValue(), field.getKey()); }); } List tables = catalogTables.get(tablePath.getDatabaseName()); return tables.stream() .filter(t -> t.getTableId().toTablePath().equals(tablePath)) 
.findFirst() .orElseThrow(() -> new TableNotExistException(name, tablePath)); } else { throw new TableNotExistException(name, tablePath); } } @Override public void createTable(TablePath tablePath, CatalogTable table, boolean ignoreIfExists) throws TableAlreadyExistException, DatabaseNotExistException, CatalogException { if (catalogTables.containsKey(tablePath.getDatabaseName())) { List tables = catalogTables.get(tablePath.getDatabaseName()); if (tables.stream().anyMatch(t -> t.getTableId().toTablePath().equals(tablePath))) { if (ignoreIfExists) { log.debug("Table {} already exists, ignore", tablePath.getFullName()); } else { throw new TableAlreadyExistException(name, tablePath); } } else { tables.add(table); } } else { throw new DatabaseNotExistException(name, tablePath.getDatabaseName()); } } @Override public void dropTable(TablePath tablePath, boolean ignoreIfNotExists) throws TableNotExistException, CatalogException { if (catalogTables.containsKey(tablePath.getDatabaseName())) { List tables = catalogTables.get(tablePath.getDatabaseName()); if (tables.stream().anyMatch(t -> t.getTableId().toTablePath().equals(tablePath))) { tables.removeIf(t -> t.getTableId().toTablePath().equals(tablePath)); } else { if (ignoreIfNotExists) { log.debug("Table {} not exists, ignore", tablePath.getFullName()); } else { throw new TableNotExistException(name, tablePath); } } } else { throw new DatabaseNotExistException(name, tablePath.getDatabaseName()); } } @Override public void createDatabase(TablePath tablePath, boolean ignoreIfExists) throws DatabaseAlreadyExistException, CatalogException { if (catalogTables.containsKey(tablePath.getDatabaseName())) { if (ignoreIfExists) { log.debug("Database {} already exists, ignore", tablePath.getDatabaseName()); } else { throw new DatabaseAlreadyExistException(name, tablePath.getDatabaseName()); } } else { catalogTables.put(tablePath.getDatabaseName(), new ArrayList<>()); } } @Override public void dropDatabase(TablePath tablePath, boolean 
ignoreIfNotExists) throws DatabaseNotExistException, CatalogException { if (catalogTables.containsKey(tablePath.getDatabaseName())) { catalogTables.remove(tablePath.getDatabaseName()); } else { if (ignoreIfNotExists) { log.debug("Database {} not exists, ignore", tablePath.getDatabaseName()); } else { throw new DatabaseNotExistException(name, tablePath.getDatabaseName()); } } } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/table/catalog/InMemoryCatalogFactory.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.table.catalog; import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.api.configuration.util.OptionRule; import org.apache.seatunnel.api.table.factory.CatalogFactory; import org.apache.seatunnel.api.table.factory.Factory; import com.google.auto.service.AutoService; @AutoService(Factory.class) public class InMemoryCatalogFactory implements CatalogFactory { @Override public Catalog createCatalog(String catalogName, ReadonlyConfig options) { return new InMemoryCatalog(catalogName, options); } @Override public String factoryIdentifier() { return "InMemory"; } @Override public OptionRule optionRule() { return OptionRule.builder() .required(InMemoryCatalogOptionRule.username, InMemoryCatalogOptionRule.password) .optional(InMemoryCatalogOptionRule.host, InMemoryCatalogOptionRule.port) .build(); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/table/catalog/InMemoryCatalogOptionRule.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/

package org.apache.seatunnel.api.table.catalog;

import org.apache.seatunnel.api.configuration.Option;
import org.apache.seatunnel.api.configuration.Options;

/**
 * Option definitions for the test-only in-memory catalog.
 *
 * <p>The constants are typed ({@code Option<String>} / {@code Option<Integer>}) so that {@code
 * ReadonlyConfig.get(option)} yields a value of the matching type without a cast.
 */
public class InMemoryCatalogOptionRule {
    /** User name; has no default value. */
    public static final Option<String> username =
            Options.key("username").stringType().noDefaultValue().withDescription("username");

    /** Password; has no default value. */
    public static final Option<String> password =
            Options.key("password").stringType().noDefaultValue().withDescription("password");

    /** Host name; defaults to {@code localhost}. */
    public static final Option<String> host =
            Options.key("host").stringType().defaultValue("localhost").withDescription("host");

    /** Port; defaults to {@code 5081}. */
    public static final Option<Integer> port =
            Options.key("port").intType().defaultValue(5081).withDescription("port");
}

================================================
FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/table/catalog/SeaTunnelDataTypeConvertorUtilTest.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.api.table.catalog; import org.apache.seatunnel.api.table.type.ArrayType; import org.apache.seatunnel.api.table.type.BasicType; import org.apache.seatunnel.api.table.type.MapType; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; public class SeaTunnelDataTypeConvertorUtilTest { @Test void testParseWithUnsupportedType() { SeaTunnelRuntimeException exception = Assertions.assertThrows( SeaTunnelRuntimeException.class, () -> SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "test", "MULTIPLE_ROW")); Assertions.assertEquals( "ErrorCode:[COMMON-07], ErrorDescription:['SeaTunnel' unsupported data type 'MULTIPLE_ROW' of 'test']", exception.getMessage()); SeaTunnelRuntimeException exception2 = Assertions.assertThrows( SeaTunnelRuntimeException.class, () -> SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "test", "map")); Assertions.assertEquals( "ErrorCode:[COMMON-07], ErrorDescription:['SeaTunnel' unsupported data type 'MULTIPLE_ROW' of 'test']", exception2.getMessage()); SeaTunnelRuntimeException exception3 = Assertions.assertThrows( SeaTunnelRuntimeException.class, () -> SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "test", "array")); Assertions.assertEquals( "ErrorCode:[COMMON-07], ErrorDescription:['SeaTunnel' unsupported data type 'MULTIPLE_ROW' of 'test']", exception3.getMessage()); SeaTunnelRuntimeException exception4 = Assertions.assertThrows( SeaTunnelRuntimeException.class, () -> SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "test", "uuid")); Assertions.assertEquals( "ErrorCode:[COMMON-07], ErrorDescription:['SeaTunnel' unsupported data type 'uuid' of 'test']", exception4.getMessage()); IllegalArgumentException exception5 = Assertions.assertThrows( 
IllegalArgumentException.class, () -> SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "test", "{uuid}")); String expectedMsg5 = String.format("HOCON Config parse from %s failed.", "{conf = {uuid}}"); Assertions.assertEquals(expectedMsg5, exception5.getMessage()); String invalidTypeDeclaration = "[e]"; IllegalArgumentException exception6 = Assertions.assertThrows( IllegalArgumentException.class, () -> SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "test", String.format("{c_0 = %s}", invalidTypeDeclaration))); String expectedMsg6 = String.format( "Unsupported parse SeaTunnel Type from '%s'.", invalidTypeDeclaration); Assertions.assertEquals(expectedMsg6, exception6.getMessage()); } @Test public void testCompatibleTypeDeclare() { SeaTunnelDataType longType = SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType("c_long", "long"); Assertions.assertEquals(BasicType.LONG_TYPE, longType); SeaTunnelDataType shortType = SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType("c_short", "short"); Assertions.assertEquals(BasicType.SHORT_TYPE, shortType); SeaTunnelDataType byteType = SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType("c_byte", "byte"); Assertions.assertEquals(BasicType.BYTE_TYPE, byteType); ArrayType longArrayType = (ArrayType) SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "c_long_array", "array"); Assertions.assertEquals(ArrayType.LONG_ARRAY_TYPE, longArrayType); ArrayType shortArrayType = (ArrayType) SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "c_short_array", "array"); Assertions.assertEquals(ArrayType.SHORT_ARRAY_TYPE, shortArrayType); ArrayType byteArrayType = (ArrayType) SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "c_byte_array", "array"); Assertions.assertEquals(ArrayType.BYTE_ARRAY_TYPE, byteArrayType); MapType longMapType = (MapType) SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "c_long_map", "map"); 
Assertions.assertEquals(BasicType.LONG_TYPE, longMapType.getKeyType()); Assertions.assertEquals(BasicType.LONG_TYPE, longMapType.getValueType()); MapType shortMapType = (MapType) SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "c_short_map", "map"); Assertions.assertEquals(BasicType.SHORT_TYPE, shortMapType.getKeyType()); Assertions.assertEquals(BasicType.SHORT_TYPE, shortMapType.getValueType()); MapType byteMapType = (MapType) SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "c_byte_map", "map"); Assertions.assertEquals(BasicType.BYTE_TYPE, byteMapType.getKeyType()); Assertions.assertEquals(BasicType.BYTE_TYPE, byteMapType.getValueType()); SeaTunnelRowType longRow = (SeaTunnelRowType) SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "c_long_row", "{c = long}"); Assertions.assertEquals(BasicType.LONG_TYPE, longRow.getFieldType(0)); SeaTunnelRowType shortRow = (SeaTunnelRowType) SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "c_short_row", "{c = short}"); Assertions.assertEquals(BasicType.SHORT_TYPE, shortRow.getFieldType(0)); SeaTunnelRowType byteRow = (SeaTunnelRowType) SeaTunnelDataTypeConvertorUtil.deserializeSeaTunnelDataType( "c_byte_row", "{c = byte}"); Assertions.assertEquals(BasicType.BYTE_TYPE, byteRow.getFieldType(0)); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/table/catalog/schema/BaseConfigParserTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog.schema; import org.apache.seatunnel.shade.com.typesafe.config.Config; import org.apache.seatunnel.shade.com.typesafe.config.ConfigFactory; import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.api.table.catalog.CatalogTableUtilTest; import java.io.File; import java.io.FileNotFoundException; import java.net.URISyntaxException; import java.net.URL; import java.nio.file.Paths; public class BaseConfigParserTest { protected Config getConfig(String configFile) throws FileNotFoundException, URISyntaxException { return ConfigFactory.parseFile(new File(getTestConfigFile(configFile))); } protected ReadonlyConfig getReadonlyConfig(String configFile) throws FileNotFoundException, URISyntaxException { return ReadonlyConfig.fromConfig(getConfig(configFile)); } private String getTestConfigFile(String configFile) throws FileNotFoundException, URISyntaxException { URL resource = CatalogTableUtilTest.class.getResource(configFile); if (resource == null) { throw new FileNotFoundException("Can't find config file: " + configFile); } return Paths.get(resource.toURI()).toString(); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/table/catalog/schema/ReadonlyConfigParserTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.catalog.schema; import org.apache.seatunnel.api.configuration.ReadonlyConfig; import org.apache.seatunnel.api.table.catalog.Column; import org.apache.seatunnel.api.table.catalog.ConstraintKey; import org.apache.seatunnel.api.table.catalog.PrimaryKey; import org.apache.seatunnel.api.table.catalog.TableSchema; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; import org.apache.seatunnel.api.table.type.SqlType; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.io.FileNotFoundException; import java.net.URISyntaxException; import java.util.List; class ReadonlyConfigParserTest extends BaseConfigParserTest { private static final String COLUMN_CONFIG = "/conf/catalog/schema_column.conf"; private static final String FIELD_CONFIG = "/conf/catalog/schema_field.conf"; @Test void parseColumn() throws FileNotFoundException, URISyntaxException { ReadonlyConfig config = getReadonlyConfig(COLUMN_CONFIG); ReadonlyConfigParser readonlyConfigParser = new ReadonlyConfigParser(); TableSchema tableSchema = readonlyConfigParser.parse(config); assertPrimaryKey(tableSchema); assertConstraintKey(tableSchema); assertColumn(tableSchema, true); } @Test void parseField() throws FileNotFoundException, URISyntaxException { ReadonlyConfig config = getReadonlyConfig(FIELD_CONFIG); ReadonlyConfigParser readonlyConfigParser = new ReadonlyConfigParser(); 
TableSchema tableSchema = readonlyConfigParser.parse(config); assertPrimaryKey(tableSchema); assertConstraintKey(tableSchema); assertColumn(tableSchema, false); } private void assertPrimaryKey(TableSchema tableSchema) { PrimaryKey primaryKey = tableSchema.getPrimaryKey(); Assertions.assertEquals("id", primaryKey.getPrimaryKey()); Assertions.assertEquals("id", primaryKey.getColumnNames().get(0)); } private void assertConstraintKey(TableSchema tableSchema) { List constraintKeys = tableSchema.getConstraintKeys(); ConstraintKey constraintKey = constraintKeys.get(0); Assertions.assertEquals("id_index", constraintKey.getConstraintName()); Assertions.assertEquals( ConstraintKey.ConstraintType.INDEX_KEY, constraintKey.getConstraintType()); Assertions.assertEquals("id", constraintKey.getColumnNames().get(0).getColumnName()); Assertions.assertEquals( ConstraintKey.ColumnSortType.ASC, constraintKey.getColumnNames().get(0).getSortType()); } private void assertColumn(TableSchema tableSchema, boolean comeFromColumnConfig) { List columns = tableSchema.getColumns(); Assertions.assertEquals(20, columns.size()); Assertions.assertEquals("id", columns.get(0).getName()); Assertions.assertEquals("map", columns.get(1).getName()); Assertions.assertEquals( "map>", columns.get(1).getDataType().toString().toLowerCase()); Assertions.assertEquals("map_array", columns.get(2).getName()); Assertions.assertEquals( "map>>", columns.get(2).getDataType().toString().toLowerCase()); Assertions.assertEquals("array", columns.get(3).getName()); Assertions.assertEquals( "array", columns.get(3).getDataType().toString().toLowerCase()); Assertions.assertEquals("string", columns.get(4).getName()); Assertions.assertEquals("string", columns.get(4).getDataType().toString().toLowerCase()); Assertions.assertEquals("row", columns.get(18).getName()); Assertions.assertEquals(SqlType.ROW, columns.get(18).getDataType().getSqlType()); SeaTunnelRowType seaTunnelRowType = (SeaTunnelRowType) columns.get(18).getDataType(); 
Assertions.assertEquals(18, seaTunnelRowType.getTotalFields()); SeaTunnelRowType seatunnalRowType1 = (SeaTunnelRowType) seaTunnelRowType.getFieldType(17); Assertions.assertEquals(17, seatunnalRowType1.getTotalFields()); Assertions.assertEquals("source", columns.get(19).getName()); Assertions.assertEquals(SqlType.ROW, columns.get(19).getDataType().getSqlType()); SeaTunnelRowType seaTunnelRowType2 = (SeaTunnelRowType) columns.get(19).getDataType(); Assertions.assertEquals(3, seaTunnelRowType2.getTotalFields()); Assertions.assertEquals("source", seaTunnelRowType2.getFieldName(2)); Assertions.assertEquals(SqlType.ROW, seaTunnelRowType2.getFieldType(2).getSqlType()); if (comeFromColumnConfig) { Assertions.assertEquals(0, columns.get(0).getDefaultValue()); Assertions.assertEquals("I'm default value", columns.get(4).getDefaultValue()); Assertions.assertEquals(false, columns.get(5).getDefaultValue()); Assertions.assertEquals(1.1, columns.get(10).getDefaultValue()); Assertions.assertEquals("2020-01-01", columns.get(15).getDefaultValue()); Assertions.assertEquals(4294967295L, columns.get(4).getColumnLength()); } } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/table/schema/event/EventTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.table.schema.event; import org.apache.seatunnel.api.event.EventType; import org.apache.seatunnel.api.table.catalog.PhysicalColumn; import org.apache.seatunnel.api.table.catalog.TableIdentifier; import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.api.table.type.BasicType; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; public class EventTest { @Test public void testTableColumnEventInstanceOf() { AlterTableModifyColumnEvent modifyColumnEvent = AlterTableModifyColumnEvent.modify( TableIdentifier.of("", TablePath.DEFAULT), PhysicalColumn.builder() .name("test") .dataType(BasicType.STRING_TYPE) .build()); Assertions.assertEquals( EventType.SCHEMA_CHANGE_MODIFY_COLUMN, getEventType(modifyColumnEvent)); AlterTableChangeColumnEvent changeColumnEvent = AlterTableChangeColumnEvent.change( TableIdentifier.of("", TablePath.DEFAULT), "old", PhysicalColumn.builder() .name("test") .dataType(BasicType.STRING_TYPE) .build()); Assertions.assertEquals( EventType.SCHEMA_CHANGE_CHANGE_COLUMN, getEventType(changeColumnEvent)); AlterTableAddColumnEvent addColumnEvent = AlterTableAddColumnEvent.add( TableIdentifier.of("", TablePath.DEFAULT), PhysicalColumn.builder() .name("test") .dataType(BasicType.STRING_TYPE) .build()); Assertions.assertEquals(EventType.SCHEMA_CHANGE_ADD_COLUMN, getEventType(addColumnEvent)); AlterTableDropColumnEvent dropColumnEvent = new AlterTableDropColumnEvent(TableIdentifier.of("", TablePath.DEFAULT), "test"); Assertions.assertEquals(EventType.SCHEMA_CHANGE_DROP_COLUMN, getEventType(dropColumnEvent)); } private EventType getEventType(AlterTableColumnEvent event) { if (event instanceof AlterTableAddColumnEvent) { return EventType.SCHEMA_CHANGE_ADD_COLUMN; } else if (event instanceof AlterTableDropColumnEvent) { return EventType.SCHEMA_CHANGE_DROP_COLUMN; } else if 
(event instanceof AlterTableModifyColumnEvent) { return EventType.SCHEMA_CHANGE_MODIFY_COLUMN; } else if (event instanceof AlterTableChangeColumnEvent) { return EventType.SCHEMA_CHANGE_CHANGE_COLUMN; } throw new UnsupportedOperationException( "Unsupported event type: " + event.getClass().getName()); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/table/type/SeaTunnelRowTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.table.type; import org.apache.seatunnel.shade.com.google.common.collect.Maps; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import java.math.BigDecimal; import java.util.Collections; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; public class SeaTunnelRowTest { @Test void testForRowSize() { Map map = new HashMap<>(); map.put( "key1", new SeaTunnelRow( new Object[] { 1, "test", 1L, new BigDecimal("3333.333"), })); map.put( "key2", new SeaTunnelRow( new Object[] { 1, "test", 1L, new BigDecimal("3333.333"), })); Map objectMap = Maps.newHashMap(); objectMap.put("name", "cosmos"); SeaTunnelRow row = new SeaTunnelRow( new Object[] { 1, "test", 1L, map, new BigDecimal("3333.333"), new String[] {"test2", "test", "3333.333"}, new Integer[] {1, 2, 3}, new Long[] {1L, 2L, 3L}, new Double[] {1D, 2D}, new Float[] {1F, 2F}, new Boolean[] {Boolean.TRUE, Boolean.FALSE}, new Byte[] {1, 2, 3, 4}, new Short[] {Short.parseShort("1")}, new Map[] {objectMap} }); SeaTunnelRow row2 = new SeaTunnelRow( new Object[] { 1, "test", 1L, map, new BigDecimal("3333.333"), new String[] {"test2", "test", "3333.333", null}, new Integer[] {1, 2, 3, null}, new Long[] {1L, 2L, 3L, null}, new Double[] {1D, 2D, null}, new Float[] {1F, 2F, null}, new Boolean[] {Boolean.TRUE, Boolean.FALSE, null}, new Byte[] {1, 2, 3, 4, null}, new Short[] {Short.parseShort("1"), null}, new Map[] {objectMap} }); SeaTunnelRowType rowType = new SeaTunnelRowType( new String[] { "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "f11", "f12", "f13" }, new SeaTunnelDataType[] { BasicType.INT_TYPE, BasicType.STRING_TYPE, BasicType.LONG_TYPE, new MapType<>( BasicType.STRING_TYPE, new SeaTunnelRowType( new String[] {"f0", "f1", "f2", "f3"}, new SeaTunnelDataType[] { BasicType.INT_TYPE, BasicType.STRING_TYPE, BasicType.LONG_TYPE, new DecimalType(10, 3) })), new DecimalType(10, 3), ArrayType.STRING_ARRAY_TYPE, 
ArrayType.INT_ARRAY_TYPE, ArrayType.LONG_ARRAY_TYPE, ArrayType.DOUBLE_ARRAY_TYPE, ArrayType.FLOAT_ARRAY_TYPE, ArrayType.BOOLEAN_ARRAY_TYPE, ArrayType.BYTE_ARRAY_TYPE, ArrayType.SHORT_ARRAY_TYPE, new ArrayType<>( Map[].class, new MapType<>(BasicType.STRING_TYPE, BasicType.STRING_TYPE)) }); Assertions.assertEquals(259, row.getBytesSize(rowType)); Assertions.assertEquals(259, row.getBytesSize()); Assertions.assertEquals(259, row2.getBytesSize(rowType)); Assertions.assertEquals(259, row2.getBytesSize()); } @Test void testWithLinkHashMap() { Map map = new LinkedHashMap<>(); map.put("key", "value"); SeaTunnelRow row = new SeaTunnelRow(new Object[] {map}); Assertions.assertEquals(8, row.getBytesSize()); } @Test void testWithMapInterface() { Map map = Collections.singletonMap("key", "value"); SeaTunnelRow row = new SeaTunnelRow(new Object[] {map}); Assertions.assertEquals(8, row.getBytesSize()); } } ================================================ FILE: seatunnel-api/src/test/java/org/apache/seatunnel/api/tracing/MDCTracerTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.tracing; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.slf4j.MDC; import java.util.concurrent.Callable; import java.util.concurrent.CompletableFuture; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Supplier; import java.util.stream.Collectors; import java.util.stream.IntStream; public class MDCTracerTest { @Test public void testMDCTracedRunnable() { MDCContext mdcContext = MDCContext.of(1, 2, 3); Runnable tracedRunnable = MDCTracer.tracing( mdcContext, new Runnable() { @Override public void run() { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); } }); Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); tracedRunnable.run(); Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); } @Test public void testMDCTracedCallable() throws Exception { MDCContext mdcContext = MDCContext.of(1, 2, 3); Callable tracedCallable = MDCTracer.tracing( mdcContext, new Callable() { @Override public Void call() throws Exception { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); return null; } }); Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); tracedCallable.call(); Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); } @Test 
public void testMDCTracedSupplier() throws Exception { Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); try (MDCContext ignored = MDCContext.of(1, 2, 3).activate()) { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); CompletableFuture.supplyAsync( MDCTracer.tracing( new Supplier() { @Override public Object get() { Assertions.assertEquals( "1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals( "2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals( "3", MDC.get(MDCContext.TASK_ID)); return null; } })) .get(); Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); } Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); } @Test public void testMDCTracedExecutorService() throws Exception { MDCContext mdcContext = MDCContext.of(1, 2, 3); MDCExecutorService tracedExecutorService = MDCTracer.tracing(mdcContext, Executors.newSingleThreadExecutor()); Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); tracedExecutorService .submit( new Runnable() { @Override public void run() { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); } }) .get(); Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); tracedExecutorService .submit( new Callable() { @Override public Void 
call() throws Exception { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); return null; } }) .get(); Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); MDCScheduledExecutorService tracedScheduledExecutorService = MDCTracer.tracing(mdcContext, Executors.newSingleThreadScheduledExecutor()); Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); tracedScheduledExecutorService .schedule( new Runnable() { @Override public void run() { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); } }, 1, TimeUnit.SECONDS) .get(); Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); tracedScheduledExecutorService .schedule( new Callable() { @Override public Object call() { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); return null; } }, 1, TimeUnit.SECONDS) .get(); Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); CompletableFuture futureWithScheduleAtFixedRate = new CompletableFuture<>(); tracedScheduledExecutorService.scheduleAtFixedRate( new Runnable() { AtomicInteger executeCount = new AtomicInteger(0); @Override public void run() { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); 
Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); executeCount.incrementAndGet(); if (executeCount.get() > 10 && !futureWithScheduleAtFixedRate.isDone()) { futureWithScheduleAtFixedRate.complete(true); } } }, 0, 10, TimeUnit.MILLISECONDS); futureWithScheduleAtFixedRate.join(); CompletableFuture futureWithScheduleAtFixedDelay = new CompletableFuture<>(); tracedScheduledExecutorService.scheduleWithFixedDelay( new Runnable() { AtomicInteger executeCount = new AtomicInteger(0); @Override public void run() { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); executeCount.incrementAndGet(); if (executeCount.get() > 10 && !futureWithScheduleAtFixedDelay.isDone()) { futureWithScheduleAtFixedDelay.complete(true); } } }, 0, 10, TimeUnit.MILLISECONDS); futureWithScheduleAtFixedDelay.join(); Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); } @Test public void testMDCTracedStream() throws Exception { MDCContext mdcContext = MDCContext.of(1, 2, 3); Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); MDCTracer.tracing( mdcContext, IntStream.range(1, 100) .boxed() .collect(Collectors.toList()) .parallelStream()) .filter( integer -> { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); return true; }) .map( integer -> { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); return integer; }) .sorted( (o1, o2) -> { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); 
Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); return Integer.compare(o1, o2); }) .forEach( integer -> { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); }); Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); try (MDCContext ignored = MDCContext.of(1, 2, 3).activate()) { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); MDCTracer.tracing( IntStream.range(1, 100) .boxed() .collect(Collectors.toList()) .parallelStream()) .filter( integer -> { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); return true; }) .map( integer -> { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); return integer; }) .sorted( (o1, o2) -> { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); return Integer.compare(o1, o2); }) .forEach( integer -> { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); }); Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); } Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); 
Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); try (MDCContext ignored = MDCContext.of(1, 2, 3).activate()) { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); mdcContext = MDCContext.of(4, 5, 6); MDCTracer.tracing( mdcContext, IntStream.range(1, 100) .boxed() .collect(Collectors.toList()) .parallelStream()) .filter( integer -> { Assertions.assertEquals("4", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("5", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("6", MDC.get(MDCContext.TASK_ID)); return true; }) .map( integer -> { Assertions.assertEquals("4", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("5", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("6", MDC.get(MDCContext.TASK_ID)); return integer; }) .sorted( (o1, o2) -> { Assertions.assertEquals("4", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("5", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("6", MDC.get(MDCContext.TASK_ID)); return Integer.compare(o1, o2); }) .forEach( integer -> { Assertions.assertEquals("4", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("5", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("6", MDC.get(MDCContext.TASK_ID)); }); Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); } Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); } @Test public void testMDCContext() throws Exception { Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); MDCContext mdcContext = MDCContext.of(1, 2, 3); try 
(MDCContext ignored = mdcContext.activate()) { Assertions.assertEquals("1", MDC.get(MDCContext.JOB_ID)); Assertions.assertEquals("2", MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertEquals("3", MDC.get(MDCContext.TASK_ID)); MDCContext currentMDCCOntext = MDCContext.current(); Assertions.assertEquals(mdcContext, currentMDCCOntext); } Assertions.assertNull(MDC.get(MDCContext.JOB_ID)); Assertions.assertNull(MDC.get(MDCContext.PIPELINE_ID)); Assertions.assertNull(MDC.get(MDCContext.TASK_ID)); } } ================================================ FILE: seatunnel-api/src/test/resources/conf/catalog/schema_column.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# schema = { columns = [ { name = id type = bigint nullable = false defaultValue = 0 comment = "primary key id" }, { name = map type = "map>" nullable = true comment = "map value" }, { name = map_array type = "map>>" nullable = true comment = "map_array value" }, { name = array type = "array" nullable = true comment = "array value" }, { name = string type = "string" nullable = true defaultValue = "I'm default value" // bigger than integer max value columnLength = 4294967295 comment = "string value" }, { name = boolean type = "boolean" nullable = true defaultValue = false comment = "boolean value" }, { name = tinyint type = "tinyint" nullable = true comment = "tinyint value" }, { name = smallint type = "smallint" nullable = true comment = "smallint value" }, { name = int type = "int" nullable = true comment = "int value" }, { name = bigint type = "bigint" nullable = true comment = "bigint value" }, { name = float type = "float" nullable = true defaultValue = 1.1 comment = "float value" }, { name = double type = "double" nullable = true comment = "double value" }, { name = decimal type = "decimal(30, 8)" nullable = true comment = "decimal value" }, { name = "null" type = "null" nullable = true comment = "null value" }, { name = bytes type = "bytes" nullable = true comment = "bytes value" }, { name = date type = "date" nullable = true defaultValue = "2020-01-01" comment = "date value" }, { name = time type = "time" nullable = true comment = "time value" }, { name = timestamp type = "timestamp" nullable = true comment = "timestamp value" }, { name = row type = { map = "map>" map_array = "map>>" array = "array" string = string boolean = boolean tinyint = tinyint smallint = smallint int = int bigint = bigint float = float double = double decimal = "decimal(30, 8)" null = "null" bytes = bytes date = date time = time timestamp = timestamp row = { map = "map>" map_array = "map>>" array = "array" string = string boolean = boolean tinyint = tinyint smallint = smallint int = 
int bigint = bigint float = float double = double decimal = "decimal(30, 8)" null = "null" bytes = bytes date = date time = time timestamp = timestamp } } nullable = true comment = "row value" }, { name = source type = { map = "map>" string = string source = { map = "map>" string = string } } nullable = true comment = "row value" } ] primaryKey { name = "id" columnNames = [id] } constraintKeys = [ { constraintName = "id_index" constraintType = INDEX_KEY constraintColumns = [ { columnName = "id" sortType = ASC } ] }, ] } ================================================ FILE: seatunnel-api/src/test/resources/conf/catalog/schema_field.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# schema { fields { id = int map = "map>" map_array = "map>>" array = "array" string = string boolean = boolean tinyint = tinyint smallint = smallint int = int bigint = bigint float = float double = double decimal = "decimal(30, 8)" null = "null" bytes = bytes date = date time = time timestamp = timestamp row = { map = "map>" map_array = "map>>" array = "array" string = string boolean = boolean tinyint = tinyint smallint = smallint int = int bigint = bigint float = float double = double decimal = "decimal(30, 8)" null = "null" bytes = bytes date = date time = time timestamp = timestamp row = { map = "map>" map_array = "map>>" array = "array" string = string boolean = boolean tinyint = tinyint smallint = smallint int = int bigint = bigint float = float double = double decimal = "decimal(30, 8)" null = "null" bytes = bytes date = date time = time timestamp = timestamp } } source = { map = "map>" string = string source = { map = "map>" string = string } } } primaryKey { name = "id" columnNames = [id] } constraintKeys = [ { constraintName = "id_index" constraintType = INDEX_KEY constraintColumns = [ { columnName = "id" sortType = ASC } ] } ] } ================================================ FILE: seatunnel-api/src/test/resources/conf/complex.schema.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # schema { fields { map = "map>" map_array = "map>>" array = "array" string = string boolean = boolean tinyint = tinyint smallint = smallint int = int bigint = bigint float = float double = double decimal = "decimal(30, 8)" null = "null" bytes = bytes date = date time = time timestamp = timestamp row = { map = "map>" map_array = "map>>" array = "array" string = string boolean = boolean tinyint = tinyint smallint = smallint int = int bigint = bigint float = float double = double decimal = "decimal(30, 8)" null = "null" bytes = bytes date = date time = time timestamp = timestamp row = { map = "map>" map_array = "map>>" array = "array" string = string boolean = boolean tinyint = tinyint smallint = smallint int = int bigint = bigint float = float double = double decimal = "decimal(30, 8)" null = "null" bytes = bytes date = date time = time timestamp = timestamp } } } } ================================================ FILE: seatunnel-api/src/test/resources/conf/config_special_schema.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # // Special schema, used X.X as key. 
We shouldn't parse it as an object of t. schema { fields { t.string = STRING t.boolean = BOOLEAN t.long = BIGINT t.double = DOUBLE t.null = NULL t.byteArray = BYTES t.date = DATE t.localDateTime = TIMESTAMP _map = "MAP" t.list = "ARRAY" t.int = INT t.float = FLOAT } } ================================================ FILE: seatunnel-api/src/test/resources/conf/default_tablepath.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # env { job.mode = "BATCH" } source { MongoDB-CDC { hosts = "mongo0:27017" database = ["inventory"] collection = ["inventory.products"] username = superuser password = superpw schema = { fields { "_id": string, "name": string, "description": string, "weight": string } } } } transform { } sink { Console{} } ================================================ FILE: seatunnel-api/src/test/resources/conf/generic_row.schema.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # schema { fields { # Hocon style declare row type in generic type map0 = "map" # Json style declare row type in generic type map1 = "map" } } ================================================ FILE: seatunnel-api/src/test/resources/conf/getCatalogTable.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# env { job.mode = "BATCH" } source { InMemory { plugin_output = "fake" username = "st" password = "stpassword" table-names = ["st.public.table1", "st.public.table2"] parallelism = 3 } } transform { } sink { InMemory { plugin_input = "fake" username = "st" password = "stpassword" address = "localhost" port = 1234 } } ================================================ FILE: seatunnel-api/src/test/resources/conf/json/metadata_json_from_meta_lake_hive.json ================================================ { "code": 0, "table": { "name": "all_hive_types_csv", "columns": [ { "name": "c_tinyint", "type": "byte", "nullable": true, "autoIncrement": false }, { "name": "c_smallint", "type": "short", "nullable": true, "autoIncrement": false }, { "name": "c_int", "type": "integer", "nullable": true, "autoIncrement": false }, { "name": "c_bigint", "type": "long", "nullable": true, "autoIncrement": false }, { "name": "c_boolean", "type": "boolean", "nullable": true, "autoIncrement": false }, { "name": "c_float", "type": "float", "nullable": true, "autoIncrement": false }, { "name": "c_double", "type": "double", "nullable": true, "autoIncrement": false }, { "name": "c_decimal", "type": "decimal(20,6)", "nullable": true, "autoIncrement": false }, { "name": "c_string", "type": "string", "nullable": true, "autoIncrement": false }, { "name": "c_varchar", "type": "varchar(50)", "nullable": true, "autoIncrement": false }, { "name": "c_char", "type": "char(10)", "nullable": true, "autoIncrement": false }, { "name": "c_binary", "type": "binary", "nullable": true, "autoIncrement": false }, { "name": "c_date", "type": "date", "nullable": true, "autoIncrement": false }, { "name": "c_timestamp", "type": "timestamp", "nullable": true, "autoIncrement": false }, { "name": "c_array_int", "type": { "type": "list", "containsNull": true, "elementType": "integer" }, "nullable": true, "autoIncrement": false }, { "name": "c_array_string", "type": { "type": "list", "containsNull": true, "elementType": 
"string" }, "nullable": true, "autoIncrement": false }, { "name": "c_map_str_int", "type": { "type": "map", "valueContainsNull": true, "keyType": "string", "valueType": "integer" }, "nullable": true, "autoIncrement": false }, { "name": "c_map_str_str", "type": { "type": "map", "valueContainsNull": true, "keyType": "string", "valueType": "string" }, "nullable": true, "autoIncrement": false }, { "name": "c_struct_simple", "type": { "type": "struct", "fields": [ { "name": "id", "type": "integer", "nullable": true }, { "name": "name", "type": "string", "nullable": true } ] }, "nullable": true, "autoIncrement": false }, { "name": "c_struct_nested", "type": { "type": "struct", "fields": [ { "name": "base", "type": { "type": "struct", "fields": [ { "name": "id", "type": "long", "nullable": true }, { "name": "flag", "type": "boolean", "nullable": true } ] }, "nullable": true }, { "name": "ext", "type": { "type": "struct", "fields": [ { "name": "score", "type": "double", "nullable": true }, { "name": "tags", "type": { "type": "list", "containsNull": true, "elementType": "string" }, "nullable": true } ] }, "nullable": true } ] }, "nullable": true, "autoIncrement": false } ], "properties": { "numRows": "0", "rawDataSize": "0", "transient_lastDdlTime": "1769685048", "serde.parameter.mapkey.delim": ":", "output-format": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", "table-type": "MANAGED_TABLE", "serde.parameter.collection.delim": "|", "numFilesErasureCoded": "0", "input-format": "org.apache.hadoop.mapred.TextInputFormat", "totalSize": "0", "COLUMN_STATS_ACCURATE": "{\"BASIC_STATS\":\"true\"}", "numFiles": "0", "serde.parameter.serialization.format": ",", "serde.parameter.field.delim": ",", "location": "hdfs://foton1.cdh.com:8020/user/hive/warehouse/test.db/all_hive_types_csv", "serde-lib": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe" }, "audit": { "creator": "root", "createTime": "2026-01-29T11:10:48Z" }, "distribution": { "strategy": "none", "number": 
0, "funcArgs": [] }, "sortOrders": [], "partitioning": [], "indexes": [] } } ================================================ FILE: seatunnel-api/src/test/resources/conf/json/metadata_json_from_meta_lake_pgsql.json ================================================ { "code": 0, "table": { "name": "all_type", "columns": [ { "name": "id", "type": "integer", "nullable": false, "autoIncrement": false }, { "name": "big_number", "type": "long", "nullable": true, "autoIncrement": false, "defaultValue": { "type": "literal", "dataType": "null", "value": "NULL" } }, { "name": "small_number", "type": "integer", "nullable": true, "autoIncrement": false, "defaultValue": { "type": "literal", "dataType": "null", "value": "NULL" } }, { "name": "tiny_number", "type": "short", "nullable": true, "autoIncrement": false, "defaultValue": { "type": "literal", "dataType": "null", "value": "NULL" } }, { "name": "float_value", "type": "float", "nullable": true, "autoIncrement": false, "defaultValue": { "type": "literal", "dataType": "null", "value": "NULL" } }, { "name": "double_value", "type": "double", "nullable": true, "autoIncrement": false, "defaultValue": { "type": "literal", "dataType": "null", "value": "NULL" } }, { "name": "decimal_value", "type": "decimal(10,2)", "nullable": true, "autoIncrement": false, "defaultValue": { "type": "literal", "dataType": "null", "value": "NULL" } }, { "name": "event_date", "type": "date", "nullable": true, "autoIncrement": false, "defaultValue": { "type": "literal", "dataType": "null", "value": "NULL" } }, { "name": "user_name", "type": "varchar(300)", "nullable": true, "autoIncrement": false, "defaultValue": { "type": "literal", "dataType": "null", "value": "NULL" } }, { "name": "code", "type": "varchar(15)", "nullable": true, "autoIncrement": false, "defaultValue": { "type": "literal", "dataType": "null", "value": "NULL" } }, { "name": "description", "type": "string", "nullable": true, "autoIncrement": false, "defaultValue": { "type": "literal", 
"dataType": "null", "value": "NULL" } }, { "name": "event_json", "type": "string", "nullable": true, "autoIncrement": false, "defaultValue": { "type": "literal", "dataType": "null", "value": "NULL" } }, { "name": "map_field", "type": { "type": "external", "catalogString": "jsonb" }, "nullable": true, "autoIncrement": false, "defaultValue": { "type": "literal", "dataType": "null", "value": "NULL" } }, { "name": "list_field", "type": { "type": "list", "containsNull": false, "elementType": "string" }, "nullable": true, "autoIncrement": false, "defaultValue": { "type": "literal", "dataType": "null", "value": "NULL" } } ], "properties": { }, "audit": { "lastModifier": "anonymous", "lastModifiedTime": "2026-01-26T09:11:59.357512917Z" }, "distribution": { "strategy": "none", "number": 0, "funcArgs": [] }, "sortOrders": [], "partitioning": [], "indexes": [ { "indexType": "PRIMARY_KEY", "name": "all_type_pk", "fieldNames": [ [ "id" ] ] }, { "indexType": "UNIQUE_KEY", "name": "all_type_big_number_idx", "fieldNames": [ [ "big_number" ] ] } ] } } ================================================ FILE: seatunnel-api/src/test/resources/conf/option-test.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# env { parallelism = 1 job.mode = "STREAMING" checkpoint.interval = 5000 } source { FakeSource { option { bool = true bool-str = "false" int = 2147483647 int-str = "100" float = 3.3333 float-str = "3.1415" double = 3.1415926535897932384626433832795028841971 double-str = "3.1415926535897932384626433832795028841971" map { inner { path = "mac" name = "ashulin" # The nested Map(Map>) type supports only JSON map = """{"fantasy":"final"}""" } type = "source" patch.note = "hollow" name = "saitou" } } option.long = 21474836470 option.long-str = "21474836470" option.string = "Hello, Apache SeaTunnel" option.enum = "LATEST" option.numeric-list = [ 1, 2 ] option.enum-list = [ "EARLIEST", "LATEST" ] option.list-json = """["Hello", "Apache SeaTunnel"]""" option.list = ["final", "fantasy", "VII"] option.list-str = "Silk,Song" option.complex-type = [{ inner { list = [{ inner { path = "mac" name = "ashulin" map = """{"fantasy":"final"}""" } type = "source" patch.note = "hollow" name = "saitou" }, { inner { path = "mac" name = "ashulin" map = """{"fantasy":"final"}""" } type = "source" patch.note = "hollow" name = "saitou" }] list-2 = [{ inner { path = "mac" name = "ashulin" map = """{"fantasy":"final"}""" } type = "source" patch.note = "hollow" name = "saitou" }] } }] } } transform { sql { sql = "select name,age from dual" } } sink { File { path = "file:///tmp/hive/warehouse/test2" field_delimiter = "\t" row_delimiter = "\n" partition_by = ["age"] partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true file_name_expression = "${transactionId}_${now}" file_format_type = "text" sink_columns = ["name","age"] extendsSQL = """insert into sink (c_bit_1, c_bit_8, c_bit_16, c_bit_32, c_bit_64, c_boolean, c_tinyint, c_tinyint_unsigned, c_smallint, c_smallint_unsigned, c_mediumint, c_mediumint_unsigned, c_int, c_integer, c_bigint, c_bigint_unsigned, c_decimal, c_decimal_unsigned, c_float, c_float_unsigned, c_double, c_double_unsigned, c_char, c_tinytext, 
c_mediumtext, c_text, c_varchar, c_json, c_longtext, c_date, c_datetime, c_timestamp, c_tinyblob, c_mediumblob, c_blob, c_longblob, c_varbinary, c_binary, c_year, c_int_unsigned, c_integer_unsigned,c_bigint_30,c_decimal_unsigned_30,c_decimal_30) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""" } } ================================================ FILE: seatunnel-api/src/test/resources/conf/partition_keys.schema.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # schema { table = "db.test_table" partition_keys = ["bucket(id, 16)", "dt"] fields { id = int dt = string } } ================================================ FILE: seatunnel-api/src/test/resources/conf/simple.schema.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # schema { fields { map = "map" array = "array" string = string boolean = boolean tinyint = tinyint smallint = smallint int = int bigint = bigint float = float double = double decimal = "decimal(30, 8)" null = "null" bytes = bytes date = date time = time timestamp = timestamp } } ================================================ FILE: seatunnel-api/src/test/resources/conf/table_schema_discoverer/multiple_tables_fields.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# tables_configs = [ { schema { table = "db.table1" columns = [ { name = id type = bigint nullable = false columnLength = 20 defaultValue = 0 comment = "primary key id" } ] } }, { schema { table = "db.table2" fields { user_id = int email = string age = int } } } ] ================================================ FILE: seatunnel-api/src/test/resources/conf/table_schema_discoverer/multiple_tables_mixed.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # tables_configs = [ { schema { table = "db.table1" fields { id = int name = string } } }, { schema { schema_url = "http://localhost:8090/api/metalakes/test_catalog/schemas/test_schema/tables/table2" } } ] ================================================ FILE: seatunnel-api/src/test/resources/conf/table_schema_discoverer/multiple_tables_no_schema_mixed_format.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. 
# The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Multiple tables configuration without schema fields, mixed file formats (parquet, orc, binary) tables_configs = [ { schema { table = "db.parquet_table" } file_format_type = "parquet" file_path = "/tmp/test/table1.parquet" }, { schema { table = "db.orc_table" } file_format_type = "orc" file_path = "/tmp/test/table2.orc" }, { schema { table = "db.binary_table" } file_format_type = "binary" file_path = "/tmp/test/table3.bin" } ] ================================================ FILE: seatunnel-api/src/test/resources/conf/table_schema_discoverer/multiple_tables_schema_url.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# tables_configs = [ { schema { table = "test_database.test_schema.test_table1" schema_url = "http://localhost:8090/api/metalakes/test_catalog/schemas/test_schema/tables/table1" } }, { schema { schema_url = "http://localhost:8090/api/metalakes/test_catalog/schemas/test_schema/tables/table2" } } ] ================================================ FILE: seatunnel-api/src/test/resources/conf/table_schema_discoverer/single_no_schema.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Single table configuration without schema # When no schema is configured, should return a simple text table file_format_type = "parquet" file_path = "/tmp/test/table1.parquet" ================================================ FILE: seatunnel-api/src/test/resources/conf/table_schema_discoverer/single_schema_field.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. 
# The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # schema { fields { id = int name = string age = int } } ================================================ FILE: seatunnel-api/src/test/resources/conf/table_schema_discoverer/single_schema_url.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# schema { schema_url = "http://localhost:8090/api/metalakes/test_catalog/schemas/test_schema/tables/test_table" } ================================================ FILE: seatunnel-ci-tools/pom.xml ================================================ 4.0.0 org.apache.seatunnel seatunnel ${revision} seatunnel-ci-tools SeaTunnel : Tools : CI : Java 3.26.1 com.github.javaparser javaparser-core ${javaparser.version} test com.github.javaparser javaparser-symbol-solver-core ${javaparser.version} test org.apache.maven.plugins maven-compiler-plugin 8 8 ================================================ FILE: seatunnel-ci-tools/src/test/java/org/apache/seatunnel/api/ChineseCharacterCheckTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import com.github.javaparser.JavaParser; import com.github.javaparser.ParseResult; import com.github.javaparser.ast.CompilationUnit; import com.github.javaparser.ast.comments.Comment; import com.github.javaparser.ast.visitor.VoidVisitorAdapter; import lombok.extern.slf4j.Slf4j; import java.io.IOException; import java.nio.file.FileVisitOption; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; import java.util.stream.Stream; import static org.apache.seatunnel.api.ImportClassCheckTest.isWindows; @Slf4j public class ChineseCharacterCheckTest { private final JavaParser JAVA_PARSER = new JavaParser(); private static final Pattern CHINESE_PATTERN = Pattern.compile("[\\u4e00-\\u9fa5]"); /** Defines what content should be checked for Chinese characters */ public enum CheckScope { /** Check both comments and code */ ALL, /** Check only comments */ COMMENTS_ONLY, /** Check only code (string literals) */ CODE_ONLY } @Disabled("Currently only checking comments") @Test public void checkChineseCharactersInAll() { checkChineseCharacters(CheckScope.ALL); } @Test public void checkChineseCharactersInCommentsOnly() { checkChineseCharacters(CheckScope.COMMENTS_ONLY); } @Disabled("Currently only checking comments") @Test public void checkChineseCharactersInCodeOnly() { checkChineseCharacters(CheckScope.CODE_ONLY); } private void checkChineseCharacters(CheckScope scope) { // Define path fragments for source and test Java files String mainPathFragment = isWindows ? "src\\main\\java" : "src/main/java"; String testPathFragment2 = isWindows ? 
"src\\test\\java" : "src/test/java"; try (Stream paths = Files.walk(Paths.get(".."), FileVisitOption.FOLLOW_LINKS)) { List filesWithChinese = new ArrayList<>(); // Filter Java files in the specified directories paths.filter( path -> { String pathString = path.toString(); return pathString.endsWith(".java") && (pathString.contains(mainPathFragment) || pathString.contains(testPathFragment2)); }) .forEach( path -> { try { // Parse the Java file ParseResult parseResult = JAVA_PARSER.parse(Files.newInputStream(path)); parseResult .getResult() .ifPresent( cu -> { // Check for Chinese characters in comments // if needed if (scope != CheckScope.CODE_ONLY) { List comments = cu.getAllContainedComments(); for (Comment comment : comments) { if (CHINESE_PATTERN .matcher( comment .getContent()) .find()) { filesWithChinese.add( String.format( "Found Chinese characters in comment at %s: %s", path .toAbsolutePath(), comment.getContent() .trim())); } } } // Check for Chinese characters in code if // needed if (scope != CheckScope.COMMENTS_ONLY) { ChineseCharacterVisitor visitor = new ChineseCharacterVisitor( path, filesWithChinese); visitor.visit(cu, null); } }); } catch (Exception e) { log.error("Error parsing file: {}", path, e); } }); // Assert that no files contain Chinese characters Assertions.assertEquals( 0, filesWithChinese.size(), () -> String.format( "Found Chinese characters in following files (Scope: %s):\n%s", scope, String.join("\n", filesWithChinese))); } catch (IOException e) { throw new RuntimeException(e); } } private static class ChineseCharacterVisitor extends VoidVisitorAdapter { private final Path filePath; private final List filesWithChinese; public ChineseCharacterVisitor(Path filePath, List filesWithChinese) { this.filePath = filePath; this.filesWithChinese = filesWithChinese; } @Override public void visit(CompilationUnit cu, Void arg) { // Check for Chinese characters in string literals cu.findAll(com.github.javaparser.ast.expr.StringLiteralExpr.class) 
.forEach( str -> { if (CHINESE_PATTERN.matcher(str.getValue()).find()) { filesWithChinese.add( String.format( "Found Chinese characters in string literal at %s: %s", filePath.toAbsolutePath(), str.getValue())); } }); super.visit(cu, arg); } } } ================================================ FILE: seatunnel-ci-tools/src/test/java/org/apache/seatunnel/api/ConnectorOptionCheckTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import com.github.javaparser.JavaParser; import com.github.javaparser.ParseResult; import com.github.javaparser.ast.CompilationUnit; import com.github.javaparser.ast.NodeList; import com.github.javaparser.ast.body.ClassOrInterfaceDeclaration; import com.github.javaparser.ast.type.ClassOrInterfaceType; import lombok.extern.slf4j.Slf4j; import java.io.File; import java.io.IOException; import java.nio.file.FileVisitOption; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; @Slf4j public class ConnectorOptionCheckTest { private static final String javaPathFragment = "src" + File.separator + "main" + File.separator + "java"; private static final String JAVA_FILE_EXTENSION = ".java"; private static final String CONNECTOR_DIR = "seatunnel-connectors-v2"; private static final JavaParser JAVA_PARSER = new JavaParser(); @Test public void checkConnectorOptionExist() { Set connectorOptionFileNames = new HashSet<>(); try (Stream paths = Files.walk(Paths.get(".."), FileVisitOption.FOLLOW_LINKS)) { List connectorClassPaths = paths.filter( path -> { String pathString = path.toString(); return pathString.endsWith(JAVA_FILE_EXTENSION) && pathString.contains(CONNECTOR_DIR) && pathString.contains(javaPathFragment); }) .collect(Collectors.toList()); connectorClassPaths.forEach( path -> { try { ParseResult parseResult = JAVA_PARSER.parse(Files.newInputStream(path)); parseResult .getResult() .ifPresent( compilationUnit -> { List classes = compilationUnit.findAll( ClassOrInterfaceDeclaration.class); for (ClassOrInterfaceDeclaration classDeclaration : classes) { if (classDeclaration.isAbstract() || classDeclaration.isInterface()) { continue; } NodeList implementedTypes = classDeclaration .getImplementedTypes(); 
implementedTypes.forEach( implementedType -> { if (implementedType .getNameAsString() .equals( "SeaTunnelSource") || implementedType .getNameAsString() .equals( "SeaTunnelSink")) { connectorOptionFileNames.add( path.getFileName() .toString() .replace( JAVA_FILE_EXTENSION, "") .concat( "Options")); } }); NodeList extendedTypes = classDeclaration.getExtendedTypes(); extendedTypes.forEach( extendedType -> { if (extendedType .getNameAsString() .equals( "AbstractSimpleSink") || extendedType .getNameAsString() .equals( "AbstractSingleSplitSource") || extendedType .getNameAsString() .equals( "IncrementalSource") || extendedType .getNameAsString() .equals( "BaseMultipleTableFileSink") || extendedType .getNameAsString() .equals( "BaseFileSource") || extendedType .getNameAsString() .equals( "BaseFileSink") || extendedType .getNameAsString() .equals( "HttpSource") || extendedType .getNameAsString() .equals( "HttpSink")) { connectorOptionFileNames.add( path.getFileName() .toString() .replace( JAVA_FILE_EXTENSION, "") .concat( "Options")); } }); } }); } catch (IOException e) { throw new RuntimeException(e); } }); connectorClassPaths.forEach( path -> { String className = path.getFileName().toString().replace(JAVA_FILE_EXTENSION, ""); connectorOptionFileNames.remove(className); }); Assertions.assertEquals( 0, connectorOptionFileNames.size(), () -> "Connector class does not have correspondingly [Options] class. " + "The connector need put all parameter into Options classes, like [ActivemqSink] and [ActivemqSinkOptions].\n" + "Those [Options] class are missing: \n" + String.join("\n", connectorOptionFileNames) + "\n"); } catch (IOException e) { throw new RuntimeException(e); } } } ================================================ FILE: seatunnel-ci-tools/src/test/java/org/apache/seatunnel/api/ImportClassCheckTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import com.github.javaparser.JavaParser; import com.github.javaparser.ParseResult; import com.github.javaparser.Range; import com.github.javaparser.ast.CompilationUnit; import com.github.javaparser.ast.ImportDeclaration; import com.github.javaparser.ast.NodeList; import lombok.extern.slf4j.Slf4j; import java.io.IOException; import java.io.InputStream; import java.nio.file.FileVisitOption; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; import java.util.stream.Stream; import static java.nio.file.StandardOpenOption.READ; @Slf4j public class ImportClassCheckTest { private static Map> importsMap = new HashMap<>(); private final String SEATUNNEL_SHADE_PREFIX = "org.apache.seatunnel.shade."; public static final boolean isWindows = System.getProperty("os.name").toLowerCase().startsWith("win"); private static final String JAVA_FILE_EXTENSION = ".java"; private static final JavaParser JAVA_PARSER = new 
JavaParser(); @BeforeAll public static void beforeAll() { try (Stream paths = Files.walk(Paths.get(".."), FileVisitOption.FOLLOW_LINKS)) { paths.filter(path -> path.toString().endsWith(JAVA_FILE_EXTENSION)) .forEach( path -> { try (InputStream inputStream = Files.newInputStream(path, READ)) { ParseResult parseResult = JAVA_PARSER.parse(inputStream); Optional result = parseResult.getResult(); if (result.isPresent()) { importsMap.put(path.toString(), result.get().getImports()); } else { log.error("Failed to parse Java file: " + path); } } catch (IOException e) { log.error( "IOException occurred while processing file: " + path, e); } }); } catch (IOException e) { throw new RuntimeException("Failed to walk through directory", e); } } @Test public void commonLang2Check() { // both common-lang and common-lang3 share the same prefix org.apache.commons.lang Map> commonLangMap = checkImportClassPrefix( Arrays.asList("org.apache.commons.lang"), Collections.emptyList(), Collections.emptyList()); // common-lang3 Map> commonLang3Map = checkImportClassPrefix( Arrays.asList("org.apache.commons.lang3"), Collections.emptyList(), Collections.emptyList()); // find the one in common-lang but not common-lang3 Map> errorMap = commonLangMap.entrySet().stream() .filter(entry -> !commonLang3Map.containsKey(entry.getKey())) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); Assertions.assertEquals( 0, errorMap.size(), shadeErrorMsg("org.apache.commons.lang", errorMap)); log.info("check org.apache.commons.lang successfully"); } @Test public void guavaShadeCheck() { Map> errorMap = checkImportClassPrefixWithAll(Collections.singletonList("com.google.common")); Assertions.assertEquals(0, errorMap.size(), shadeErrorMsg("guava", errorMap)); log.info("check guava shade successfully"); } @Test public void jacksonShadeCheck() { Map> errorMap = checkImportClassPrefixWithExclude( Collections.singletonList("com.fasterxml.jackson"), Arrays.asList( 
"org.apache.seatunnel.format.compatible.debezium.json", "org.apache.seatunnel.format.compatible.kafka.connect.json", "org.apache.seatunnel.connectors.druid.sink", "org.apache.seatunnel.connectors.seatunnel.typesense.client")); Assertions.assertEquals(0, errorMap.size(), shadeErrorMsg("jackson", errorMap)); log.info("check jackson shade successfully"); } @Test public void jettyShadeCheck() { Map> errorMap = checkImportClassPrefixWithAll(Collections.singletonList("org.eclipse.jetty")); Assertions.assertEquals(0, errorMap.size(), shadeErrorMsg("jetty", errorMap)); log.info("check jetty shade successfully"); } @Test public void hikariShadeCheck() { Map> errorMap = checkImportClassPrefixWithAll(Collections.singletonList("com.zaxxer.hikari")); Assertions.assertEquals(0, errorMap.size(), shadeErrorMsg("hikari", errorMap)); log.info("check hikari shade successfully"); } @Test public void janinoShadeCheck() { Map> errorMap = checkImportClassPrefixWithAll( Arrays.asList("org.codehaus.janino", "org.codehaus.commons")); Assertions.assertEquals(0, errorMap.size(), shadeErrorMsg("janino", errorMap)); log.info("check janino shade successfully"); } @Test public void commonLang3Check() { Map> errorMap = checkImportClassPrefixWithAll( Collections.singletonList("org.apache.commons.lang3")); Assertions.assertEquals(0, errorMap.size(), shadeErrorMsg("commons.lang3", errorMap)); log.info("check common lang3 shade successfully"); } @Test public void javaUtilCompletableFutureCheck() { Map> errorMap = checkImportClassPrefix( Collections.singletonList("java.util.concurrent.CompletableFuture"), Collections.singletonList("org.apache.seatunnel.engine"), Collections.singletonList("org.apache.seatunnel.engine.e2e")); Assertions.assertEquals( 0, errorMap.size(), errorMsg( "Can not use java.util.concurrent.CompletableFuture, please use org.apache.seatunnel.engine.common.utils.concurrent.CompletableFuture instead.", errorMap)); log.info("check java concurrent CompletableFuture successfully"); } 
private Map> checkImportClassPrefixWithAll(List prefixList) { return checkImportClassPrefix(prefixList, Collections.emptyList(), Collections.emptyList()); } private Map> checkImportClassPrefixWithExclude( List prefixList, List packageWhiteList) { return checkImportClassPrefix(prefixList, Collections.emptyList(), packageWhiteList); } private Map> checkImportClassPrefixWithInclude( List prefixList, List packageCheckList) { return checkImportClassPrefix(prefixList, packageCheckList, Collections.emptyList()); } private Map> checkImportClassPrefix( List prefixList, List packageCheckList, List packageWhiteList) { List pathWhiteList = packageWhiteList.stream() .map(whitePackage -> whitePackage.replace(".", isWindows ? "\\" : "/")) .collect(Collectors.toList()); List pathCheckList = packageCheckList.stream() .map(whitePackage -> whitePackage.replace(".", isWindows ? "\\" : "/")) .collect(Collectors.toList()); Map> errorMap = new HashMap<>(); importsMap.forEach( (clazzPath, imports) -> { boolean match; if (pathCheckList.isEmpty()) { match = pathWhiteList.stream().noneMatch(clazzPath::contains); } else { match = pathCheckList.stream().anyMatch(clazzPath::contains) && pathWhiteList.stream().noneMatch(clazzPath::contains); } if (match) { List collect = imports.stream() .filter( importDeclaration -> { String importClz = importDeclaration.getName().asString(); return prefixList.stream() .anyMatch(importClz::startsWith); }) .map(this::getImportClassLineNum) .collect(Collectors.toList()); if (!collect.isEmpty()) { errorMap.put(clazzPath, collect); } } }); return errorMap; } private String shadeErrorMsg(String checkType, Map> errorMap) { String msg = String.format("%s shade is not up to code, need add prefix [", checkType) + SEATUNNEL_SHADE_PREFIX + "]. 
\n"; return errorMsg(msg, errorMap); } private String errorMsg(String message, Map> errorMap) { StringBuilder msg = new StringBuilder(); msg.append(message).append("\n"); errorMap.forEach( (key, value) -> { msg.append(key).append("\n"); value.forEach(lineNum -> msg.append(lineNum).append("\n")); }); return msg.toString(); } private String getImportClassLineNum(ImportDeclaration importDeclaration) { Range range = importDeclaration.getRange().get(); return String.format("%s [%s]", importDeclaration.getName().asString(), range.end.line); } @AfterAll public static void cleanup() { importsMap.clear(); } } ================================================ FILE: seatunnel-ci-tools/src/test/java/org/apache/seatunnel/api/SerialVersionUIDCheckerTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.extension.ExtensionContext; import org.junit.jupiter.api.extension.TestWatcher; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.github.javaparser.JavaParser; import com.github.javaparser.ParseResult; import com.github.javaparser.ast.CompilationUnit; import com.github.javaparser.ast.body.ClassOrInterfaceDeclaration; import com.github.javaparser.ast.type.ClassOrInterfaceType; import com.github.javaparser.ast.type.Type; import com.github.javaparser.resolution.declarations.ResolvedReferenceTypeDeclaration; import com.github.javaparser.resolution.types.ResolvedReferenceType; import com.github.javaparser.symbolsolver.JavaSymbolSolver; import com.github.javaparser.symbolsolver.resolution.typesolvers.CombinedTypeSolver; import com.github.javaparser.symbolsolver.resolution.typesolvers.JavaParserTypeSolver; import com.github.javaparser.symbolsolver.resolution.typesolvers.ReflectionTypeSolver; import java.io.File; import java.io.IOException; import java.nio.file.FileVisitOption; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; import static org.junit.jupiter.api.Assertions.fail; @ExtendWith(SerialVersionUIDCheckerTest.TestResultLogger.class) public class SerialVersionUIDCheckerTest { private static final Logger LOG = LoggerFactory.getLogger(SerialVersionUIDCheckerTest.class); private static final String JAVA_FILE_EXTENSION = ".java"; private static final String CONNECTOR_DIR = "seatunnel-connectors-v2"; private static final String JAVA_PATH_FRAGMENT = "src" + File.separator + "main" + File.separator + "java"; private static 
final JavaParser JAVA_PARSER; private static final Set checkedClasses = new HashSet<>(); private static final Map classDeclarationMap = new HashMap<>(); static { CombinedTypeSolver typeSolver = new CombinedTypeSolver(); typeSolver.add(new ReflectionTypeSolver()); setupTypeSolver(typeSolver); JavaSymbolSolver symbolSolver = new JavaSymbolSolver(typeSolver); JAVA_PARSER = new JavaParser(); JAVA_PARSER.getParserConfiguration().setSymbolResolver(symbolSolver); } private static void setupTypeSolver(CombinedTypeSolver typeSolver) { try (Stream paths = Files.walk(Paths.get(".."), FileVisitOption.FOLLOW_LINKS)) { paths.filter(path -> path.toString().contains("src/main/java")) .forEach( path -> { try { typeSolver.add(new JavaParserTypeSolver(path.toFile())); } catch (Exception e) { // ignore } }); } catch (IOException e) { LOG.error("Failed to setup type solver", e); } } @Test public void checkSerialVersionUID() { List missingSerialVersionUID = new ArrayList<>(); List connectorClassPaths = findConnectorClassPaths(); LOG.info("Found {} connector class files to check", connectorClassPaths.size()); // First, populate the classDeclarationMap with all classes for (Path path : connectorClassPaths) { populateClassDeclarationMap(path); } LOG.info("Populated class declaration map with {} classes", classDeclarationMap.size()); // Then check each class path for serialVersionUID for (Path path : connectorClassPaths) { checkClassPath(path, missingSerialVersionUID); } LOG.info("Check completed. 
Checked {} connector classes.", connectorClassPaths.size()); if (!missingSerialVersionUID.isEmpty()) { String errorMessage = generateErrorMessage(missingSerialVersionUID); LOG.error("Test failed: {}", errorMessage); fail(errorMessage); } LOG.info("All checked classes have correct serialVersionUID."); } private List findConnectorClassPaths() { try (Stream paths = Files.walk(Paths.get(".."), FileVisitOption.FOLLOW_LINKS)) { return paths.filter( path -> { String pathString = path.toString(); return pathString.endsWith(JAVA_FILE_EXTENSION) && pathString.contains(CONNECTOR_DIR) && pathString.contains(JAVA_PATH_FRAGMENT); }) .collect(Collectors.toList()); } catch (IOException e) { throw new RuntimeException("Failed to walk through connector directories", e); } } /** Populate the classDeclarationMap with all class declarations from the given path. */ private void populateClassDeclarationMap(Path path) { try { ParseResult parseResult = JAVA_PARSER.parse(Files.newInputStream(path)); parseResult .getResult() .ifPresent( compilationUnit -> { List classes = compilationUnit.findAll(ClassOrInterfaceDeclaration.class); for (ClassOrInterfaceDeclaration classDeclaration : classes) { String className = classDeclaration.getFullyQualifiedName().orElse(""); if (!className.isEmpty()) { classDeclarationMap.put(className, classDeclaration); } } }); } catch (IOException e) { LOG.warn("Could not parse file: {}", path, e); } } /** * Check the class path for classes that implement SeaTunnelSource or SeaTunnelSink and verify * they have serialVersionUID. 
*/ private void checkClassPath(Path path, List missingSerialVersionUID) { try { ParseResult parseResult = JAVA_PARSER.parse(Files.newInputStream(path)); parseResult .getResult() .ifPresent( compilationUnit -> { List classes = compilationUnit.findAll(ClassOrInterfaceDeclaration.class); for (ClassOrInterfaceDeclaration classDeclaration : classes) { if (implementsSeaTunnelSourceOrSink(classDeclaration)) { checkImplementedTypes( classDeclaration, missingSerialVersionUID); } } }); } catch (IOException e) { LOG.warn("Could not parse file: {}", path, e); } } private boolean implementsSeaTunnelSourceOrSink(ClassOrInterfaceDeclaration classDeclaration) { return classDeclaration.getImplementedTypes().stream() .anyMatch( type -> { String typeName = type.getNameAsString(); return typeName.equals("SeaTunnelSource") || typeName.equals("SeaTunnelSink"); }); } private void checkImplementedTypes( ClassOrInterfaceDeclaration classDeclaration, List missingSerialVersionUID) { classDeclaration .getImplementedTypes() .forEach( implementedType -> { implementedType .getTypeArguments() .ifPresent( typeArgs -> { for (Type typeArg : typeArgs) { if (typeArg.isClassOrInterfaceType()) { checkClassType( typeArg.asClassOrInterfaceType(), missingSerialVersionUID); } } }); }); } private void checkClassType( ClassOrInterfaceType classType, List missingSerialVersionUID) { try { ResolvedReferenceType resolvedType = classType.resolve().asReferenceType(); if (resolvedType == null) { return; } if (isSerializable(resolvedType)) { ResolvedReferenceTypeDeclaration typeDeclaration = resolvedType.getTypeDeclaration().orElse(null); if (typeDeclaration == null) { return; } String paramTypeName = typeDeclaration.getQualifiedName(); if (!checkedClasses.contains(paramTypeName)) { // Check if the class is abstract and return early if it is if (isAbstractClass(typeDeclaration)) { checkedClasses.add(paramTypeName); return; } if (!hasSerialVersionUID(typeDeclaration)) { missingSerialVersionUID.add(paramTypeName); 
LOG.warn("Class {} is missing serialVersionUID field", paramTypeName); } checkedClasses.add(paramTypeName); } } } catch (Exception e) { LOG.warn("Could not resolve type: {} in file: {}", classType.getNameAsString(), e); } } private boolean isSerializable(ResolvedReferenceType resolvedType) { return resolvedType.getQualifiedName().equals("java.io.Serializable") || resolvedType.getAllAncestors().stream() .anyMatch( ancestor -> ancestor.getQualifiedName().equals("java.io.Serializable")); } private boolean hasSerialVersionUID(ResolvedReferenceTypeDeclaration typeDeclaration) { return typeDeclaration.isInterface() || typeDeclaration.getAllFields().stream() .anyMatch(field -> field.getName().equals("serialVersionUID")); } private boolean isAbstractClass(ResolvedReferenceTypeDeclaration typeDeclaration) { // Only check classes, not interfaces if (!typeDeclaration.isClass()) { return false; } String className = typeDeclaration.getQualifiedName(); // First check if we have the class declaration in our map ClassOrInterfaceDeclaration classDeclaration = classDeclarationMap.get(className); if (classDeclaration != null) { // Directly check if the class is abstract using the declaration return classDeclaration.isAbstract(); } return false; } private String generateErrorMessage(List missingSerialVersionUID) { StringBuilder errorMessage = new StringBuilder(); errorMessage.append("=================================================================\n"); errorMessage.append( "Test failed: The following classes are missing serialVersionUID fields\n"); errorMessage.append("=================================================================\n"); errorMessage .append("A total of ") .append(missingSerialVersionUID.size()) .append(" Question:\n\n"); for (int i = 0; i < missingSerialVersionUID.size(); i++) { errorMessage .append(i + 1) .append(". 
") .append(missingSerialVersionUID.get(i)) .append("\n"); } errorMessage.append( "\n=================================================================\n"); errorMessage.append( "Please add a serialVersionUID field to the above class and make sure its value is not -1L, for example:\n"); errorMessage.append("private static final long serialVersionUID = 5967888460683065669L;\n"); errorMessage.append("=================================================================\n"); return errorMessage.toString(); } public static class TestResultLogger implements TestWatcher { @Override public void testSuccessful(ExtensionContext context) { LOG.info("Test successful: {}", context.getDisplayName()); } @Override public void testFailed(ExtensionContext context, Throwable cause) { LOG.error("Test failed: {}", context.getDisplayName(), cause); } } @AfterAll public static void cleanup() { checkedClasses.clear(); classDeclarationMap.clear(); } } ================================================ FILE: seatunnel-ci-tools/src/test/java/org/apache/seatunnel/api/SpotlessImportReplacementTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.CsvSource; import lombok.extern.slf4j.Slf4j; import java.util.regex.Pattern; @Slf4j public class SpotlessImportReplacementTest { // Regex patterns from pom.xml spotless configuration private static final String GUAVA_REGEX = "import\\s+(static\\s+)?com\\.google\\.common\\.([^;]+);(\\r\\n|\\r|\\n)"; private static final String GUAVA_REPLACEMENT = "import $1org.apache.seatunnel.shade.com.google.common.$2;$3"; private static final String JETTY_REGEX = "import\\s+(static\\s+)?org\\.eclipse\\.jetty\\.([^;]+);(\\r\\n|\\r|\\n)"; private static final String JETTY_REPLACEMENT = "import $1org.apache.seatunnel.shade.org.eclipse.jetty.$2;$3"; private static final String HIKARI_REGEX = "import\\s+(static\\s+)?com\\.zaxxer\\.hikari\\.([^;]+);(\\r\\n|\\r|\\n)"; private static final String HIKARI_REPLACEMENT = "import $1org.apache.seatunnel.shade.com.zaxxer.hikari.$2;$3"; private static final String JANINO_REGEX = "import\\s+(static\\s+)?org\\.codehaus\\.(janino|commons)\\.([^;]+);(\\r\\n|\\r|\\n)"; private static final String JANINO_REPLACEMENT = "import $1org.apache.seatunnel.shade.org.codehaus.$2.$3;$4"; @Test public void testGuavaImportReplacement() { Pattern pattern = Pattern.compile(GUAVA_REGEX); // Test regular import String input = "import com.google.common.collect.Lists;\n"; String expected = "import org.apache.seatunnel.shade.com.google.common.collect.Lists;\n"; String result = pattern.matcher(input).replaceAll(GUAVA_REPLACEMENT); Assertions.assertEquals(expected, result); // Test static import String staticInput = "import static com.google.common.base.Preconditions.checkNotNull;\n"; String staticExpected = "import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull;\n"; String staticResult = 
pattern.matcher(staticInput).replaceAll(GUAVA_REPLACEMENT); Assertions.assertEquals(staticExpected, staticResult); log.info("Guava import replacement test passed"); } @Test public void testJettyImportReplacement() { Pattern pattern = Pattern.compile(JETTY_REGEX); // Test regular import String input = "import org.eclipse.jetty.server.Server;\n"; String expected = "import org.apache.seatunnel.shade.org.eclipse.jetty.server.Server;\n"; String result = pattern.matcher(input).replaceAll(JETTY_REPLACEMENT); Assertions.assertEquals(expected, result); // Test static import String staticInput = "import static org.eclipse.jetty.http.HttpStatus.OK_200;\n"; String staticExpected = "import static org.apache.seatunnel.shade.org.eclipse.jetty.http.HttpStatus.OK_200;\n"; String staticResult = pattern.matcher(staticInput).replaceAll(JETTY_REPLACEMENT); Assertions.assertEquals(staticExpected, staticResult); log.info("Jetty import replacement test passed"); } @Test public void testHikariImportReplacement() { Pattern pattern = Pattern.compile(HIKARI_REGEX); // Test regular import String input = "import com.zaxxer.hikari.HikariDataSource;\n"; String expected = "import org.apache.seatunnel.shade.com.zaxxer.hikari.HikariDataSource;\n"; String result = pattern.matcher(input).replaceAll(HIKARI_REPLACEMENT); Assertions.assertEquals(expected, result); // Test static import String staticInput = "import static com.zaxxer.hikari.HikariConfig.MINIMUM_IDLE;\n"; String staticExpected = "import static org.apache.seatunnel.shade.com.zaxxer.hikari.HikariConfig.MINIMUM_IDLE;\n"; String staticResult = pattern.matcher(staticInput).replaceAll(HIKARI_REPLACEMENT); Assertions.assertEquals(staticExpected, staticResult); log.info("Hikari import replacement test passed"); } @Test public void testJaninoImportReplacement() { Pattern pattern = Pattern.compile(JANINO_REGEX); // Test janino import String janinoInput = "import org.codehaus.janino.ExpressionEvaluator;\n"; String janinoExpected = "import 
org.apache.seatunnel.shade.org.codehaus.janino.ExpressionEvaluator;\n"; String janinoResult = pattern.matcher(janinoInput).replaceAll(JANINO_REPLACEMENT); Assertions.assertEquals(janinoExpected, janinoResult); // Test commons import String commonsInput = "import org.codehaus.commons.compiler.CompileException;\n"; String commonsExpected = "import org.apache.seatunnel.shade.org.codehaus.commons.compiler.CompileException;\n"; String commonsResult = pattern.matcher(commonsInput).replaceAll(JANINO_REPLACEMENT); Assertions.assertEquals(commonsExpected, commonsResult); // Test static janino import String staticInput = "import static org.codehaus.janino.Scanner.KEYWORD;\n"; String staticExpected = "import static org.apache.seatunnel.shade.org.codehaus.janino.Scanner.KEYWORD;\n"; String staticResult = pattern.matcher(staticInput).replaceAll(JANINO_REPLACEMENT); Assertions.assertEquals(staticExpected, staticResult); log.info("Janino import replacement test passed"); } @ParameterizedTest @CsvSource({ "import com.google.common.collect.Lists;, import org.apache.seatunnel.shade.com.google.common.collect.Lists;", "import static com.google.common.base.Preconditions.checkNotNull;, import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull;", "import org.eclipse.jetty.server.Server;, import org.apache.seatunnel.shade.org.eclipse.jetty.server.Server;", "import static org.eclipse.jetty.http.HttpStatus.OK_200;, import static org.apache.seatunnel.shade.org.eclipse.jetty.http.HttpStatus.OK_200;", "import com.zaxxer.hikari.HikariDataSource;, import org.apache.seatunnel.shade.com.zaxxer.hikari.HikariDataSource;", "import static com.zaxxer.hikari.HikariConfig.MINIMUM_IDLE;, import static org.apache.seatunnel.shade.com.zaxxer.hikari.HikariConfig.MINIMUM_IDLE;", "import org.codehaus.janino.ExpressionEvaluator;, import org.apache.seatunnel.shade.org.codehaus.janino.ExpressionEvaluator;", "import org.codehaus.commons.compiler.CompileException;, import 
org.apache.seatunnel.shade.org.codehaus.commons.compiler.CompileException;" }) public void testAllImportReplacements(String input, String expected) { String result = input + "\n"; // Apply all replacement patterns result = Pattern.compile(GUAVA_REGEX).matcher(result).replaceAll(GUAVA_REPLACEMENT); result = Pattern.compile(JETTY_REGEX).matcher(result).replaceAll(JETTY_REPLACEMENT); result = Pattern.compile(HIKARI_REGEX).matcher(result).replaceAll(HIKARI_REPLACEMENT); result = Pattern.compile(JANINO_REGEX).matcher(result).replaceAll(JANINO_REPLACEMENT); // Remove trailing newline for comparison result = result.trim(); Assertions.assertEquals(expected, result); } @Test public void testNoReplacementForAlreadyShadedImports() { // Test that already shaded imports are not modified String[] shadedImports = { "import org.apache.seatunnel.shade.com.google.common.collect.Lists;", "import org.apache.seatunnel.shade.org.eclipse.jetty.server.Server;", "import org.apache.seatunnel.shade.com.zaxxer.hikari.HikariDataSource;", "import org.apache.seatunnel.shade.org.codehaus.janino.ExpressionEvaluator;" }; for (String shadedImport : shadedImports) { String input = shadedImport + "\n"; String result = input; // Apply all replacement patterns result = Pattern.compile(GUAVA_REGEX).matcher(result).replaceAll(GUAVA_REPLACEMENT); result = Pattern.compile(JETTY_REGEX).matcher(result).replaceAll(JETTY_REPLACEMENT); result = Pattern.compile(HIKARI_REGEX).matcher(result).replaceAll(HIKARI_REPLACEMENT); result = Pattern.compile(JANINO_REGEX).matcher(result).replaceAll(JANINO_REPLACEMENT); Assertions.assertEquals( input, result, "Already shaded import should not be modified: " + shadedImport); } log.info("No replacement for already shaded imports test passed"); } } ================================================ FILE: seatunnel-ci-tools/src/test/java/org/apache/seatunnel/api/UTClassNameCheckTest.java ================================================ /* * Licensed to the Apache Software 
Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import com.github.javaparser.JavaParser; import com.github.javaparser.ParseResult; import com.github.javaparser.ast.CompilationUnit; import com.github.javaparser.ast.ImportDeclaration; import com.github.javaparser.ast.NodeList; import lombok.extern.slf4j.Slf4j; import java.io.IOException; import java.nio.file.FileVisitOption; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.List; import java.util.Objects; import java.util.stream.Collectors; import java.util.stream.Stream; import static org.apache.seatunnel.api.ImportClassCheckTest.isWindows; @Slf4j public class UTClassNameCheckTest { private final JavaParser JAVA_PARSER = new JavaParser(); @Test public void checkUTClassName() { String testPathFragment = isWindows ? 
"src\\test\\java" : "src/test/java"; try (Stream paths = Files.walk(Paths.get(".."), FileVisitOption.FOLLOW_LINKS)) { List collect = paths.filter( path -> { String pathString = path.toString(); return pathString.endsWith(".java") && !pathString.contains("e2e") && pathString.contains(testPathFragment); }) .map( path -> { try { ParseResult parseResult = JAVA_PARSER.parse(Files.newInputStream(path)); return parseResult .getResult() .map( compilationUnit -> { NodeList imports = compilationUnit .getImports(); return imports.stream() .anyMatch( i -> "org.junit.jupiter.api.Test" .equals( i.getName() .asString())) ? path : null; }) .orElse(null); } catch (Exception e) { log.error("Error parsing file: {}", path, e); return null; } }) .filter(Objects::nonNull) .filter( path -> { String fileName = path.getFileName().toString(); int dotIndex = fileName.lastIndexOf('.'); String className = dotIndex == -1 ? fileName : fileName.substring(0, dotIndex); return !(className.startsWith("Test") || className.endsWith("Test") || className.endsWith("Tests") || className.endsWith("TestCase")); }) .map(Path::toAbsolutePath) .map(Path::toString) .collect(Collectors.toList()); Assertions.assertEquals( 0, collect.size(), () -> "UT class does not conform to the naming convention, " + "must should be start with 'Test' or end with 'Test' " + "or end with 'Tests' or end with 'TestCase'.\n " + String.join("\n", collect)); } catch (IOException e) { throw new RuntimeException(e); } } } ================================================ FILE: seatunnel-ci-tools/src/test/java/org/apache/seatunnel/api/file/AllFileSpecificationCheckTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.api.file; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.DisabledOnOs; import org.junit.jupiter.api.condition.OS; import lombok.extern.slf4j.Slf4j; import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.FileVisitOption; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.stream.Stream; @Slf4j @DisabledOnOs(OS.WINDOWS) public class AllFileSpecificationCheckTest { private static Map> fileContents; @BeforeAll public static void beforeAll() throws IOException { List fileTypesCanNotRead = Arrays.asList( "parquet", "orc", "xlsx", "xls", "png", "jar", "lzo", "zip", "ico", "jks"); List fileCanNotRead = Arrays.asList( "seatunnel-connectors-v2/connector-file/connector-file-base/src/test/resources/encoding/gbk.json", "seatunnel-connectors-v2/connector-file/connector-file-base/src/test/resources/encoding/gbk.xml", "seatunnel-connectors-v2/connector-file/connector-file-base/src/test/resources/encoding/gbk_use_attr_format.xml", "seatunnel-connectors-v2/connector-file/connector-file-base/src/test/resources/encoding/gbk.txt", 
"seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/json/e2e_gbk.json", "seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/e2e_gbk.txt"); fileContents = new LinkedHashMap<>(); try (Stream paths = Files.walk(Paths.get(".."), FileVisitOption.FOLLOW_LINKS)) { paths.filter(path -> path.toFile().isFile()) .filter(path -> !path.toFile().getName().startsWith(".")) .filter( path -> !fileTypesCanNotRead.contains( path.toFile() .getName() .substring( path.toFile().getName().lastIndexOf(".") + 1))) .filter(path -> !fileCanNotRead.contains(path.toString().substring(3))) .filter( path -> !path.toString() .contains(File.separator + "target" + File.separator)) .filter( path -> !path.toString() .contains( File.separator + "node_modules" + File.separator)) .filter( path -> !path.toString() .contains(File.separator + "node" + File.separator)) .filter(path -> !path.toString().contains(File.separator + ".")) .forEach( path -> { try { fileContents.put( path.toString().substring(3), Files.readAllLines(path, StandardCharsets.UTF_8)); } catch (IOException e) { log.error("Failed to read file: {}", path, e); throw new RuntimeException(e); } }); } } @Test public void testFileNotContainsSourceTableNameAndResultTableName() { List whiteList = Arrays.asList( "seatunnel-dist/src/test/java/org/apache/seatunnel/api/file/AllFileSpecificationCheckTest.java", "docs/zh/connectors/common-options/source-common-options.md", "docs/zh/connectors/common-options/sink-common-options.md", "docs/zh/transforms/common-options/common-options.md", "docs/zh/introduction/concepts/config.md", "docs/en/connectors/common-options/source-common-options.md", "docs/en/connectors/common-options/sink-common-options.md", "docs/en/transforms/common-options/common-options.md", "docs/en/introduction/concepts/config.md", "seatunnel-api/src/main/java/org/apache/seatunnel/api/options/ConnectorCommonOptions.java", 
"seatunnel-e2e/seatunnel-connector-v2-e2e/connector-fake-e2e/src/test/resources/fake_to_assert_with_compatible_source_and_result_table_name.conf", "seatunnel-e2e/seatunnel-connector-v2-e2e/connector-fake-e2e/src/test/java/org/apache/seatunnel/e2e/connector/fake/FakeIT.java", "seatunnel-ci-tools/src/test/java/org/apache/seatunnel/api/file/AllFileSpecificationCheckTest.java"); fileContents.forEach( (path, lines) -> { if (path.contains("/changelog/")) { return; } if (whiteList.contains(path.trim())) { return; } for (int i = 0; i < lines.size(); i++) { String line = lines.get(i); if (line.contains("source_table_name") || line.contains("result_table_name")) { throw new RuntimeException( String.format( "File %s Line %d [%s] contains `source_table_name` or `result_table_name`, please use `plugin_input` and `plugin_output` instead.", path, i + 1, line)); } } }); } } ================================================ FILE: seatunnel-ci-tools/src/test/java/org/apache/seatunnel/api/file/MarkdownTest.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.api.file; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.DisabledOnOs; import org.junit.jupiter.api.condition.OS; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.Stream; public class MarkdownTest { private static final List docsDirectories = new ArrayList<>(); private static final List connectorsDirectories = new ArrayList<>(); @BeforeAll public static void setup() { docsDirectories.add(Paths.get("..", "docs", "en")); docsDirectories.add(Paths.get("..", "docs", "zh")); connectorsDirectories.add(Paths.get("..", "docs", "en", "connectors", "source")); connectorsDirectories.add(Paths.get("..", "docs", "en", "connectors", "sink")); connectorsDirectories.add(Paths.get("..", "docs", "zh", "connectors", "source")); connectorsDirectories.add(Paths.get("..", "docs", "zh", "connectors", "sink")); } @Test @DisabledOnOs(OS.WINDOWS) public void testChineseDocFileNameContainsInEnglishVersionDoc() { // Verify that the file names in the English and Chinese directories are the same. 
List enFileName = fileName(docsDirectories.get(0)).stream() .map(path -> path.replace("/en/", "/")) .collect(Collectors.toList()); List zhFileName = fileName(docsDirectories.get(1)).stream() .map(path -> path.replace("/zh/", "/")) .collect(Collectors.toList()); // Find Chinese files that don't have English counterparts List missingEnglishFiles = zhFileName.stream() .filter(zhFile -> !enFileName.contains(zhFile)) .collect(Collectors.toList()); // If there are files missing English versions, throw an exception if (!missingEnglishFiles.isEmpty()) { StringBuilder errorMessage = new StringBuilder(); errorMessage.append( String.format( "Found %d Chinese files without English versions:\n", missingEnglishFiles.size())); missingEnglishFiles.forEach( file -> errorMessage.append( String.format("Missing English version for: %s\n", file))); throw new AssertionError(errorMessage.toString()); } } private List fileName(Path docDirectory) { try (Stream paths = Files.walk(docDirectory)) { return paths.filter(Files::isRegularFile) .filter(path -> path.toString().endsWith(".md")) .map(Path::toString) .collect(Collectors.toList()); } catch (IOException e) { throw new RuntimeException(e); } } @Test public void testPrimaryHeadersHaveNoTextAbove() { docsDirectories.forEach( docsDirectory -> { try (Stream paths = Files.walk(docsDirectory)) { List mdFiles = paths.filter(Files::isRegularFile) .filter(path -> !path.getParent().endsWith("changelog")) .filter(path -> path.toString().endsWith(".md")) .collect(Collectors.toList()); for (Path mdPath : mdFiles) { List lines = Files.readAllLines(mdPath, StandardCharsets.UTF_8); String firstRelevantLine = null; int lineNumber = 0; boolean inFrontMatter = false; for (int i = 0; i < lines.size(); i++) { String line = lines.get(i).trim(); lineNumber = i + 1; if (i == 0 && line.equals("---")) { inFrontMatter = true; continue; } if (inFrontMatter) { if (line.equals("---")) { inFrontMatter = false; } continue; } if (line.isEmpty()) { continue; } if 
(line.startsWith("import ")) { continue; } firstRelevantLine = line; break; } if (firstRelevantLine == null) { Assertions.fail( String.format( "The file %s is empty and has no content.", mdPath)); } if (!firstRelevantLine.startsWith("# ")) { Assertions.fail( String.format( "The first line of the file %s is not a first level heading. First line content: “%s” (line number: %d)", mdPath, firstRelevantLine, lineNumber)); } } } catch (IOException e) { throw new RuntimeException(e); } }); } @Test public void testAllHeaderNotEndWithSymbol() { connectorsDirectories.forEach( docsDirectory -> { try (Stream paths = Files.walk(docsDirectory)) { List mdFiles = paths.filter(Files::isRegularFile) .filter(path -> path.toString().endsWith(".md")) .collect(Collectors.toList()); for (Path mdPath : mdFiles) { List lines = Files.readAllLines(mdPath, StandardCharsets.UTF_8); for (String line : lines) { String trimmedLine = line.trim(); if (trimmedLine.startsWith("#")) { if (trimmedLine.endsWith(":") || trimmedLine.endsWith(":")) { Assertions.fail( String.format( "The header in the file %s ends with a symbol. 
Header content: “%s”", mdPath, trimmedLine)); } } } } } catch (IOException e) { throw new RuntimeException(e); } }); } @Test public void testConnectorDocWithChangeLogFlagAndFile() { Pattern importPattern = Pattern.compile("import ChangeLog from '../changelog/(connector-.*).md';"); connectorsDirectories.forEach( docsDirectory -> { try (Stream paths = Files.walk(docsDirectory)) { List mdFiles = paths.filter(Files::isRegularFile) .filter(path -> path.toString().endsWith(".md")) .collect(Collectors.toList()); for (Path mdPath : mdFiles) { List lines = Files.readAllLines(mdPath, StandardCharsets.UTF_8); String line = lines.get(0); Assertions.assertTrue( line.startsWith("import ChangeLog from '../changelog/"), "The first line of the file " + mdPath + " is not a change log import."); Matcher matcher = importPattern.matcher(line); Assertions.assertTrue( matcher.matches(), "The first line of the file " + mdPath + " is not a change log import."); String connector = matcher.group(1); if (docsDirectory.getParent().getParent().endsWith("en")) { Assertions.assertTrue( Files.exists( Paths.get( "..", "docs", "en", "connectors", "changelog", connector + ".md")), "The change log file for " + connector + " does not exist, please check " + mdPath); } else { Assertions.assertTrue( Files.exists( Paths.get( "..", "docs", "zh", "connectors", "changelog", connector + ".md")), "The change log file for " + connector + " does not exist, please check " + mdPath); } String file = String.join("\n", lines); Assertions.assertTrue( file.trim().endsWith(""), "The file " + mdPath + " does not end with ."); } } catch (IOException e) { throw new RuntimeException(e); } }); } } ================================================ FILE: seatunnel-common/pom.xml ================================================ 4.0.0 org.apache.seatunnel seatunnel ${revision} seatunnel-common SeaTunnel : Common org.apache.seatunnel seatunnel-config-shade org.apache.seatunnel seatunnel-commons-lang3 ${project.version} optional 
org.apache.commons commons-collections4 org.apache.commons commons-csv org.apache.seatunnel seatunnel-guava ${project.version} optional org.apache.seatunnel seatunnel-jackson ${project.version} optional org.apache.seatunnel seatunnel-arrow ${project.version} optional commons-codec commons-codec ================================================ FILE: seatunnel-common/src/main/java/org/apache/seatunnel/common/Constants.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
/** String constants shared across modules; not instantiable. */
public final class Constants {
    public static final String LOGO = "SeaTunnel";
    public static final String ENV = "env";
    public static final String SOURCE = "source";
    public static final String TRANSFORM = "transform";
    public static final String SINK = "sink";
    public static final String SOURCE_SERIALIZATION = "source.serialization";
    public static final String SINK_SERIALIZATION = "sink.serialization";
    public static final String HDFS_ROOT = "hdfs.root";
    public static final String HDFS_USER = "hdfs.user";
    public static final String CHECKPOINT_ID = "checkpoint.id";
    public static final String UUID = "uuid";
    public static final String NOW = "now";
    // Multi-line ASCII-art banner. The spacing inside these literals is significant, so they
    // must not be re-wrapped or re-indented.
    public static final String ST_LOGO = " \n" + " _____ _____ _ \n" + "/ ___| |_ _| | |\n" + "\\ `--. ___ __ _ | | _ _ _ __ _ __ ___ | |\n" + " `--. \\ / _ \\ / _` | | | | | | || '_ \\ | '_ \\ / _ \\| |\n" + "/\\__/ /| __/| (_| | | | | |_| || | | || | | || __/| |\n" + "\\____/ \\___| \\__,_| \\_/ \\__,_||_| |_||_| |_| \\___||_|\n" + " \n";
    // NOTE(review): the year range is hard-coded in the text - confirm it is still current.
    public static final String COPYRIGHT_LINE = "Copyright © 2021-2024 The Apache Software Foundation. Apache SeaTunnel, SeaTunnel, and its feather logo are trademarks of The Apache Software Foundation.";

    // Constants holder: no instances.
    private Constants() {}
}
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.common; import java.io.Closeable; import java.util.Optional; import java.util.concurrent.LinkedBlockingQueue; import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull; public final class Handover implements Closeable { private static final int DEFAULT_QUEUE_SIZE = 10000; private final Object lock = new Object(); private final LinkedBlockingQueue blockingQueue = new LinkedBlockingQueue<>(DEFAULT_QUEUE_SIZE); private Throwable error; public boolean isEmpty() throws Exception { if (error != null) { rethrowException(error, error.getMessage()); } return blockingQueue.isEmpty(); } public Optional pollNext() throws Exception { if (error != null) { rethrowException(error, error.getMessage()); } else if (!isEmpty()) { return Optional.ofNullable(blockingQueue.poll()); } return Optional.empty(); } public void produce(final T element) throws InterruptedException, ClosedException { if (error != null) { throw new ClosedException(); } blockingQueue.put(element); } public void reportError(Throwable t) { checkNotNull(t); synchronized (lock) { // do not override the initial exception if (error == null) { error = t; } lock.notifyAll(); } } @Override public void close() { synchronized (lock) { if (error == null) { error = new ClosedException(); } lock.notifyAll(); } } public static void rethrowException(Throwable t, String parentMessage) throws Exception { if (t instanceof Error) { throw (Error) t; } else if (t instanceof Exception) { throw (Exception) t; } else { throw new Exception(parentMessage, t); } } public static 
final class ClosedException extends Exception { private static final long serialVersionUID = 1L; } } ================================================ FILE: seatunnel-common/src/main/java/org/apache/seatunnel/common/config/CheckConfigUtil.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.common.config; import org.apache.seatunnel.shade.com.typesafe.config.Config; import java.util.Arrays; import java.util.LinkedList; import java.util.List; import java.util.stream.Collectors; public final class CheckConfigUtil { private CheckConfigUtil() {} /** please using {@link #checkAllExists} instead, since 2.0.5 */ @Deprecated public static CheckResult check(Config config, String... params) { return checkAllExists(config, params); } public static CheckResult checkAllExists(Config config, String... 
params) { List missingParams = Arrays.stream(params) .filter(param -> !isValidParam(config, param)) .collect(Collectors.toList()); if (!missingParams.isEmpty()) { String errorMsg = String.format( "please specify [%s] as non-empty", String.join(",", missingParams)); return CheckResult.error(errorMsg); } else { return CheckResult.success(); } } /** check config if there was at least one usable */ public static CheckResult checkAtLeastOneExists(Config config, String... params) { if (params.length == 0) { return CheckResult.success(); } List missingParams = new LinkedList<>(); for (String param : params) { if (!isValidParam(config, param)) { missingParams.add(param); } } if (missingParams.size() == params.length) { String errorMsg = String.format( "please specify at least one config of [%s] as non-empty", String.join(",", missingParams)); return CheckResult.error(errorMsg); } else { return CheckResult.success(); } } public static boolean isValidParam(Config config, String param) { boolean isValidParam = true; if (!config.hasPath(param)) { isValidParam = false; } else if (config.getAnyRef(param) instanceof List) { isValidParam = !((List) config.getAnyRef(param)).isEmpty(); } return isValidParam; } /** merge all check result */ public static CheckResult mergeCheckResults(CheckResult... checkResults) { List notPassConfig = Arrays.stream(checkResults) .filter(item -> !item.isSuccess()) .collect(Collectors.toList()); if (notPassConfig.isEmpty()) { return CheckResult.success(); } else { String errMessage = notPassConfig.stream() .map(CheckResult::getMsg) .collect(Collectors.joining(",")); return CheckResult.error(errMessage); } } } ================================================ FILE: seatunnel-common/src/main/java/org/apache/seatunnel/common/config/CheckResult.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.common.config; import lombok.Data; @Data public class CheckResult { private static final CheckResult SUCCESS = new CheckResult(true, ""); private boolean success; private String msg; private CheckResult(boolean success, String msg) { this.success = success; this.msg = msg; } /** @return a successful instance of CheckResult */ public static CheckResult success() { return SUCCESS; } /** * @param msg the error message * @return an error instance of CheckResult */ public static CheckResult error(String msg) { return new CheckResult(false, msg); } } ================================================ FILE: seatunnel-common/src/main/java/org/apache/seatunnel/common/config/Common.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.common.config; import org.apache.seatunnel.shade.com.google.common.annotations.VisibleForTesting; import org.apache.seatunnel.shade.org.apache.commons.lang3.StringUtils; import java.io.File; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; import static java.nio.file.FileVisitOption.FOLLOW_LINKS; public class Common { private static final String FLINK_YARN_APPLICATION_PATH = "runtime.tar.gz"; private Common() { throw new IllegalStateException("Utility class"); } /** Used to set the size when create a new collection(just to pass the checkstyle). */ public static final int COLLECTION_SIZE = 16; private static final int APP_LIB_DIR_DEPTH = 2; private static final int PLUGIN_LIB_DIR_DEPTH = 3; private static DeployMode MODE = DeployMode.CLIENT; private static String SEATUNNEL_HOME; private static boolean STARTER = false; /** Set mode. 
return false in case of failure */
    public static void setDeployMode(DeployMode mode) {
        MODE = mode;
    }

    /** Stores the starter flag; {@link #appRootDir()} consults it when resolving the root dir. */
    public static void setStarter(boolean inStarter) {
        STARTER = inStarter;
    }

    /** Returns the deploy mode last stored through {@link #setDeployMode(DeployMode)}. */
    public static DeployMode getDeployMode() {
        return MODE;
    }

    /**
     * Resolves the SeaTunnel home directory and caches the result.
     *
     * <p>Lookup order: the cached value, the {@code SEATUNNEL_HOME} system property, the {@code
     * SEATUNNEL_HOME} environment variable, and finally {@link #appRootDir()}.
     *
     * @return the resolved home directory as a string
     */
    public static String getSeaTunnelHome() {
        if (StringUtils.isNotEmpty(SEATUNNEL_HOME)) {
            return SEATUNNEL_HOME;
        }
        String seatunnelHome = System.getProperty("SEATUNNEL_HOME");
        if (StringUtils.isBlank(seatunnelHome)) {
            seatunnelHome = System.getenv("SEATUNNEL_HOME");
        }
        if (StringUtils.isBlank(seatunnelHome)) {
            seatunnelHome = appRootDir().toString();
        }
        SEATUNNEL_HOME = seatunnelHome;
        return SEATUNNEL_HOME;
    }

    /** Overrides the cached home directory; intended for tests only. */
    @VisibleForTesting
    public static void setSeaTunnelHome(String seatunnelHome) {
        SEATUNNEL_HOME = seatunnelHome;
    }

    /**
     * Root dir varies between different spark master and deploy mode, it also varies between
     * relative and absolute path. When running seatunnel in --master local, you can put plugins
     * related files in $project_dir/plugins, then these files will be automatically copied to
     * $project_dir/seatunnel-core/target and take effect if you start seatunnel in IDE tools
     * such as IDEA. When running seatunnel in --master yarn or --master mesos, you can put plugins
     * related files in plugins dir.
     *
     * @throws IllegalStateException if the current deploy mode is none of the handled modes
     */
    public static Path appRootDir() {
        if (DeployMode.CLIENT == MODE || DeployMode.RUN == MODE || STARTER) {
            try {
                String path =
                        Common.class
                                .getProtectionDomain()
                                .getCodeSource()
                                .getLocation()
                                .toURI()
                                .getPath();
                // Round-trip through File so the path uses the platform's separators.
                path = new File(path).getPath();
                // The root is two levels above the code source location.
                return Paths.get(path).getParent().getParent();
            } catch (URISyntaxException e) {
                throw new RuntimeException(e);
            }
        } else if (DeployMode.CLUSTER == MODE) {
            return Paths.get("");
        } else if (DeployMode.RUN_APPLICATION == MODE) {
            return Paths.get(FLINK_YARN_APPLICATION_PATH);
        } else {
            throw new IllegalStateException("deploy mode not support : " + MODE);
        }
    }

    /** Returns the {@code starter} directory below {@link #appRootDir()}. */
    public static Path appStarterDir() {
        return appRootDir().resolve("starter");
    }

    /** Plugin Root Dir */
    public static Path pluginRootDir() {
        return Paths.get(getSeaTunnelHome(), "plugins");
    }

    /** Plugin Connector Dir */
    public static Path connectorDir() {
        return Paths.get(getSeaTunnelHome(), "connectors");
    }

    /** lib Dir */
    public static Path libDir() {
        return Paths.get(getSeaTunnelHome(), "lib");
    }

    /**
     * return lib jars, which located in 'lib/*' or 'lib/{dir}/*'.
     *
     * @return the jar files found, or an empty list when the lib dir is missing
     */
    public static List<Path> getLibJars() {
        Path libRootDir = Common.libDir();
        if (!Files.exists(libRootDir) || !Files.isDirectory(libRootDir)) {
            return Collections.emptyList();
        }
        try (Stream<Path> stream = Files.walk(libRootDir, APP_LIB_DIR_DEPTH, FOLLOW_LINKS)) {
            return stream.filter(it -> !it.toFile().isDirectory())
                    .filter(it -> it.getFileName().toString().endsWith(".jar"))
                    .collect(Collectors.toList());
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * return the jar package configured in env jars
     *
     * @param paths entries separated by {@code ;}; empty entries and entries that do not end
     *     with {@code .jar} are ignored, every remaining entry is parsed with {@link
     *     URI#create(String)}
     */
    public static Set<Path> getThirdPartyJars(String paths) {
        return Arrays.stream(paths.split(";"))
                .filter(s -> !"".equals(s))
                .filter(it -> it.endsWith(".jar"))
                .map(path -> Paths.get(URI.create(path)))
                .collect(Collectors.toSet());
    }

    /** Returns the {@code plugins.tar.gz} path below {@link #appRootDir()}. */
    public static Path pluginTarball() {
        return appRootDir().resolve("plugins.tar.gz");
    }

    /**
     * return plugin's dependent jars, which located in 'plugins/${pluginName}/lib/*'.
     *
     * <p>Jars whose plugin directory name begins with {@code connector-} are left out.
     *
     * @return the matching jar files, or an empty list when the plugin root dir is missing
     */
    public static List<Path> getPluginsJarDependenciesWithoutConnectorDependency() {
        Path pluginRootDir = Common.pluginRootDir();
        if (!Files.exists(pluginRootDir) || !Files.isDirectory(pluginRootDir)) {
            return Collections.emptyList();
        }
        try (Stream<Path> stream = Files.walk(pluginRootDir, PLUGIN_LIB_DIR_DEPTH, FOLLOW_LINKS)) {
            return stream.filter(
                            it ->
                                    pluginRootDir.relativize(it).getNameCount()
                                            == PLUGIN_LIB_DIR_DEPTH)
                    // Compare the directory NAME as a string: Path.startsWith(String) matches
                    // whole path elements only, so it never matched "connector-xyz".
                    .filter(
                            it ->
                                    !it.getParent()
                                            .getParent()
                                            .getFileName()
                                            .toString()
                                            .startsWith("connector-"))
                    .filter(it -> it.getParent().endsWith("lib"))
                    .filter(it -> it.getFileName().toString().endsWith(".jar"))
                    .collect(Collectors.toList());
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}

================================================
FILE: seatunnel-common/src/main/java/org/apache/seatunnel/common/config/ConfigRuntimeException.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/

package org.apache.seatunnel.common.config;

/**
 * Unchecked exception type of the {@code org.apache.seatunnel.common.config} package. It adds no
 * state of its own; every constructor forwards to the matching {@link RuntimeException}
 * constructor.
 */
public class ConfigRuntimeException extends RuntimeException {

    /** Creates an exception without message or cause. */
    public ConfigRuntimeException() {
        super();
    }

    /**
     * Creates an exception with a message.
     *
     * @param message the detail message
     */
    public ConfigRuntimeException(String message) {
        super(message);
    }

    /**
     * Creates an exception with a message and a cause.
     *
     * @param message the detail message
     * @param cause the underlying cause
     */
    public ConfigRuntimeException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception that wraps a cause.
     *
     * @param cause the underlying cause
     */
    public ConfigRuntimeException(Throwable cause) {
        super(cause);
    }
}

================================================
FILE: seatunnel-common/src/main/java/org/apache/seatunnel/common/config/DeployMode.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
*/ package org.apache.seatunnel.common.config; import java.util.Arrays; import java.util.Map; import java.util.Optional; import java.util.function.Function; import java.util.stream.Collectors; public enum DeployMode { /** Spark */ CLIENT("client"), CLUSTER("cluster"), /** Flink */ RUN("run"), RUN_APPLICATION("run-application"); private final String deployMode; DeployMode(String deployMode) { this.deployMode = deployMode; } public String getDeployMode() { return deployMode; } private static final Map NAME_MAP = Arrays.stream(DeployMode.values()) .collect(Collectors.toMap(DeployMode::getDeployMode, Function.identity())); public static Optional from(String deployMode) { return Optional.ofNullable(NAME_MAP.get(deployMode.toLowerCase())); } } ================================================ FILE: seatunnel-common/src/main/java/org/apache/seatunnel/common/config/TypesafeConfigUtils.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.seatunnel.common.config; import org.apache.seatunnel.shade.com.typesafe.config.Config; import org.apache.seatunnel.shade.com.typesafe.config.ConfigValue; import lombok.NonNull; import java.util.HashMap; import java.util.List; import java.util.Map; public final class TypesafeConfigUtils { private TypesafeConfigUtils() {} /** * Check if config with specific prefix exists * * @param source config source * @param prefix config prefix * @return true if it has sub config */ public static boolean hasSubConfig(Config source, String prefix) { boolean hasConfig = false; for (Map.Entry entry : source.entrySet()) { final String key = entry.getKey(); if (key.startsWith(prefix)) { hasConfig = true; break; } } return hasConfig; } @SuppressWarnings("unchecked") public static T getConfig( final Config config, final String configKey, final T defaultValue) { if (!config.hasPath(configKey) && defaultValue == null) { return defaultValue; } if (defaultValue.getClass().equals(Long.class)) { return config.hasPath(configKey) ? (T) Long.valueOf(config.getString(configKey)) : defaultValue; } if (defaultValue.getClass().equals(Integer.class)) { return config.hasPath(configKey) ? (T) Integer.valueOf(config.getString(configKey)) : defaultValue; } if (defaultValue.getClass().equals(String.class)) { return config.hasPath(configKey) ? (T) config.getString(configKey) : defaultValue; } if (defaultValue.getClass().equals(Boolean.class)) { return config.hasPath(configKey) ? (T) Boolean.valueOf(config.getString(configKey)) : defaultValue; } if (defaultValue instanceof Map || defaultValue instanceof List) { return config.hasPath(configKey) ? (T) config.getAnyRef(configKey) : defaultValue; } throw new RuntimeException("Unsupported config type, configKey: " + configKey); } public static List getConfigList( Config config, String configKey, @NonNull List defaultValue) { return config.hasPath(configKey) ? 
config.getConfigList(configKey) : defaultValue; } public static Map configToMap(Config config) { Map configMap = new HashMap<>(); config.entrySet() .forEach( entry -> { configMap.put(entry.getKey(), entry.getValue().unwrapped().toString()); }); return configMap; } } ================================================ FILE: seatunnel-common/src/main/java/org/apache/seatunnel/common/constants/CollectionConstants.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.common.constants; public class CollectionConstants { public static final String PLUGIN_NAME = "plugin_name"; public static final String SEATUNNEL_PLUGIN = "seatunnel"; public static final String SOURCE_PLUGIN = "source"; public static final String TRANSFORM_PLUGIN = "transform"; public static final String SINK_PLUGIN = "sink"; } ================================================ FILE: seatunnel-common/src/main/java/org/apache/seatunnel/common/constants/EngineType.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.common.constants;

/**
 * Engine type enum. Each constant carries an engine name, the file name of its starter jar and
 * the file name of its starter shell script.
 */
public enum EngineType {
    SPARK2("spark", "seatunnel-spark-2-starter.jar", "start-seatunnel-spark-2-connector-v2.sh"),
    SPARK3("spark", "seatunnel-spark-3-starter.jar", "start-seatunnel-spark-3-connector-v2.sh"),
    FLINK13("flink", "seatunnel-flink-13-starter.jar", "start-seatunnel-flink-13-connector-v2.sh"),
    FLINK15("flink", "seatunnel-flink-15-starter.jar", "start-seatunnel-flink-15-connector-v2.sh"),
    FLINK20("flink", "seatunnel-flink-20-starter.jar", "start-seatunnel-flink-20-connector-v2.sh"),
    SEATUNNEL("seatunnel", "seatunnel-starter.jar", "seatunnel.sh");

    // Engine name; shared by all versions of the same engine (e.g. "spark", "flink").
    private final String engine;
    // File name of the starter jar for this engine version.
    private final String starterJarName;
    // File name of the shell script that starts this engine version.
    private final String starterShellName;

    EngineType(String engine, String starterJarName, String starterShellName) {
        this.engine = engine;
        this.starterJarName = starterJarName;
        this.starterShellName = starterShellName;
    }

    /** Returns the engine name. */
    public String getEngine() {
        return engine;
    }

    /** Returns the starter jar file name. */
    public String getStarterJarName() {
        return starterJarName;
    }

    /** Returns the starter shell script file name. */
    public String getStarterShellName() {
        return starterShellName;
    }
}

================================================
FILE: seatunnel-common/src/main/java/org/apache/seatunnel/common/constants/JobMode.java
================================================
/*
 * Licensed to the Apache Software
Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.common.constants;

/** The mode a job runs in: batch or streaming. */
public enum JobMode {
    BATCH,
    STREAMING
}

================================================
FILE: seatunnel-common/src/main/java/org/apache/seatunnel/common/constants/MetaLakeType.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.common.constants;

/** The type of meta lake.
*/
public enum MetaLakeType {
    GRAVITINO("gravitino");

    // Lower-case type name of the constant.
    private final String type;

    MetaLakeType(String type) {
        this.type = type;
    }

    /** Returns the lower-case type name. */
    public String getType() {
        return type;
    }
}

================================================
FILE: seatunnel-common/src/main/java/org/apache/seatunnel/common/constants/PluginType.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.seatunnel.common.constants;

/** The type of SeaTunnel plugin. */
public enum PluginType {
    SOURCE("source"),
    TRANSFORM("transform"),
    SINK("sink");

    // Lower-case type name of the constant.
    private final String type;

    PluginType(String type) {
        this.type = type;
    }

    /** Returns the lower-case type name. */
    public String getType() {
        return type;
    }
}

================================================
FILE: seatunnel-common/src/main/java/org/apache/seatunnel/common/exception/CommonError.java
================================================
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.common.exception; import org.apache.seatunnel.shade.com.fasterxml.jackson.core.JsonProcessingException; import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.ObjectMapper; import org.apache.seatunnel.common.constants.PluginType; import org.apache.commons.collections4.map.SingletonMap; import java.util.HashMap; import java.util.Map; import static org.apache.seatunnel.common.exception.CommonErrorCode.CLOSE_FAILED; import static org.apache.seatunnel.common.exception.CommonErrorCode.CONVERT_TO_CONNECTOR_TYPE_ERROR; import static org.apache.seatunnel.common.exception.CommonErrorCode.CONVERT_TO_CONNECTOR_TYPE_ERROR_SIMPLE; import static org.apache.seatunnel.common.exception.CommonErrorCode.CONVERT_TO_SEATUNNEL_PROPS_BLANK_ERROR; import static org.apache.seatunnel.common.exception.CommonErrorCode.CONVERT_TO_SEATUNNEL_TYPE_ERROR; import static org.apache.seatunnel.common.exception.CommonErrorCode.CONVERT_TO_SEATUNNEL_TYPE_ERROR_SIMPLE; import static org.apache.seatunnel.common.exception.CommonErrorCode.FILE_NOT_EXISTED; import static org.apache.seatunnel.common.exception.CommonErrorCode.FILE_OPERATION_FAILED; import static org.apache.seatunnel.common.exception.CommonErrorCode.GET_CATALOG_TABLES_WITH_UNSUPPORTED_TYPE_ERROR; import static org.apache.seatunnel.common.exception.CommonErrorCode.GET_CATALOG_TABLE_WITH_UNSUPPORTED_TYPE_ERROR; import static org.apache.seatunnel.common.exception.CommonErrorCode.ILLEGAL_ARGUMENT; import static org.apache.seatunnel.common.exception.CommonErrorCode.JSON_OPERATION_FAILED; import static 
org.apache.seatunnel.common.exception.CommonErrorCode.OPERATION_NOT_SUPPORTED; import static org.apache.seatunnel.common.exception.CommonErrorCode.SEATUNNEL_ROW_SERIALIZE_FAILED; import static org.apache.seatunnel.common.exception.CommonErrorCode.SQL_TEMPLATE_HANDLED_ERROR; import static org.apache.seatunnel.common.exception.CommonErrorCode.UNSUPPORTED_ARRAY_GENERIC_TYPE; import static org.apache.seatunnel.common.exception.CommonErrorCode.UNSUPPORTED_DATA_TYPE; import static org.apache.seatunnel.common.exception.CommonErrorCode.UNSUPPORTED_ENCODING; import static org.apache.seatunnel.common.exception.CommonErrorCode.UNSUPPORTED_ROW_KIND; import static org.apache.seatunnel.common.exception.CommonErrorCode.VERSION_NOT_SUPPORTED; import static org.apache.seatunnel.common.exception.CommonErrorCode.WRITE_SEATUNNEL_ROW_ERROR; import static org.apache.seatunnel.common.exception.CommonErrorCode.WRITE_SEATUNNEL_ROW_ERROR_WITH_FIELDS_NOT_MATCH; import static org.apache.seatunnel.common.exception.CommonErrorCode.WRITE_SEATUNNEL_ROW_ERROR_WITH_SCHEMA_INCOMPATIBLE_SCHEMA; /** * The common error of SeaTunnel. This is an alternative to {@link CommonErrorCodeDeprecated} and is * used to define non-bug errors or expected errors for all connectors and engines. We need to * define a corresponding enumeration type in {@link CommonErrorCode} to determine the output error * message format and content. Then define the corresponding method in {@link CommonError} to * construct the corresponding error instance. 
*/ public class CommonError { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static SeaTunnelRuntimeException fileOperationFailed( String identifier, String operation, String fileName, Throwable cause) { Map params = new HashMap<>(); params.put("identifier", identifier); params.put("operation", operation); params.put("fileName", fileName); return new SeaTunnelRuntimeException(FILE_OPERATION_FAILED, params, cause); } public static SeaTunnelRuntimeException fileOperationFailed( String identifier, String operation, String fileName) { Map params = new HashMap<>(); params.put("identifier", identifier); params.put("operation", operation); params.put("fileName", fileName); return new SeaTunnelRuntimeException(FILE_OPERATION_FAILED, params); } public static SeaTunnelRuntimeException fileNotExistFailed( String identifier, String operation, String fileName) { Map params = new HashMap<>(); params.put("identifier", identifier); params.put("operation", operation); params.put("fileName", fileName); return new SeaTunnelRuntimeException(FILE_NOT_EXISTED, params); } public static SeaTunnelRuntimeException writeSeaTunnelRowFailed( String connector, String row, Throwable cause) { Map params = new HashMap<>(); params.put("connector", connector); params.put("seaTunnelRow", row); return new SeaTunnelRuntimeException(WRITE_SEATUNNEL_ROW_ERROR, params, cause); } public static SeaTunnelRuntimeException unsupportedDataType( String identifier, String dataType, String field) { Map params = new HashMap<>(); params.put("identifier", identifier); params.put("dataType", dataType); params.put("field", field); return new SeaTunnelRuntimeException(UNSUPPORTED_DATA_TYPE, params); } public static SeaTunnelRuntimeException unsupportedVersion(String identifier, String version) { Map params = new HashMap<>(); params.put("identifier", identifier); params.put("version", version); return new SeaTunnelRuntimeException(VERSION_NOT_SUPPORTED, params); } public static 
SeaTunnelRuntimeException unsupportedEncoding(String encoding) { Map params = new SingletonMap<>("encoding", encoding); return new SeaTunnelRuntimeException(UNSUPPORTED_ENCODING, params); } public static SeaTunnelRuntimeException convertToSeaTunnelTypeError( String connector, PluginType pluginType, String dataType, String field) { Map params = new HashMap<>(); params.put("connector", connector); params.put("type", pluginType.getType()); params.put("dataType", dataType); params.put("field", field); return new SeaTunnelRuntimeException(CONVERT_TO_SEATUNNEL_TYPE_ERROR, params); } public static SeaTunnelRuntimeException convertToSeaTunnelTypeError( String identifier, String dataType, String field) { Map params = new HashMap<>(); params.put("identifier", identifier); params.put("dataType", dataType); params.put("field", field); return new SeaTunnelRuntimeException(CONVERT_TO_SEATUNNEL_TYPE_ERROR_SIMPLE, params); } public static SeaTunnelRuntimeException convertToConnectorTypeError( String connector, PluginType pluginType, String dataType, String field) { Map params = new HashMap<>(); params.put("connector", connector); params.put("type", pluginType.getType()); params.put("dataType", dataType); params.put("field", field); return new SeaTunnelRuntimeException(CONVERT_TO_CONNECTOR_TYPE_ERROR, params); } public static SeaTunnelRuntimeException convertToConnectorPropsBlankError( String connector, String props) { Map params = new HashMap<>(); params.put("connector", connector); params.put("props", props); return new SeaTunnelRuntimeException(CONVERT_TO_SEATUNNEL_PROPS_BLANK_ERROR, params); } public static SeaTunnelRuntimeException convertToConnectorTypeError( String identifier, String dataType, String field) { Map params = new HashMap<>(); params.put("identifier", identifier); params.put("dataType", dataType); params.put("field", field); return new SeaTunnelRuntimeException(CONVERT_TO_CONNECTOR_TYPE_ERROR_SIMPLE, params); } public static SeaTunnelRuntimeException 
getCatalogTableWithUnsupportedType( String catalogName, String tableName, Map fieldWithDataTypes) { Map params = new HashMap<>(); params.put("catalogName", catalogName); params.put("tableName", tableName); try { params.put("fieldWithDataTypes", OBJECT_MAPPER.writeValueAsString(fieldWithDataTypes)); } catch (JsonProcessingException e) { throw new RuntimeException(e); } return new SeaTunnelRuntimeException(GET_CATALOG_TABLE_WITH_UNSUPPORTED_TYPE_ERROR, params); } public static SeaTunnelRuntimeException getCatalogTablesWithUnsupportedType( String catalogName, Map> tableUnsupportedTypes) { Map params = new HashMap<>(); params.put("catalogName", catalogName); try { params.put( "tableUnsupportedTypes", OBJECT_MAPPER.writeValueAsString(tableUnsupportedTypes)); } catch (JsonProcessingException e) { throw new RuntimeException(e); } return new SeaTunnelRuntimeException( GET_CATALOG_TABLES_WITH_UNSUPPORTED_TYPE_ERROR, params); } public static SeaTunnelRuntimeException jsonOperationError(String identifier, String payload) { return jsonOperationError(identifier, payload, null); } public static SeaTunnelRuntimeException jsonOperationError( String identifier, String payload, Throwable cause) { Map params = new HashMap<>(); params.put("identifier", identifier); params.put("payload", payload); SeaTunnelErrorCode code = JSON_OPERATION_FAILED; if (cause != null) { return new SeaTunnelRuntimeException(code, params, cause); } else { return new SeaTunnelRuntimeException(code, params); } } public static SeaTunnelRuntimeException unsupportedOperation( String identifier, String operation) { Map params = new HashMap<>(); params.put("identifier", identifier); params.put("operation", operation); return new SeaTunnelRuntimeException(OPERATION_NOT_SUPPORTED, params); } public static SeaTunnelRuntimeException sqlTemplateHandledError( String tableName, String keyName, String template, String placeholder, String optionName) { Map params = new HashMap<>(); params.put("tableName", tableName); 
params.put("keyName", keyName); params.put("template", template); params.put("placeholder", placeholder); params.put("optionName", optionName); return new SeaTunnelRuntimeException(SQL_TEMPLATE_HANDLED_ERROR, params); } public static SeaTunnelRuntimeException unsupportedArrayGenericType( String identifier, String dataType, String fieldName) { Map params = new HashMap<>(); params.put("identifier", identifier); params.put("dataType", dataType); params.put("fieldName", fieldName); return new SeaTunnelRuntimeException(UNSUPPORTED_ARRAY_GENERIC_TYPE, params); } public static SeaTunnelRuntimeException unsupportedRowKind( String identifier, String tableId, String rowKind) { Map params = new HashMap<>(); params.put("identifier", identifier); params.put("tableId", tableId); params.put("rowKind", rowKind); return new SeaTunnelRuntimeException(UNSUPPORTED_ROW_KIND, params); } public static SeaTunnelRuntimeException writeRowErrorWithSchemaIncompatibleSchema( String connector, String sourceFieldSqlSchema, String expectedFieldSqlSchema, String sinkFieldSqlSchema) { Map params = new HashMap<>(); params.put("connector", connector); params.put("sourceFieldSqlSchema", sourceFieldSqlSchema); params.put("expectedFieldSqlSchema", expectedFieldSqlSchema); params.put("sinkFieldSqlSchema", sinkFieldSqlSchema); return new SeaTunnelRuntimeException( WRITE_SEATUNNEL_ROW_ERROR_WITH_SCHEMA_INCOMPATIBLE_SCHEMA, params); } public static SeaTunnelRuntimeException writeRowErrorWithFieldsCountNotMatch( String connector, int sourceFieldsNum, int sinkFieldsNum) { Map params = new HashMap<>(); params.put("connector", connector); params.put("sourceFieldsNum", String.valueOf(sourceFieldsNum)); params.put("sinkFieldsNum", String.valueOf(sinkFieldsNum)); return new SeaTunnelRuntimeException( WRITE_SEATUNNEL_ROW_ERROR_WITH_FIELDS_NOT_MATCH, params); } public static SeaTunnelRuntimeException formatDateTimeError(String datetime, String field) { Map params = new HashMap<>(); params.put("datetime", datetime); 
params.put("field", field); return new SeaTunnelRuntimeException(CommonErrorCode.FORMAT_DATETIME_ERROR, params); } public static SeaTunnelRuntimeException formatDateError(String date, String field) { Map params = new HashMap<>(); params.put("date", date); params.put("field", field); return new SeaTunnelRuntimeException(CommonErrorCode.FORMAT_DATE_ERROR, params); } public static SeaTunnelRuntimeException unsupportedMethod( String identifier, String methodName) { Map params = new HashMap<>(); params.put("identifier", identifier); params.put("methodName", methodName); return new SeaTunnelRuntimeException(CommonErrorCode.UNSUPPORTED_METHOD, params); } public static SeaTunnelRuntimeException illegalArgument(String argument, String operation) { Map params = new HashMap<>(); params.put("argument", argument); params.put("operation", operation); return new SeaTunnelRuntimeException(ILLEGAL_ARGUMENT, params); } public static SeaTunnelRuntimeException closeFailed(String identifier, Throwable cause) { Map params = new HashMap<>(); params.put("identifier", identifier); return new SeaTunnelRuntimeException(CLOSE_FAILED, params, cause); } public static SeaTunnelRuntimeException seatunnelRowSerializeFailed( String row, Throwable cause) { Map params = new HashMap<>(); params.put("row", row); return new SeaTunnelRuntimeException(SEATUNNEL_ROW_SERIALIZE_FAILED, params, cause); } } ================================================ FILE: seatunnel-common/src/main/java/org/apache/seatunnel/common/exception/CommonErrorCode.java ================================================ /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.seatunnel.common.exception; /** SeaTunnel connector error code interface */ public enum CommonErrorCode implements SeaTunnelErrorCode { FILE_OPERATION_FAILED("COMMON-01", " file '' failed."), JSON_OPERATION_FAILED( "COMMON-02", " JSON convert/parse '' operation failed."), ILLEGAL_ARGUMENT("COMMON-06", "illegal argument '' of ''"), UNSUPPORTED_DATA_TYPE( "COMMON-07", "'' unsupported data type '' of ''"), UNSUPPORTED_ENCODING("COMMON-08", "unsupported encoding ''"), CONVERT_TO_SEATUNNEL_TYPE_ERROR( "COMMON-16", "'' unsupported convert type '' of '' to SeaTunnel data type."), CONVERT_TO_SEATUNNEL_TYPE_ERROR_SIMPLE( "COMMON-17", "'' unsupported convert type '' of '' to SeaTunnel data type."), CONVERT_TO_CONNECTOR_TYPE_ERROR( "COMMON-18", "'' unsupported convert SeaTunnel data type '' of '' to connector data type."), CONVERT_TO_CONNECTOR_TYPE_ERROR_SIMPLE( "COMMON-19", "'' unsupported convert SeaTunnel data type '' of '' to connector data type."), GET_CATALOG_TABLE_WITH_UNSUPPORTED_TYPE_ERROR( "COMMON-20", "'' table '' unsupported get catalog table with field data types ''"), GET_CATALOG_TABLES_WITH_UNSUPPORTED_TYPE_ERROR( "COMMON-21", "'' tables unsupported get catalog table,the corresponding field types in the following tables are not supported: ''"), FILE_NOT_EXISTED( "COMMON-22", " file '' failed, because it not existed."), WRITE_SEATUNNEL_ROW_ERROR( "COMMON-23", " write SeaTunnelRow failed, the SeaTunnelRow value is ''."), SQL_TEMPLATE_HANDLED_ERROR( "COMMON-24", "The table of has no , but the template \n